atlasbot: improve worker readiness and metrics replies
This commit is contained in:
parent
be7846572f
commit
7bb1bd96fc
@ -16,7 +16,7 @@ spec:
|
|||||||
labels:
|
labels:
|
||||||
app: atlasbot
|
app: atlasbot
|
||||||
annotations:
|
annotations:
|
||||||
checksum/atlasbot-configmap: manual-atlasbot-13
|
checksum/atlasbot-configmap: manual-atlasbot-14
|
||||||
vault.hashicorp.com/agent-inject: "true"
|
vault.hashicorp.com/agent-inject: "true"
|
||||||
vault.hashicorp.com/role: "comms"
|
vault.hashicorp.com/role: "comms"
|
||||||
vault.hashicorp.com/agent-inject-secret-turn-secret: "kv/data/atlas/comms/turn-shared-secret"
|
vault.hashicorp.com/agent-inject-secret-turn-secret: "kv/data/atlas/comms/turn-shared-secret"
|
||||||
|
|||||||
@ -441,7 +441,7 @@ def metrics_query_context(prompt: str, *, allow_tools: bool) -> tuple[str, str]:
|
|||||||
return "", f"{panel}: matched dashboard panel but VictoriaMetrics did not return data."
|
return "", f"{panel}: matched dashboard panel but VictoriaMetrics did not return data."
|
||||||
summary = "\n".join(rendered_parts)
|
summary = "\n".join(rendered_parts)
|
||||||
context = f"Metrics (from {dashboard} / {panel}):\n{summary}"
|
context = f"Metrics (from {dashboard} / {panel}):\n{summary}"
|
||||||
fallback = f"{panel}: {summary}"
|
fallback = _metrics_fallback_summary(panel, summary)
|
||||||
return context, fallback
|
return context, fallback
|
||||||
|
|
||||||
def jetson_nodes_from_kb() -> list[str]:
|
def jetson_nodes_from_kb() -> list[str]:
|
||||||
@ -654,6 +654,115 @@ def vm_render_result(res: dict | None, limit: int = 12) -> str:
|
|||||||
out.append(f"- {labels}: {val}")
|
out.append(f"- {labels}: {val}")
|
||||||
return "\n".join(out)
|
return "\n".join(out)
|
||||||
|
|
||||||
|
def _parse_metric_lines(summary: str) -> dict[str, str]:
|
||||||
|
parsed: dict[str, str] = {}
|
||||||
|
for line in (summary or "").splitlines():
|
||||||
|
line = line.strip()
|
||||||
|
if not line.startswith("-"):
|
||||||
|
continue
|
||||||
|
try:
|
||||||
|
label, value = line.lstrip("-").split(":", 1)
|
||||||
|
except ValueError:
|
||||||
|
continue
|
||||||
|
parsed[label.strip()] = value.strip()
|
||||||
|
return parsed
|
||||||
|
|
||||||
|
def _metrics_fallback_summary(panel: str, summary: str) -> str:
    """Build a short, human-readable one-line answer for a metrics panel.

    The rendered *summary* is parsed with ``_parse_metric_lines`` and then
    special-cased by panel title (case-insensitive prefix match):

    * ``postgres connections`` - when both ``conn=used`` and ``conn=max`` are
      present and numeric, reports ``used/max`` plus the free count.
    * ``postgres hottest`` - reports the first parsed ``label = value`` pair
      (presumably the panel query already orders entries - confirm).

    Anything else, or a special case whose data is missing, falls back to
    ``"<panel>: <summary>"``.

    Args:
        panel: Dashboard panel title; ``None``/empty is tolerated.
        summary: Rendered metric lines for that panel.

    Returns:
        The reply text.
    """
    parsed = _parse_metric_lines(summary)
    panel_l = (panel or "").lower()

    if panel_l.startswith("postgres connections"):
        used = parsed.get("conn=used")
        maxv = parsed.get("conn=max")
        if used and maxv:
            try:
                used_i = int(float(used))
                max_i = int(float(maxv))
            except (ValueError, OverflowError):
                # ValueError: non-numeric text or NaN.
                # OverflowError: int() of an infinite float such as "+Inf".
                return f"Postgres connections: {summary}"
            free = max_i - used_i
            return f"Postgres connections: {used_i}/{max_i} used ({free} free)."

    if panel_l.startswith("postgres hottest") and parsed:
        # dicts preserve insertion order, so this is the first bullet line.
        label, value = next(iter(parsed.items()))
        return f"Most Postgres connections: {label} = {value}."

    return f"{panel}: {summary}"
|
||||||
|
|
||||||
|
def _node_ready_status(node: dict) -> bool | None:
|
||||||
|
conditions = node.get("status", {}).get("conditions") or []
|
||||||
|
for cond in conditions if isinstance(conditions, list) else []:
|
||||||
|
if cond.get("type") == "Ready":
|
||||||
|
if cond.get("status") == "True":
|
||||||
|
return True
|
||||||
|
if cond.get("status") == "False":
|
||||||
|
return False
|
||||||
|
return None
|
||||||
|
return None
|
||||||
|
|
||||||
|
def _node_is_worker(node: dict) -> bool:
|
||||||
|
labels = (node.get("metadata") or {}).get("labels") or {}
|
||||||
|
if labels.get("node-role.kubernetes.io/control-plane") is not None:
|
||||||
|
return False
|
||||||
|
if labels.get("node-role.kubernetes.io/master") is not None:
|
||||||
|
return False
|
||||||
|
if labels.get("node-role.kubernetes.io/worker") is not None:
|
||||||
|
return True
|
||||||
|
return True
|
||||||
|
|
||||||
|
def worker_nodes_status() -> tuple[list[str], list[str]]:
    """List worker node names split by readiness.

    Fetches ``/api/v1/nodes?limit=500`` through ``k8s_get``, keeps the nodes
    for which ``_node_is_worker`` is true and that have a name, and sorts them
    into two groups using ``_node_ready_status``.

    A node counts as Ready only when its Ready condition is explicitly true.
    Nodes whose condition is ``False``, unknown, or missing are all reported
    as not Ready, so an unreachable node is never silently left out of the
    totals.

    Returns:
        ``(ready_names, not_ready_names)``, each sorted alphabetically. Both
        lists are empty when the API call fails or the response is not in the
        expected shape.
    """
    try:
        data = k8s_get("/api/v1/nodes?limit=500")
    except Exception:
        # Best-effort lookup: two empty lists signal "no answer available".
        return ([], [])

    # Guard the response shape here too, so a null/non-dict payload degrades
    # to the same empty result instead of raising in the caller.
    items = data.get("items") if isinstance(data, dict) else None
    if not isinstance(items, list):
        return ([], [])

    ready_nodes: list[str] = []
    not_ready_nodes: list[str] = []
    for node in items:
        if not isinstance(node, dict) or not _node_is_worker(node):
            continue
        name = (node.get("metadata") or {}).get("name") or ""
        if not name:
            continue
        if _node_ready_status(node) is True:
            ready_nodes.append(name)
        else:
            # False and None (unknown / no Ready condition) both land here.
            not_ready_nodes.append(name)
    return (sorted(ready_nodes), sorted(not_ready_nodes))
|
||||||
|
|
||||||
|
def expected_nodes_from_kb() -> set[str]:
    """Return the node names the knowledge base expects to exist.

    Flattens every value collection in ``_NODE_CLASS_INDEX`` into one set,
    leaving out falsy names and anything listed in ``_NODE_CLASS_EXTERNAL``.
    An empty or unset index yields an empty set.
    """
    if not _NODE_CLASS_INDEX:
        return set()
    return {
        name
        for members in _NODE_CLASS_INDEX.values()
        for name in members
        if name and name not in _NODE_CLASS_EXTERNAL
    }
|
||||||
|
|
||||||
|
def missing_nodes_answer(cluster_name: str) -> str:
    """Describe which expected nodes are absent from the live node list.

    Compares ``expected_nodes_from_kb()`` with the node names returned by
    ``k8s_get("/api/v1/nodes?limit=500")``.

    Args:
        cluster_name: Name used as the prefix of the reply text.

    Returns:
        A sentence listing the missing nodes, a sentence stating that none are
        missing, or ``""`` when there is no expected inventory or the node
        lookup fails for any reason.
    """
    expected = expected_nodes_from_kb()
    if not expected:
        return ""

    try:
        payload = k8s_get("/api/v1/nodes?limit=500")
        nodes = payload.get("items") or []
        if not isinstance(nodes, list):
            nodes = []
        names = ((entry.get("metadata") or {}).get("name") or "" for entry in nodes)
        present = {node_name for node_name in names if node_name}
    except Exception:
        # Any failure while fetching or reading the list means "no answer".
        return ""

    absent = sorted(expected - present)
    if absent:
        return f"{cluster_name} missing nodes versus KB inventory: {', '.join(absent)}."
    return f"{cluster_name}: no missing nodes versus KB inventory."
|
||||||
|
|
||||||
|
def _should_short_circuit(prompt: str, fallback: str) -> bool:
|
||||||
|
if not fallback:
|
||||||
|
return False
|
||||||
|
lower = (prompt or "").lower()
|
||||||
|
for word in ("why", "explain", "architecture", "breakdown", "root cause", "plan"):
|
||||||
|
if word in lower:
|
||||||
|
return False
|
||||||
|
return True
|
||||||
|
|
||||||
def vm_top_restarts(hours: int = 1) -> str:
|
def vm_top_restarts(hours: int = 1) -> str:
|
||||||
q = f"topk(5, sum by (namespace,pod) (increase(kube_pod_container_status_restarts_total[{hours}h])))"
|
q = f"topk(5, sum by (namespace,pod) (increase(kube_pod_container_status_restarts_total[{hours}h])))"
|
||||||
res = vm_query(q)
|
res = vm_query(q)
|
||||||
@ -984,6 +1093,32 @@ def sync_loop(token: str, room_id: str):
|
|||||||
continue
|
continue
|
||||||
send_msg(token, rid, summary)
|
send_msg(token, rid, summary)
|
||||||
continue
|
continue
|
||||||
|
if "worker" in lower_body and "node" in lower_body:
|
||||||
|
ready_nodes, not_ready_nodes = worker_nodes_status()
|
||||||
|
total = len(ready_nodes) + len(not_ready_nodes)
|
||||||
|
if total:
|
||||||
|
if any(word in lower_body for word in ("ready", "not ready", "unready")):
|
||||||
|
if not_ready_nodes:
|
||||||
|
send_msg(
|
||||||
|
token,
|
||||||
|
rid,
|
||||||
|
f"Worker nodes not Ready: {', '.join(not_ready_nodes)}.",
|
||||||
|
)
|
||||||
|
else:
|
||||||
|
send_msg(token, rid, f"All {len(ready_nodes)} worker nodes are Ready.")
|
||||||
|
continue
|
||||||
|
if any(word in lower_body for word in ("how many", "should")):
|
||||||
|
send_msg(
|
||||||
|
token,
|
||||||
|
rid,
|
||||||
|
f"Atlas has {total} worker nodes; {len(ready_nodes)} Ready, {len(not_ready_nodes)} NotReady.",
|
||||||
|
)
|
||||||
|
continue
|
||||||
|
if "missing" in lower_body and "node" in lower_body:
|
||||||
|
missing = missing_nodes_answer("Atlas")
|
||||||
|
if missing:
|
||||||
|
send_msg(token, rid, missing)
|
||||||
|
continue
|
||||||
inventory_answer = node_inventory_answer("Atlas", lower_body)
|
inventory_answer = node_inventory_answer("Atlas", lower_body)
|
||||||
if inventory_answer:
|
if inventory_answer:
|
||||||
send_msg(token, rid, inventory_answer)
|
send_msg(token, rid, inventory_answer)
|
||||||
@ -1046,6 +1181,9 @@ def sync_loop(token: str, room_id: str):
|
|||||||
fallback = node_inventory_answer("Atlas", lower_body)
|
fallback = node_inventory_answer("Atlas", lower_body)
|
||||||
if metrics_fallback and not fallback:
|
if metrics_fallback and not fallback:
|
||||||
fallback = metrics_fallback
|
fallback = metrics_fallback
|
||||||
|
if _should_short_circuit(body, fallback):
|
||||||
|
send_msg(token, rid, fallback)
|
||||||
|
continue
|
||||||
reply = ollama_reply_with_thinking(
|
reply = ollama_reply_with_thinking(
|
||||||
token,
|
token,
|
||||||
rid,
|
rid,
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user