atlasbot: answer cluster queries without llm

This commit is contained in:
Brad Stein 2026-01-27 15:30:43 -03:00
parent 241a8889ee
commit b7792d30f1
2 changed files with 263 additions and 23 deletions

View File

@ -16,7 +16,7 @@ spec:
labels: labels:
app: atlasbot app: atlasbot
annotations: annotations:
checksum/atlasbot-configmap: manual-atlasbot-47 checksum/atlasbot-configmap: manual-atlasbot-48
vault.hashicorp.com/agent-inject: "true" vault.hashicorp.com/agent-inject: "true"
vault.hashicorp.com/role: "comms" vault.hashicorp.com/role: "comms"
vault.hashicorp.com/agent-inject-secret-turn-secret: "kv/data/atlas/comms/turn-shared-secret" vault.hashicorp.com/agent-inject-secret-turn-secret: "kv/data/atlas/comms/turn-shared-secret"

View File

@ -532,7 +532,7 @@ def _detect_role_filters(q: str) -> set[str]:
return roles return roles
def _detect_entity(q: str) -> str | None: def _detect_entity(q: str) -> str | None:
if "node" in q or "nodes" in q or "worker" in q or TITAN_NODE_RE.search(q): if "node" in q or "nodes" in q or "worker" in q or "hardware" in q or "architecture" in q or TITAN_NODE_RE.search(q):
return "node" return "node"
if "pod" in q or "pods" in q: if "pod" in q or "pods" in q:
return "pod" return "pod"
@ -1152,6 +1152,15 @@ def snapshot_metric_answer(
if include_hw: if include_hw:
scope = f" among {' and '.join(sorted(include_hw))}" scope = f" among {' and '.join(sorted(include_hw))}"
answer = f"Hottest node{scope}: {node} ({value})." answer = f"Hottest node{scope}: {node} ({value})."
if allowed_nodes and len(allowed_nodes) != len(inventory):
overall = _node_usage_top(usage, allowed_nodes=None)
if overall and overall[0] != node:
overall_val = _format_metric_value(
str(overall[1]),
percent=percent,
rate=metric in {"net", "io"},
)
answer += f" Overall hottest: {overall[0]} ({overall_val})."
return _format_confidence(answer, "high") return _format_confidence(answer, "high")
if metric == "connections" or "postgres" in q: if metric == "connections" or "postgres" in q:
@ -1358,6 +1367,219 @@ def structured_answer(
return "" return ""
def _nodes_summary_line(inventory: list[dict[str, Any]], snapshot: dict[str, Any] | None) -> str:
summary = snapshot.get("nodes_summary") if isinstance(snapshot, dict) else {}
nodes = snapshot.get("nodes") if isinstance(snapshot, dict) else {}
total = summary.get("total") if isinstance(summary, dict) and summary.get("total") is not None else nodes.get("total")
ready = summary.get("ready") if isinstance(summary, dict) and summary.get("ready") is not None else nodes.get("ready")
not_ready = summary.get("not_ready") if isinstance(summary, dict) and summary.get("not_ready") is not None else nodes.get("not_ready")
if total is None:
total = len(inventory)
ready = len([n for n in inventory if n.get("ready") is True])
not_ready = len([n for n in inventory if n.get("ready") is False])
if total is None:
return ""
return f"Atlas cluster has {total} nodes ({ready} ready, {not_ready} not ready)."
def _hardware_mix_line(inventory: list[dict[str, Any]]) -> str:
if not inventory:
return ""
groups = _group_nodes(inventory)
parts: list[str] = []
for key in ("rpi5", "rpi4", "jetson", "amd64", "arm64-unknown", "unknown"):
nodes = groups.get(key) or []
if nodes:
parts.append(f"{key}={len(nodes)}")
if not parts:
return ""
return "Hardware mix: " + ", ".join(parts) + "."
def _os_mix_line(snapshot: dict[str, Any] | None) -> str:
if not snapshot:
return ""
details = snapshot.get("nodes_detail") if isinstance(snapshot.get("nodes_detail"), list) else []
counts: dict[str, int] = collections.Counter()
for node in details:
if not isinstance(node, dict):
continue
os_name = (node.get("os") or "").strip()
if os_name:
counts[os_name] += 1
if not counts:
return ""
parts = [f"{os_name}={count}" for os_name, count in sorted(counts.items(), key=lambda item: (-item[1], item[0]))]
return "OS mix: " + ", ".join(parts[:5]) + "."
def _pods_summary_line(metrics: dict[str, Any]) -> str:
if not metrics:
return ""
running = metrics.get("pods_running")
pending = metrics.get("pods_pending")
failed = metrics.get("pods_failed")
succeeded = metrics.get("pods_succeeded")
parts: list[str] = []
if running is not None:
parts.append(f"{running:.0f} running")
if pending is not None:
parts.append(f"{pending:.0f} pending")
if failed is not None:
parts.append(f"{failed:.0f} failed")
if succeeded is not None:
parts.append(f"{succeeded:.0f} succeeded")
if not parts:
return ""
return "Pods: " + ", ".join(parts) + "."
def _postgres_summary_line(metrics: dict[str, Any]) -> str:
if not metrics:
return ""
postgres = metrics.get("postgres_connections") if isinstance(metrics.get("postgres_connections"), dict) else {}
if not postgres:
return ""
used = postgres.get("used")
max_conn = postgres.get("max")
hottest = postgres.get("hottest_db") if isinstance(postgres.get("hottest_db"), dict) else {}
parts: list[str] = []
if used is not None and max_conn is not None:
parts.append(f"{used:.0f}/{max_conn:.0f} connections")
if hottest.get("label"):
hot_val = hottest.get("value")
hot_val_str = _format_metric_value(str(hot_val), percent=False) if hot_val is not None else ""
parts.append(f"hottest {hottest.get('label')} ({hot_val_str})")
if not parts:
return ""
return "Postgres: " + ", ".join(parts) + "."
def _hottest_summary_line(metrics: dict[str, Any]) -> str:
if not metrics:
return ""
hottest = metrics.get("hottest_nodes") if isinstance(metrics.get("hottest_nodes"), dict) else {}
if not hottest:
return ""
parts: list[str] = []
for key in ("cpu", "ram", "net", "io"):
entry = hottest.get(key) if isinstance(hottest.get(key), dict) else {}
node = entry.get("node")
value = entry.get("value")
if node and value is not None:
value_fmt = _format_metric_value(
str(value),
percent=key in ("cpu", "ram"),
rate=key in ("net", "io"),
)
parts.append(f"{key.upper()} {node} ({value_fmt})")
if not parts:
return ""
return "Hottest nodes: " + "; ".join(parts) + "."
def cluster_overview_answer(
prompt: str,
*,
inventory: list[dict[str, Any]],
snapshot: dict[str, Any] | None,
) -> str:
if not inventory and not snapshot:
return ""
q = normalize_query(prompt)
metrics = _snapshot_metrics(snapshot)
lines: list[str] = []
nodes_line = _nodes_summary_line(inventory, snapshot)
if nodes_line:
lines.append(nodes_line)
if any(word in q for word in ("hardware", "architecture", "nodes", "node", "cluster", "atlas", "titan", "lab")):
hw_line = _hardware_mix_line(inventory)
if hw_line:
lines.append(hw_line)
os_line = _os_mix_line(snapshot)
if os_line:
lines.append(os_line)
if any(
word in q
for word in (
"interesting",
"status",
"health",
"overview",
"summary",
"tell me",
"what do you know",
"about",
"pods",
"postgres",
"connections",
"hottest",
"cpu",
"ram",
"memory",
"net",
"network",
"io",
"disk",
"busy",
"load",
"usage",
"utilization",
)
):
pods_line = _pods_summary_line(metrics)
if pods_line:
lines.append(pods_line)
hottest_line = _hottest_summary_line(metrics)
if hottest_line:
lines.append(hottest_line)
postgres_line = _postgres_summary_line(metrics)
if postgres_line:
lines.append(postgres_line)
if not lines:
return ""
return "Based on the snapshot, " + "\n".join(lines)
def cluster_answer(
    prompt: str,
    *,
    inventory: list[dict[str, Any]],
    snapshot: dict[str, Any] | None,
    workloads: list[dict[str, Any]] | None,
) -> str:
    """Answer a cluster question without the LLM, trying sources in order.

    Order of preference: a structured answer; the snapshot overview (plus KB
    titles when the query has knowledge intent); KB titles alone; the raw
    metrics summary. Returns ``""`` when none of these produced anything.
    """
    metrics_summary = snapshot_context(prompt, snapshot)
    direct = structured_answer(
        prompt,
        inventory=inventory,
        metrics_summary=metrics_summary,
        snapshot=snapshot,
        workloads=workloads,
    )
    if direct:
        return direct
    overview = cluster_overview_answer(prompt, inventory=inventory, snapshot=snapshot)
    if overview:
        titles = kb_retrieve_titles(prompt, limit=4) if _knowledge_intent(prompt) else ""
        body = overview + "\n" + titles if titles else overview
        return _format_confidence(body, "medium")
    titles = kb_retrieve_titles(prompt, limit=4)
    if titles:
        return _format_confidence(titles, "low")
    if metrics_summary:
        return _format_confidence(metrics_summary, "low")
    return ""
def _metric_tokens(entry: dict[str, Any]) -> str: def _metric_tokens(entry: dict[str, Any]) -> str:
parts: list[str] = [] parts: list[str] = []
for key in ("panel_title", "dashboard", "description"): for key in ("panel_title", "dashboard", "description"):
@ -1868,16 +2090,24 @@ class _AtlasbotHandler(BaseHTTPRequestHandler):
workloads=workloads, workloads=workloads,
) )
fallback = "I don't have enough data to answer that." fallback = "I don't have enough data to answer that."
llm_prompt = cleaned
if cluster_query: if cluster_query:
llm_prompt = f"Atlas cluster question (use the cluster snapshot context): {cleaned}" answer = cluster_answer(
answer = ollama_reply( cleaned,
("http", "internal"), inventory=inventory,
llm_prompt, snapshot=snapshot,
context=context, workloads=workloads,
fallback=fallback, )
use_history=False, if not answer:
) answer = fallback
else:
llm_prompt = cleaned
answer = ollama_reply(
("http", "internal"),
llm_prompt,
context=context,
fallback=fallback,
use_history=False,
)
self._write_json(200, {"answer": answer}) self._write_json(200, {"answer": answer})
@ -2044,6 +2274,7 @@ def _knowledge_intent(prompt: str) -> bool:
for phrase in ( for phrase in (
"what do you know", "what do you know",
"tell me about", "tell me about",
"interesting",
"overview", "overview",
"summary", "summary",
"describe", "describe",
@ -2312,21 +2543,30 @@ def sync_loop(token: str, room_id: str):
res = vm_query(promql, timeout=20) res = vm_query(promql, timeout=20)
rendered = vm_render_result(res, limit=15) or "(no results)" rendered = vm_render_result(res, limit=15) or "(no results)"
extra = "VictoriaMetrics (PromQL result):\n" + rendered extra = "VictoriaMetrics (PromQL result):\n" + rendered
context = (context + "\n\n" + extra).strip() if context else extra send_msg(token, rid, extra)
continue
fallback = "I don't have enough data to answer that." fallback = "I don't have enough data to answer that."
llm_prompt = cleaned_body
if cluster_query: if cluster_query:
llm_prompt = f"Atlas cluster question (use the cluster snapshot context): {cleaned_body}" reply = cluster_answer(
reply = ollama_reply_with_thinking( cleaned_body,
token, inventory=inventory,
rid, snapshot=snapshot,
hist_key, workloads=workloads,
llm_prompt, )
context=context, if not reply:
fallback=fallback, reply = fallback
use_history=cluster_query, else:
) llm_prompt = cleaned_body
reply = ollama_reply_with_thinking(
token,
rid,
hist_key,
llm_prompt,
context=context,
fallback=fallback,
use_history=False,
)
send_msg(token, rid, reply) send_msg(token, rid, reply)
def login_with_retry(): def login_with_retry():