atlasbot: answer cluster queries without llm
This commit is contained in:
parent
241a8889ee
commit
b7792d30f1
@ -16,7 +16,7 @@ spec:
|
|||||||
labels:
|
labels:
|
||||||
app: atlasbot
|
app: atlasbot
|
||||||
annotations:
|
annotations:
|
||||||
checksum/atlasbot-configmap: manual-atlasbot-47
|
checksum/atlasbot-configmap: manual-atlasbot-48
|
||||||
vault.hashicorp.com/agent-inject: "true"
|
vault.hashicorp.com/agent-inject: "true"
|
||||||
vault.hashicorp.com/role: "comms"
|
vault.hashicorp.com/role: "comms"
|
||||||
vault.hashicorp.com/agent-inject-secret-turn-secret: "kv/data/atlas/comms/turn-shared-secret"
|
vault.hashicorp.com/agent-inject-secret-turn-secret: "kv/data/atlas/comms/turn-shared-secret"
|
||||||
|
|||||||
@ -532,7 +532,7 @@ def _detect_role_filters(q: str) -> set[str]:
|
|||||||
return roles
|
return roles
|
||||||
|
|
||||||
def _detect_entity(q: str) -> str | None:
|
def _detect_entity(q: str) -> str | None:
|
||||||
if "node" in q or "nodes" in q or "worker" in q or TITAN_NODE_RE.search(q):
|
if "node" in q or "nodes" in q or "worker" in q or "hardware" in q or "architecture" in q or TITAN_NODE_RE.search(q):
|
||||||
return "node"
|
return "node"
|
||||||
if "pod" in q or "pods" in q:
|
if "pod" in q or "pods" in q:
|
||||||
return "pod"
|
return "pod"
|
||||||
@ -1152,6 +1152,15 @@ def snapshot_metric_answer(
|
|||||||
if include_hw:
|
if include_hw:
|
||||||
scope = f" among {' and '.join(sorted(include_hw))}"
|
scope = f" among {' and '.join(sorted(include_hw))}"
|
||||||
answer = f"Hottest node{scope}: {node} ({value})."
|
answer = f"Hottest node{scope}: {node} ({value})."
|
||||||
|
if allowed_nodes and len(allowed_nodes) != len(inventory):
|
||||||
|
overall = _node_usage_top(usage, allowed_nodes=None)
|
||||||
|
if overall and overall[0] != node:
|
||||||
|
overall_val = _format_metric_value(
|
||||||
|
str(overall[1]),
|
||||||
|
percent=percent,
|
||||||
|
rate=metric in {"net", "io"},
|
||||||
|
)
|
||||||
|
answer += f" Overall hottest: {overall[0]} ({overall_val})."
|
||||||
return _format_confidence(answer, "high")
|
return _format_confidence(answer, "high")
|
||||||
|
|
||||||
if metric == "connections" or "postgres" in q:
|
if metric == "connections" or "postgres" in q:
|
||||||
@ -1358,6 +1367,219 @@ def structured_answer(
|
|||||||
|
|
||||||
return ""
|
return ""
|
||||||
|
|
||||||
|
|
||||||
|
def _nodes_summary_line(inventory: list[dict[str, Any]], snapshot: dict[str, Any] | None) -> str:
|
||||||
|
summary = snapshot.get("nodes_summary") if isinstance(snapshot, dict) else {}
|
||||||
|
nodes = snapshot.get("nodes") if isinstance(snapshot, dict) else {}
|
||||||
|
total = summary.get("total") if isinstance(summary, dict) and summary.get("total") is not None else nodes.get("total")
|
||||||
|
ready = summary.get("ready") if isinstance(summary, dict) and summary.get("ready") is not None else nodes.get("ready")
|
||||||
|
not_ready = summary.get("not_ready") if isinstance(summary, dict) and summary.get("not_ready") is not None else nodes.get("not_ready")
|
||||||
|
if total is None:
|
||||||
|
total = len(inventory)
|
||||||
|
ready = len([n for n in inventory if n.get("ready") is True])
|
||||||
|
not_ready = len([n for n in inventory if n.get("ready") is False])
|
||||||
|
if total is None:
|
||||||
|
return ""
|
||||||
|
return f"Atlas cluster has {total} nodes ({ready} ready, {not_ready} not ready)."
|
||||||
|
|
||||||
|
|
||||||
|
def _hardware_mix_line(inventory: list[dict[str, Any]]) -> str:
    """Render the hardware-class breakdown of the node inventory.

    Example: ``Hardware mix: rpi5=3, amd64=2.`` — classes with no nodes
    are omitted; returns "" when nothing can be reported.
    """
    if not inventory:
        return ""
    buckets = _group_nodes(inventory)
    rendered = [
        f"{hw}={len(buckets[hw])}"
        for hw in ("rpi5", "rpi4", "jetson", "amd64", "arm64-unknown", "unknown")
        if buckets.get(hw)
    ]
    if not rendered:
        return ""
    return "Hardware mix: " + ", ".join(rendered) + "."
|
||||||
|
|
||||||
|
|
||||||
|
def _os_mix_line(snapshot: dict[str, Any] | None) -> str:
|
||||||
|
if not snapshot:
|
||||||
|
return ""
|
||||||
|
details = snapshot.get("nodes_detail") if isinstance(snapshot.get("nodes_detail"), list) else []
|
||||||
|
counts: dict[str, int] = collections.Counter()
|
||||||
|
for node in details:
|
||||||
|
if not isinstance(node, dict):
|
||||||
|
continue
|
||||||
|
os_name = (node.get("os") or "").strip()
|
||||||
|
if os_name:
|
||||||
|
counts[os_name] += 1
|
||||||
|
if not counts:
|
||||||
|
return ""
|
||||||
|
parts = [f"{os_name}={count}" for os_name, count in sorted(counts.items(), key=lambda item: (-item[1], item[0]))]
|
||||||
|
return "OS mix: " + ", ".join(parts[:5]) + "."
|
||||||
|
|
||||||
|
|
||||||
|
def _pods_summary_line(metrics: dict[str, Any]) -> str:
|
||||||
|
if not metrics:
|
||||||
|
return ""
|
||||||
|
running = metrics.get("pods_running")
|
||||||
|
pending = metrics.get("pods_pending")
|
||||||
|
failed = metrics.get("pods_failed")
|
||||||
|
succeeded = metrics.get("pods_succeeded")
|
||||||
|
parts: list[str] = []
|
||||||
|
if running is not None:
|
||||||
|
parts.append(f"{running:.0f} running")
|
||||||
|
if pending is not None:
|
||||||
|
parts.append(f"{pending:.0f} pending")
|
||||||
|
if failed is not None:
|
||||||
|
parts.append(f"{failed:.0f} failed")
|
||||||
|
if succeeded is not None:
|
||||||
|
parts.append(f"{succeeded:.0f} succeeded")
|
||||||
|
if not parts:
|
||||||
|
return ""
|
||||||
|
return "Pods: " + ", ".join(parts) + "."
|
||||||
|
|
||||||
|
|
||||||
|
def _postgres_summary_line(metrics: dict[str, Any]) -> str:
    """One-line Postgres connection summary from the snapshot metrics.

    Reports used/max connections and the hottest database when present;
    returns "" when nothing can be reported.
    """
    if not metrics:
        return ""
    pg_raw = metrics.get("postgres_connections")
    pg = pg_raw if isinstance(pg_raw, dict) else {}
    if not pg:
        return ""
    bits: list[str] = []
    used = pg.get("used")
    limit = pg.get("max")
    if used is not None and limit is not None:
        bits.append(f"{used:.0f}/{limit:.0f} connections")
    hot_raw = pg.get("hottest_db")
    hot = hot_raw if isinstance(hot_raw, dict) else {}
    if hot.get("label"):
        raw_value = hot.get("value")
        shown = _format_metric_value(str(raw_value), percent=False) if raw_value is not None else ""
        bits.append(f"hottest {hot.get('label')} ({shown})")
    if not bits:
        return ""
    return "Postgres: " + ", ".join(bits) + "."
|
||||||
|
|
||||||
|
|
||||||
|
def _hottest_summary_line(metrics: dict[str, Any]) -> str:
    """Render the per-metric hottest-node entries as one line.

    Walks cpu/ram/net/io in that order; cpu and ram format as percent,
    net and io as rates. Returns "" when no entry is usable.
    """
    if not metrics:
        return ""
    raw = metrics.get("hottest_nodes")
    hottest = raw if isinstance(raw, dict) else {}
    if not hottest:
        return ""
    rendered: list[str] = []
    for metric in ("cpu", "ram", "net", "io"):
        entry_raw = hottest.get(metric)
        entry = entry_raw if isinstance(entry_raw, dict) else {}
        node_name = entry.get("node")
        raw_value = entry.get("value")
        if not node_name or raw_value is None:
            continue
        shown = _format_metric_value(
            str(raw_value),
            percent=metric in ("cpu", "ram"),
            rate=metric in ("net", "io"),
        )
        rendered.append(f"{metric.upper()} {node_name} ({shown})")
    if not rendered:
        return ""
    return "Hottest nodes: " + "; ".join(rendered) + "."
|
||||||
|
|
||||||
|
|
||||||
|
def cluster_overview_answer(
    prompt: str,
    *,
    inventory: list[dict[str, Any]],
    snapshot: dict[str, Any] | None,
) -> str:
    """Compose a multi-line cluster overview for a free-form question.

    Always includes the node-count line when available; adds hardware/OS
    mix when the query mentions the cluster or its hardware, and
    pods/hottest/postgres lines when it mentions status or metrics.
    Returns "" when there is nothing to say.
    """
    if not inventory and not snapshot:
        return ""
    query = normalize_query(prompt)
    metrics = _snapshot_metrics(snapshot)

    hardware_words = ("hardware", "architecture", "nodes", "node", "cluster", "atlas", "titan", "lab")
    status_words = (
        "interesting",
        "status",
        "health",
        "overview",
        "summary",
        "tell me",
        "what do you know",
        "about",
        "pods",
        "postgres",
        "connections",
        "hottest",
        "cpu",
        "ram",
        "memory",
        "net",
        "network",
        "io",
        "disk",
        "busy",
        "load",
        "usage",
        "utilization",
    )

    # Candidate lines in display order; empties are filtered at the end.
    pieces: list[str] = [_nodes_summary_line(inventory, snapshot)]
    if any(word in query for word in hardware_words):
        pieces.append(_hardware_mix_line(inventory))
        pieces.append(_os_mix_line(snapshot))
    if any(word in query for word in status_words):
        pieces.append(_pods_summary_line(metrics))
        pieces.append(_hottest_summary_line(metrics))
        pieces.append(_postgres_summary_line(metrics))

    kept = [piece for piece in pieces if piece]
    if not kept:
        return ""
    return "Based on the snapshot, " + "\n".join(kept)
|
||||||
|
|
||||||
|
|
||||||
|
def cluster_answer(
    prompt: str,
    *,
    inventory: list[dict[str, Any]],
    snapshot: dict[str, Any] | None,
    workloads: list[dict[str, Any]] | None,
) -> str:
    """Answer a cluster question without the LLM.

    Tries, in order: an exact structured answer, a generic overview
    (optionally augmented with knowledge-base titles), KB titles alone,
    then the raw metrics summary. Returns "" when nothing applies.
    """
    context_summary = snapshot_context(prompt, snapshot)

    # Exact structured answers (specific metric/entity queries) win outright.
    direct = structured_answer(
        prompt,
        inventory=inventory,
        metrics_summary=context_summary,
        snapshot=snapshot,
        workloads=workloads,
    )
    if direct:
        return direct

    # Generic overview; append KB titles only for knowledge-style questions.
    summary = cluster_overview_answer(prompt, inventory=inventory, snapshot=snapshot)
    if summary:
        titles = kb_retrieve_titles(prompt, limit=4) if _knowledge_intent(prompt) else ""
        combined = summary + "\n" + titles if titles else summary
        return _format_confidence(combined, "medium")

    # Low-confidence fallbacks: KB titles, then the bare metrics summary.
    titles = kb_retrieve_titles(prompt, limit=4)
    if titles:
        return _format_confidence(titles, "low")
    if context_summary:
        return _format_confidence(context_summary, "low")
    return ""
|
||||||
|
|
||||||
def _metric_tokens(entry: dict[str, Any]) -> str:
|
def _metric_tokens(entry: dict[str, Any]) -> str:
|
||||||
parts: list[str] = []
|
parts: list[str] = []
|
||||||
for key in ("panel_title", "dashboard", "description"):
|
for key in ("panel_title", "dashboard", "description"):
|
||||||
@ -1868,16 +2090,24 @@ class _AtlasbotHandler(BaseHTTPRequestHandler):
|
|||||||
workloads=workloads,
|
workloads=workloads,
|
||||||
)
|
)
|
||||||
fallback = "I don't have enough data to answer that."
|
fallback = "I don't have enough data to answer that."
|
||||||
llm_prompt = cleaned
|
|
||||||
if cluster_query:
|
if cluster_query:
|
||||||
llm_prompt = f"Atlas cluster question (use the cluster snapshot context): {cleaned}"
|
answer = cluster_answer(
|
||||||
answer = ollama_reply(
|
cleaned,
|
||||||
("http", "internal"),
|
inventory=inventory,
|
||||||
llm_prompt,
|
snapshot=snapshot,
|
||||||
context=context,
|
workloads=workloads,
|
||||||
fallback=fallback,
|
)
|
||||||
use_history=False,
|
if not answer:
|
||||||
)
|
answer = fallback
|
||||||
|
else:
|
||||||
|
llm_prompt = cleaned
|
||||||
|
answer = ollama_reply(
|
||||||
|
("http", "internal"),
|
||||||
|
llm_prompt,
|
||||||
|
context=context,
|
||||||
|
fallback=fallback,
|
||||||
|
use_history=False,
|
||||||
|
)
|
||||||
self._write_json(200, {"answer": answer})
|
self._write_json(200, {"answer": answer})
|
||||||
|
|
||||||
|
|
||||||
@ -2044,6 +2274,7 @@ def _knowledge_intent(prompt: str) -> bool:
|
|||||||
for phrase in (
|
for phrase in (
|
||||||
"what do you know",
|
"what do you know",
|
||||||
"tell me about",
|
"tell me about",
|
||||||
|
"interesting",
|
||||||
"overview",
|
"overview",
|
||||||
"summary",
|
"summary",
|
||||||
"describe",
|
"describe",
|
||||||
@ -2312,21 +2543,30 @@ def sync_loop(token: str, room_id: str):
|
|||||||
res = vm_query(promql, timeout=20)
|
res = vm_query(promql, timeout=20)
|
||||||
rendered = vm_render_result(res, limit=15) or "(no results)"
|
rendered = vm_render_result(res, limit=15) or "(no results)"
|
||||||
extra = "VictoriaMetrics (PromQL result):\n" + rendered
|
extra = "VictoriaMetrics (PromQL result):\n" + rendered
|
||||||
context = (context + "\n\n" + extra).strip() if context else extra
|
send_msg(token, rid, extra)
|
||||||
|
continue
|
||||||
fallback = "I don't have enough data to answer that."
|
fallback = "I don't have enough data to answer that."
|
||||||
|
|
||||||
llm_prompt = cleaned_body
|
|
||||||
if cluster_query:
|
if cluster_query:
|
||||||
llm_prompt = f"Atlas cluster question (use the cluster snapshot context): {cleaned_body}"
|
reply = cluster_answer(
|
||||||
reply = ollama_reply_with_thinking(
|
cleaned_body,
|
||||||
token,
|
inventory=inventory,
|
||||||
rid,
|
snapshot=snapshot,
|
||||||
hist_key,
|
workloads=workloads,
|
||||||
llm_prompt,
|
)
|
||||||
context=context,
|
if not reply:
|
||||||
fallback=fallback,
|
reply = fallback
|
||||||
use_history=cluster_query,
|
else:
|
||||||
)
|
llm_prompt = cleaned_body
|
||||||
|
reply = ollama_reply_with_thinking(
|
||||||
|
token,
|
||||||
|
rid,
|
||||||
|
hist_key,
|
||||||
|
llm_prompt,
|
||||||
|
context=context,
|
||||||
|
fallback=fallback,
|
||||||
|
use_history=False,
|
||||||
|
)
|
||||||
send_msg(token, rid, reply)
|
send_msg(token, rid, reply)
|
||||||
|
|
||||||
def login_with_retry():
|
def login_with_retry():
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user