atlasbot: strengthen facts context and replies

Brad Stein 2026-01-27 11:03:55 -03:00
parent c0dd00c93d
commit a442ea6d5d


@@ -34,6 +34,7 @@ SERVER_NAME = os.environ.get("MATRIX_SERVER_NAME", "live.bstein.dev")
 MAX_KB_CHARS = int(os.environ.get("ATLASBOT_MAX_KB_CHARS", "2500"))
 MAX_TOOL_CHARS = int(os.environ.get("ATLASBOT_MAX_TOOL_CHARS", "2500"))
 MAX_FACTS_CHARS = int(os.environ.get("ATLASBOT_MAX_FACTS_CHARS", "8000"))
+MAX_CONTEXT_CHARS = int(os.environ.get("ATLASBOT_MAX_CONTEXT_CHARS", "12000"))
 THINKING_INTERVAL_SEC = int(os.environ.get("ATLASBOT_THINKING_INTERVAL_SEC", "120"))
 OLLAMA_RETRIES = int(os.environ.get("ATLASBOT_OLLAMA_RETRIES", "2"))
 OLLAMA_SERIALIZE = os.environ.get("ATLASBOT_OLLAMA_SERIALIZE", "true").lower() != "false"
@@ -100,6 +101,7 @@ CODE_FENCE_RE = re.compile(r"^```(?:json)?\s*(.*?)\s*```$", re.DOTALL)
 TITAN_NODE_RE = re.compile(r"\btitan-[0-9a-z]{2}\b", re.IGNORECASE)
 TITAN_RANGE_RE = re.compile(r"\btitan-([0-9a-z]{2})/([0-9a-z]{2})\b", re.IGNORECASE)
 _DASH_CHARS = "\u2010\u2011\u2012\u2013\u2014\u2015\u2212\uFE63\uFF0D"
+CONFIDENCE_RE = re.compile(r"confidence\s*:\s*(high|medium|low)\b", re.IGNORECASE)

 OPERATION_HINTS = {
     "count": ("how many", "count", "number", "total"),
@@ -139,6 +141,20 @@ def _tokens(text: str) -> list[str]:
     return [t for t in toks if t not in STOPWORDS and len(t) >= 2]


+def _ensure_confidence(text: str) -> str:
+    if not text:
+        return ""
+    lines = text.strip().splitlines()
+    for idx, line in enumerate(lines):
+        match = CONFIDENCE_RE.search(line)
+        if match:
+            level = match.group(1).lower()
+            lines[idx] = f"Confidence: {level}"
+            return "\n".join(lines)
+    lines.append("Confidence: medium")
+    return "\n".join(lines)
+
+
 # Mention detection (Matrix rich mentions + plain @atlas).
 MENTION_TOKENS = [m.strip() for m in BOT_MENTIONS.split(",") if m.strip()]
 MENTION_LOCALPARTS = [m.lstrip("@").split(":", 1)[0] for m in MENTION_TOKENS]
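
As an aside, a quick sketch of the intended behavior of the new _ensure_confidence helper (illustrative inputs, not from the repo's tests): an existing confidence line is normalized in place, and a reply without one gets a medium default appended.

# Illustrative only: how _ensure_confidence normalizes replies.
print(_ensure_confidence("All nodes ready.\nconfidence: HIGH"))
# All nodes ready.
# Confidence: high
print(_ensure_confidence("All nodes ready."))
# All nodes ready.
# Confidence: medium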
@@ -710,6 +726,7 @@ def facts_context(
     workloads: list[dict[str, Any]] | None,
 ) -> str:
     inv = inventory or []
+    nodes_in_query = _extract_titan_nodes(prompt)
     metrics = _snapshot_metrics(snapshot)
     nodes = snapshot.get("nodes") if isinstance(snapshot, dict) else {}
     summary = snapshot.get("nodes_summary") if isinstance(snapshot, dict) else {}
@@ -721,6 +738,12 @@ def facts_context(
     not_ready_names = summary.get("not_ready_names") if isinstance(summary, dict) else nodes.get("not_ready_names")
     by_hardware = _group_nodes(inv) if inv else {}
     by_arch = _nodes_by_arch(inv) if inv else {}
+    control_plane_nodes = [
+        node["name"]
+        for node in inv
+        if any(role in ("control-plane", "master") for role in (node.get("roles") or []))
+    ]
+    worker_nodes = [node["name"] for node in inv if node.get("is_worker") is True]

     lines: list[str] = ["Facts (live snapshot):"]
     if total is not None:
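
A minimal sketch of the new role split, assuming inventory entries shaped the way facts_context reads them (name, roles, is_worker); the entries below are hypothetical:

# Hypothetical inventory entries; only the keys facts_context touches.
inv = [
    {"name": "titan-0a", "roles": ["control-plane"], "is_worker": False},
    {"name": "titan-0b", "roles": [], "is_worker": True},
    {"name": "titan-0c", "roles": None, "is_worker": True},
]
control_plane_nodes = [
    node["name"]
    for node in inv
    if any(role in ("control-plane", "master") for role in (node.get("roles") or []))
]
worker_nodes = [node["name"] for node in inv if node.get("is_worker") is True]
# control_plane_nodes == ["titan-0a"]; worker_nodes == ["titan-0b", "titan-0c"]
# Note: (node.get("roles") or []) keeps a None roles field from raising.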
@@ -731,9 +754,16 @@ def facts_context(
         nodes_list = by_hardware.get(key) or []
         if nodes_list:
             lines.append(f"- {key}: {', '.join(nodes_list)}")
+    non_rpi = sorted(set(by_hardware.get("jetson", [])) | set(by_hardware.get("amd64", [])))
+    if non_rpi:
+        lines.append(f"- non_raspberry_pi: {', '.join(non_rpi)}")
     for key, nodes_list in sorted(by_arch.items()):
         if nodes_list:
             lines.append(f"- arch {key}: {', '.join(nodes_list)}")
+    if control_plane_nodes:
+        lines.append(f"- control_plane_nodes: {', '.join(control_plane_nodes)}")
+    if worker_nodes:
+        lines.append(f"- worker_nodes: {', '.join(worker_nodes)}")
     if ready_workers or not_ready_workers:
         lines.append(f"- workers_ready: {', '.join(ready_workers) if ready_workers else 'none'}")
         if not_ready_workers:
@@ -753,7 +783,8 @@ def facts_context(
         node = entry.get("node")
         value = entry.get("value")
         if node and value is not None:
-            lines.append(f"- hottest_{key}: {node} ({value})")
+            value_fmt = _format_metric_value(str(value), percent=key in ("cpu", "ram"))
+            lines.append(f"- hottest_{key}: {node} ({value_fmt})")

     postgres = metrics.get("postgres_connections") if isinstance(metrics.get("postgres_connections"), dict) else {}
     if isinstance(postgres, dict) and postgres:
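
_format_metric_value is defined elsewhere in the file and its implementation is not part of this diff; a sketch of the kind of normalization it presumably performs (assumed behavior, not the real function):

def _format_metric_value_sketch(raw: str, *, percent: bool) -> str:
    # Assumption: non-numeric values pass through unchanged, numbers are
    # rounded, and percent metrics (cpu/ram) get a "%" suffix.
    try:
        num = float(raw)
    except ValueError:
        return raw
    return f"{num:.1f}%" if percent else f"{num:.1f}"

# _format_metric_value_sketch("87.3456", percent=True) -> "87.3%"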
@@ -774,12 +805,25 @@ def facts_context(
         node = entry.get("node")
         if not node:
             continue
-        cpu = entry.get("cpu")
-        ram = entry.get("ram")
-        net = entry.get("net")
-        io_val = entry.get("io")
+        cpu = _format_metric_value(str(entry.get("cpu")), percent=True) if entry.get("cpu") is not None else ""
+        ram = _format_metric_value(str(entry.get("ram")), percent=True) if entry.get("ram") is not None else ""
+        net = _format_metric_value(str(entry.get("net")), percent=False) if entry.get("net") is not None else ""
+        io_val = _format_metric_value(str(entry.get("io")), percent=False) if entry.get("io") is not None else ""
         lines.append(f"  - {node}: cpu={cpu}, ram={ram}, net={net}, io={io_val}")
+    if nodes_in_query:
+        lines.append("- node_details:")
+        for name in nodes_in_query:
+            detail = next((n for n in inv if n.get("name") == name), None)
+            if not detail:
+                lines.append(f"  - {name}: not found in snapshot")
+                continue
+            roles = ",".join(detail.get("roles") or []) or "none"
+            lines.append(
+                f"  - {name}: hardware={detail.get('hardware')}, arch={detail.get('arch')}, "
+                f"ready={detail.get('ready')}, roles={roles}"
+            )

     workload_entries = _workloads_for_prompt(prompt, workloads or [])
     if workload_entries:
         lines.append("- workloads:")
@@ -1181,11 +1225,10 @@ def metrics_query_context(prompt: str, *, allow_tools: bool) -> tuple[str, str]:
         if rendered:
             rendered_parts.append(rendered)
     if not rendered_parts:
-        return "", f"{panel}: matched dashboard panel but VictoriaMetrics did not return data."
+        return "", ""
     summary = "\n".join(rendered_parts)
     context = f"Metrics (from {dashboard} / {panel}):\n{summary}"
-    fallback = _metrics_fallback_summary(panel, summary)
-    return context, fallback
+    return context, ""


 def catalog_hints(query: str) -> tuple[str, list[tuple[str, str]]]:
     q = (query or "").strip()
@@ -1574,8 +1617,8 @@ def _normalize_reply(value: Any) -> str:
         try:
             return _normalize_reply(json.loads(text))
         except Exception:
-            return text
-    return text
+            return _ensure_confidence(text)
+    return _ensure_confidence(text)


 # Internal HTTP endpoint for cluster answers (website uses this).
@@ -1634,10 +1677,10 @@ class _AtlasbotHandler(BaseHTTPRequestHandler):
             snapshot=snapshot,
             workloads=workloads,
         )
-        metrics_context, metrics_fallback = metrics_query_context(prompt, allow_tools=True)
+        metrics_context, _metrics_fallback = metrics_query_context(prompt, allow_tools=True)
        if metrics_context:
             context = (context + "\n\n" + metrics_context).strip() if context else metrics_context
-        fallback = metrics_fallback or _context_fallback(context) or "I don't have enough data to answer that."
+        fallback = "I don't have enough data to answer that."
         answer = ollama_reply(("http", "internal"), prompt, context=context, fallback=fallback)
         self._write_json(200, {"answer": answer})
@@ -1665,19 +1708,19 @@ def build_context(
 ) -> str:
     parts: list[str] = []
-    kb = kb_retrieve(prompt)
-    if not kb and _knowledge_intent(prompt):
-        kb = kb_retrieve_titles(prompt, limit=4)
-    if kb:
-        parts.append(kb)
+    facts = facts_context(prompt, inventory=inventory, snapshot=snapshot, workloads=workloads)
+    if facts:
+        parts.append(facts)
     endpoints, edges = catalog_hints(prompt)
     if endpoints:
         parts.append(endpoints)
-    facts = facts_context(prompt, inventory=inventory, snapshot=snapshot, workloads=workloads)
-    if facts:
-        parts.append(facts)
+    kb = kb_retrieve(prompt)
+    if not kb and _knowledge_intent(prompt):
+        kb = kb_retrieve_titles(prompt, limit=4)
+    if kb:
+        parts.append(kb)

     if allow_tools:
         # Scope pod summaries to relevant namespaces/workloads when possible.
@@ -1789,12 +1832,14 @@ def _ollama_call(hist_key, prompt: str, *, context: str) -> str:
         "Never include or request secret values. "
         "Do not suggest commands unless explicitly asked. "
         "Respond in plain sentences; do not return JSON or code fences unless explicitly asked. "
+        "Translate metrics into natural language instead of echoing raw label/value pairs. "
+        "Do not answer by only listing runbooks; summarize the cluster first and mention docs only if useful. "
         "If the answer is not grounded in the provided context or tool data, say you do not know. "
         "End every response with a line: 'Confidence: high|medium|low'."
     )
     transcript_parts = [system]
     if context:
-        transcript_parts.append("Context (grounded):\n" + context[:MAX_KB_CHARS])
+        transcript_parts.append("Context (grounded):\n" + context[:MAX_CONTEXT_CHARS])
     transcript_parts.extend(history[hist_key][-24:])
     transcript_parts.append(f"User: {prompt}")
     transcript = "\n".join(transcript_parts)
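
The grounded context now has its own budget instead of borrowing the KB limit; a small sketch of the effect, using the 12000-char default from the top of the file:

# Sketch: context is clipped to MAX_CONTEXT_CHARS before the transcript
# is assembled, so a 20k-char context loses its tail at 12k by default.
MAX_CONTEXT_CHARS = 12000  # default of ATLASBOT_MAX_CONTEXT_CHARS
context = "x" * 20000
clipped = "Context (grounded):\n" + context[:MAX_CONTEXT_CHARS]
assert len(context[:MAX_CONTEXT_CHARS]) == 12000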
@@ -1950,11 +1995,11 @@ def sync_loop(token: str, room_id: str):
                 rendered = vm_render_result(res, limit=15) or "(no results)"
                 extra = "VictoriaMetrics (PromQL result):\n" + rendered
                 context = (context + "\n\n" + extra).strip() if context else extra
-            metrics_context, metrics_fallback = metrics_query_context(body, allow_tools=allow_metrics)
+            metrics_context, _metrics_fallback = metrics_query_context(body, allow_tools=allow_metrics)
             if metrics_context:
                 context = (context + "\n\n" + metrics_context).strip() if context else metrics_context
-            fallback = metrics_fallback or _context_fallback(context) or "I don't have enough data to answer that."
+            fallback = "I don't have enough data to answer that."
             reply = ollama_reply_with_thinking(
                 token,