From b7543d7e579ed669f3a6d3b333af6661cd9b6779 Mon Sep 17 00:00:00 2001
From: jenkins
Date: Tue, 21 Apr 2026 00:53:47 -0300
Subject: [PATCH] quality(atlasbot): enforce strict gate split

---
 Dockerfile | 2 +
 Jenkinsfile | 4 +
 atlasbot/api/http.py | 13 +-
 atlasbot/config.py | 7 +
 atlasbot/engine/answerer.py | 3589 ---------------------
 atlasbot/engine/answerer/__init__.py | 12 +
 atlasbot/engine/answerer/_base.py | 116 +
 atlasbot/engine/answerer/common.py | 395 +++
 atlasbot/engine/answerer/engine.py | 267 ++
 atlasbot/engine/answerer/factsheet.py | 189 ++
 atlasbot/engine/answerer/post.py | 459 +++
 atlasbot/engine/answerer/post_ext.py | 276 ++
 atlasbot/engine/answerer/retrieval.py | 344 ++
 atlasbot/engine/answerer/retrieval_ext.py | 197 ++
 atlasbot/engine/answerer/spine.py | 404 +++
 atlasbot/engine/answerer/workflow.py | 484 +++
 atlasbot/engine/answerer/workflow_post.py | 170 +
 atlasbot/engine/intent_router.py | 43 +-
 atlasbot/knowledge/loader.py | 14 +-
 atlasbot/llm/client.py | 8 +
 atlasbot/llm/prompts.py | 2 +-
 atlasbot/logging.py | 10 +-
 atlasbot/main.py | 15 +-
 atlasbot/matrix/bot.py | 28 +-
 atlasbot/queue/nats.py | 11 +-
 atlasbot/snapshot/builder.py | 1992 ------------
 atlasbot/snapshot/builder/__init__.py | 8 +
 atlasbot/snapshot/builder/core_a.py | 492 +++
 atlasbot/snapshot/builder/core_b.py | 57 +
 atlasbot/snapshot/builder/format_a.py | 497 +++
 atlasbot/snapshot/builder/format_b.py | 435 +++
 atlasbot/snapshot/builder/format_c.py | 448 +++
 atlasbot/snapshot/builder/summary_text.py | 72 +
 atlasbot/state/store.py | 11 +
 pyproject.toml | 21 +
 scripts/check_coverage.py | 42 +
 scripts/check_docstrings.py | 83 +
 scripts/check_file_sizes.py | 70 +
 testing/__init__.py | 2 +
 testing/fakes.py | 108 +
 tests/test_engine.py | 117 +-
 tests/test_quality_gate_paths.py | 810 +++++
 tests/test_split_helper_coverage.py | 1749 ++++++++++
 tests/test_support_modules.py | 1424 ++++++++
 44 files changed, 9781 insertions(+), 5716 deletions(-)
 delete mode 100644 atlasbot/engine/answerer.py
 create mode 100644 atlasbot/engine/answerer/__init__.py
 create mode 100644 atlasbot/engine/answerer/_base.py
 create mode 100644 atlasbot/engine/answerer/common.py
 create mode 100644 atlasbot/engine/answerer/engine.py
 create mode 100644 atlasbot/engine/answerer/factsheet.py
 create mode 100644 atlasbot/engine/answerer/post.py
 create mode 100644 atlasbot/engine/answerer/post_ext.py
 create mode 100644 atlasbot/engine/answerer/retrieval.py
 create mode 100644 atlasbot/engine/answerer/retrieval_ext.py
 create mode 100644 atlasbot/engine/answerer/spine.py
 create mode 100644 atlasbot/engine/answerer/workflow.py
 create mode 100644 atlasbot/engine/answerer/workflow_post.py
 delete mode 100644 atlasbot/snapshot/builder.py
 create mode 100644 atlasbot/snapshot/builder/__init__.py
 create mode 100644 atlasbot/snapshot/builder/core_a.py
 create mode 100644 atlasbot/snapshot/builder/core_b.py
 create mode 100644 atlasbot/snapshot/builder/format_a.py
 create mode 100644 atlasbot/snapshot/builder/format_b.py
 create mode 100644 atlasbot/snapshot/builder/format_c.py
 create mode 100644 atlasbot/snapshot/builder/summary_text.py
 create mode 100644 pyproject.toml
 create mode 100755 scripts/check_coverage.py
 create mode 100755 scripts/check_docstrings.py
 create mode 100755 scripts/check_file_sizes.py
 create mode 100644 testing/__init__.py
 create mode 100644 testing/fakes.py
 create mode 100644 tests/test_quality_gate_paths.py
 create mode 100644 tests/test_split_helper_coverage.py
 create mode 100644 tests/test_support_modules.py

diff --git a/Dockerfile b/Dockerfile
index 6d00cec..7e1c9a2 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -6,11 +6,13 @@ ENV PYTHONDONTWRITEBYTECODE=1 \
 WORKDIR /app

 COPY requirements.txt /app/requirements.txt
 COPY requirements-dev.txt /app/requirements-dev.txt
+COPY pyproject.toml /app/pyproject.toml

 RUN pip install --no-cache-dir -r /app/requirements.txt -r /app/requirements-dev.txt
 COPY atlasbot /app/atlasbot

 FROM base AS test
+COPY testing /app/testing
 COPY tests /app/tests
 COPY scripts /app/scripts

diff --git a/Jenkinsfile b/Jenkinsfile
index 32e64f8..3724777 100644
--- a/Jenkinsfile
+++ b/Jenkinsfile
@@ -75,6 +75,10 @@ spec:
     QUALITY_GATE_SONARQUBE_REPORT = 'build/sonarqube-quality-gate.json'
     QUALITY_GATE_IRONBANK_REPORT = 'build/ironbank-compliance.json'
   }
+  options {
+    disableConcurrentBuilds()
+    buildDiscarder(logRotator(daysToKeepStr: '30', numToKeepStr: '200', artifactDaysToKeepStr: '30', artifactNumToKeepStr: '120'))
+  }
   stages {
     stage('Checkout') {
       steps {

diff --git a/atlasbot/api/http.py b/atlasbot/api/http.py
index 49de446..e2d69e1 100644
--- a/atlasbot/api/http.py
+++ b/atlasbot/api/http.py
@@ -1,7 +1,6 @@
 import logging
-from typing import Any
-
 from collections.abc import Awaitable, Callable
+from typing import Any

 from fastapi import FastAPI, Header, HTTPException
 from pydantic import BaseModel
@@ -29,6 +28,16 @@ class AnswerResponse(BaseModel):


 class Api:
+    """Expose the answer API and enforce the shared internal token.
+
+    Input:
+    - `settings`: runtime configuration, including the optional internal token;
+    - `answer_handler`: async adapter that answers a normalized question.
+
+    Output:
+    - registers the HTTP routes on `self.app`.
+    """
+
     def __init__(
         self,
         settings: Settings,

diff --git a/atlasbot/config.py b/atlasbot/config.py
index bc0d321..99977ba 100644
--- a/atlasbot/config.py
+++ b/atlasbot/config.py
@@ -1,6 +1,7 @@
 import os
 from dataclasses import dataclass

+
 def _env_bool(name: str, default: str = "false") -> bool:
     value = os.getenv(name, default).strip().lower()
     return value in {"1", "true", "yes", "y", "on"}
@@ -121,6 +122,12 @@ def _load_matrix_bots(bot_mentions: tuple[str, ...]) -> tuple[MatrixBotConfig, .


 def load_settings() -> Settings:
+    """Load process settings from environment variables.
+
+    Output:
+    - a fully populated `Settings` instance with defaults for missing values.
+ """ + bot_mentions = tuple( [ item.strip() diff --git a/atlasbot/engine/answerer.py b/atlasbot/engine/answerer.py deleted file mode 100644 index 1906537..0000000 --- a/atlasbot/engine/answerer.py +++ /dev/null @@ -1,3589 +0,0 @@ -import asyncio -import json -import logging -import math -import re -import time -import difflib -from collections.abc import Awaitable -from dataclasses import dataclass -from typing import Any, Callable - -from atlasbot.config import Settings -from atlasbot.knowledge.loader import KnowledgeBase -from atlasbot.llm.client import LLMClient, build_messages, parse_json -from atlasbot.llm import prompts -from atlasbot.snapshot.builder import SnapshotProvider, build_summary, summary_text -from atlasbot.state.store import ClaimStore -from atlasbot.engine.intent_router import IntentMatch, route_intent - -log = logging.getLogger(__name__) - -FOLLOWUP_SHORT_WORDS = 6 -TOKEN_MIN_LEN = 3 -GENERIC_METRIC_TOKENS = {"atlas", "cluster", "kubernetes", "k8s", "titan", "lab"} -NS_ENTRY_MIN_LEN = 2 -DEDUP_MIN_SENTENCES = 3 -RUNBOOK_SIMILARITY_THRESHOLD = 0.4 -BYTES_KB = 1024 -BYTES_MB = 1024 * 1024 - - -class LLMLimitReached(RuntimeError): - pass - - -class LLMTimeBudgetExceeded(RuntimeError): - pass - - -@dataclass -class AnswerScores: - confidence: int - relevance: int - satisfaction: int - hallucination_risk: str - - -@dataclass -class AnswerResult: - reply: str - scores: AnswerScores - meta: dict[str, Any] - - -@dataclass(frozen=True) -class InsightGuardInput: - question: str - reply: str - classify: dict[str, Any] - context: str - plan: "ModePlan" - call_llm: Callable[..., Awaitable[str]] - facts: list[str] - - -@dataclass -class ContradictionContext: - call_llm: Callable[..., Awaitable[str]] - question: str - reply: str - facts: list[str] - plan: "ModePlan" - - -@dataclass -class EvidenceItem: - path: str - reason: str - value: Any | None = None - value_at_claim: Any | None = None - - -@dataclass -class ClaimItem: - id: str - claim: str - evidence: list[EvidenceItem] - - -@dataclass -class ConversationState: - updated_at: float - claims: list[ClaimItem] - snapshot_id: str | None = None - snapshot: dict[str, Any] | None = None - - -@dataclass -class ModePlan: - model: str - fast_model: str - max_subquestions: int - chunk_lines: int - chunk_top: int - chunk_group: int - kb_max_chars: int - kb_max_files: int - use_raw_snapshot: bool - parallelism: int - score_retries: int - use_deep_retrieval: bool - use_tool: bool - use_critic: bool - use_gap: bool - use_scores: bool - drafts: int - metric_retries: int - subanswer_retries: int - - -@dataclass -class ScoreContext: - question: str - sub_questions: list[str] - retries: int - parallelism: int - select_best: bool - fast_model: str - - -class AnswerEngine: - def __init__( - self, - settings: Settings, - llm: LLMClient, - kb: KnowledgeBase, - snapshot: SnapshotProvider, - ) -> None: - self._settings = settings - self._llm = llm - self._kb = kb - self._snapshot = snapshot - self._store = ClaimStore(settings.state_db_path, settings.conversation_ttl_sec) - - async def answer( # noqa: C901, PLR0912, PLR0913, PLR0915 - self, - question: str, - *, - mode: str, - history: list[dict[str, str]] | None = None, - observer: Callable[[str, str], None] | None = None, - conversation_id: str | None = None, - snapshot_pin: bool | None = None, - ) -> AnswerResult: - question = (question or "").strip() - if not question: - return AnswerResult("I need a question to answer.", _default_scores(), {"mode": mode}) - if mode == "stock": - return await 
self._answer_stock(question) - - limitless = "run limitless" in question.lower() - if limitless: - question = re.sub(r"(?i)run limitless", "", question).strip() - plan = _mode_plan(self._settings, mode) - call_limit = _llm_call_limit(self._settings, mode) - call_cap = math.ceil(call_limit * self._settings.llm_limit_multiplier) - call_count = 0 - limit_hit = False - time_budget_hit = False - - debug_tags = { - "route", - "decompose", - "chunk_score", - "chunk_select", - "fact_select", - "synth", - "subanswer", - "tool", - "followup", - "select_claims", - "evidence_fix", - } - started = time.monotonic() - time_budget_sec = _mode_time_budget(self._settings, mode) if not limitless else 0.0 - - def _debug_log(name: str, payload: Any) -> None: - if not self._settings.debug_pipeline: - return - log.info("atlasbot_debug", extra={"extra": {"name": name, "payload": payload}}) - - async def call_llm(system: str, prompt: str, *, context: str | None = None, model: str | None = None, tag: str = "") -> str: - nonlocal call_count, limit_hit, time_budget_hit - if not limitless and call_count >= call_cap: - limit_hit = True - raise LLMLimitReached("llm_limit") - timeout_sec = None - if not limitless and time_budget_sec > 0: - time_left = time_budget_sec - (time.monotonic() - started) - if time_left <= 0: - time_budget_hit = True - raise LLMTimeBudgetExceeded("time_budget") - timeout_sec = min(self._settings.ollama_timeout_sec, time_left) - call_count += 1 - messages = build_messages(system, prompt, context=context) - try: - llm_call = self._llm.chat(messages, model=model or plan.model, timeout_sec=timeout_sec) - if timeout_sec is not None: - response = await asyncio.wait_for(llm_call, timeout=max(0.001, timeout_sec)) - else: - response = await llm_call - except asyncio.TimeoutError as exc: - time_budget_hit = True - raise LLMTimeBudgetExceeded("time_budget") from exc - log.info( - "atlasbot_llm_call", - extra={"extra": {"mode": mode, "tag": tag, "call": call_count, "limit": call_cap}}, - ) - if self._settings.debug_pipeline and tag in debug_tags: - _debug_log(f"llm_raw_{tag}", str(response)[:1200]) - return response - - state = self._get_state(conversation_id) - pin_snapshot = bool(snapshot_pin) or self._settings.snapshot_pin_enabled - snapshot = self._snapshot.get() - snapshot_used = snapshot - if pin_snapshot and state and state.snapshot: - snapshot_used = state.snapshot - summary = build_summary(snapshot_used) - allowed_nodes = _allowed_nodes(summary) - allowed_namespaces = _allowed_namespaces(summary) - summary_lines = _summary_lines(snapshot_used) - spine = _spine_from_summary(summary) or _spine_lines(summary_lines) - metric_tokens = _metric_key_tokens(summary_lines) - global_facts = _global_facts(summary_lines) - kb_summary = self._kb.summary() - runbooks = self._kb.runbook_titles(limit=6) - runbook_paths = self._kb.runbook_paths(limit=10) - history_ctx = _format_history(history) - lexicon_ctx = _lexicon_context(summary) - key_facts: list[str] = [] - metric_facts: list[str] = [] - facts_used: list[str] = [] - - reply = "" - scores = _default_scores() - claims: list[ClaimItem] = [] - classify: dict[str, Any] = {} - tool_hint: dict[str, Any] | None = None - try: - if mode in {"quick", "fast", "smart", "genius"} and not limitless: - if observer: - observer("factsheet", "building fact sheet") - if _is_plain_math_question(question): - reply = ( - "I focus on Titan cluster operations. Ask me about cluster health, nodes, workloads, " - "namespaces, storage, or alerts." 
- ) - scores = _default_scores() - meta = _build_meta( - mode, - call_count, - call_cap, - limit_hit, - time_budget_hit, - time_budget_sec, - classify, - tool_hint, - started, - ) - return AnswerResult(reply, scores, meta) - kb_lines = ( - self._kb.chunk_lines( - max_files=plan.kb_max_files, - max_chars=_factsheet_kb_chars(mode, plan.kb_max_chars), - ) - if self._kb - else [] - ) - fact_lines = _quick_fact_sheet_lines( - question, - summary_lines, - kb_lines, - limit=_factsheet_line_limit(mode), - ) - if observer: - observer("quick", "answering from fact sheet") - classify = { - "needs_snapshot": True, - "needs_kb": bool(kb_lines), - "question_type": f"{mode}_factsheet", - "answer_style": "direct" if mode in {"quick", "fast"} else "concise", - "follow_up": False, - } - heuristic_reply = _quick_fact_sheet_heuristic_answer(question, fact_lines) - if heuristic_reply: - reply = heuristic_reply - scores = _default_scores() - meta = _build_meta( - mode, - call_count, - call_cap, - limit_hit, - time_budget_hit, - time_budget_sec, - classify, - tool_hint, - started, - ) - return AnswerResult(reply, scores, meta) - quick_context = _quick_fact_sheet_text(fact_lines) - quick_prompt = ( - "Question: " - + question - + "\nAnswer using only the Fact Sheet. " - + _factsheet_instruction(mode) - ) - reply = await call_llm( - prompts.ANSWER_SYSTEM, - quick_prompt, - context=quick_context, - model=_factsheet_model(mode, plan), - tag=f"{mode}_factsheet", - ) - reply = _strip_followup_meta(reply) - scores = _default_scores() - meta = _build_meta( - mode, - call_count, - call_cap, - limit_hit, - time_budget_hit, - time_budget_sec, - classify, - tool_hint, - started, - ) - return AnswerResult(reply, scores, meta) - - if observer: - observer("normalize", "normalizing") - normalize_prompt = prompts.NORMALIZE_PROMPT + "\nQuestion: " + question - normalize_raw = await call_llm( - prompts.NORMALIZE_SYSTEM, - normalize_prompt, - context=lexicon_ctx, - model=plan.fast_model, - tag="normalize", - ) - normalize = _parse_json_block(normalize_raw, fallback={"normalized": question, "keywords": []}) - normalized = str(normalize.get("normalized") or question).strip() or question - keywords = normalize.get("keywords") or [] - _debug_log("normalize_parsed", {"normalized": normalized, "keywords": keywords}) - keyword_tokens = _extract_keywords(question, normalized, sub_questions=[], keywords=keywords) - question_tokens = _extract_question_tokens(normalized) - - if observer: - observer("route", "routing") - route_prompt = prompts.ROUTE_PROMPT + "\nQuestion: " + normalized + "\nKeywords: " + json.dumps(keywords) - route_raw = await call_llm( - prompts.ROUTE_SYSTEM, - route_prompt, - context=_join_context([kb_summary, lexicon_ctx]), - model=plan.fast_model, - tag="route", - ) - classify = _parse_json_block(route_raw, fallback={}) - classify.setdefault("needs_snapshot", True) - classify.setdefault("answer_style", "direct") - classify.setdefault("follow_up", False) - classify.setdefault("focus_entity", "unknown") - classify.setdefault("focus_metric", "unknown") - if metric_tokens and keyword_tokens and any(token in metric_tokens for token in keyword_tokens): - classify["needs_snapshot"] = True - intent = route_intent(normalized) - if intent: - classify["needs_snapshot"] = True - classify["question_type"] = "metric" - _debug_log("route_parsed", {"classify": classify, "normalized": normalized}) - lowered_question = f"{question} {normalized}".lower() - force_metric = bool(re.search(r"\bhow many\b|\bcount\b|\btotal\b", lowered_question)) 
- if any(term in lowered_question for term in ("postgres", "connections", "pvc", "ready")): - force_metric = True - - if intent: - spine_line = spine.get(intent.kind) if isinstance(spine, dict) else None - if not spine_line: - spine_line = _spine_fallback(intent, summary_lines) - spine_answer = _spine_answer(intent, spine_line) - if spine_line: - key_facts = _merge_fact_lines([spine_line], key_facts) - metric_facts = _merge_fact_lines([spine_line], metric_facts) - if spine_answer and mode in {"fast", "quick"}: - scores = _default_scores() - meta = _build_meta( - mode, - call_count, - call_cap, - limit_hit, - time_budget_hit, - time_budget_sec, - classify, - tool_hint, - started, - ) - return AnswerResult(spine_answer, scores, meta) - cluster_terms = ( - "atlas", - "cluster", - "node", - "nodes", - "namespace", - "pod", - "workload", - "k8s", - "kubernetes", - "postgres", - "database", - "db", - "connections", - "cpu", - "ram", - "memory", - "network", - "io", - "disk", - "pvc", - "storage", - ) - has_cluster_terms = any(term in lowered_question for term in cluster_terms) - if has_cluster_terms: - classify["needs_snapshot"] = True - lowered_norm = normalized.lower() - if ( - ("namespace" in lowered_norm and ("pod" in lowered_norm or "pods" in lowered_norm)) - or re.search(r"\bmost\s+pods\b", lowered_norm) - or re.search(r"\bpods\s+running\b", lowered_norm) - ): - classify["question_type"] = "metric" - classify["needs_snapshot"] = True - if re.search(r"\b(how many|count|number of|list)\b", lowered_question): - classify["question_type"] = "metric" - if any(term in lowered_question for term in ("postgres", "connections", "db")): - classify["question_type"] = "metric" - classify["needs_snapshot"] = True - if any(term in lowered_question for term in ("pvc", "persistentvolume", "persistent volume", "storage")): - if classify.get("question_type") not in {"metric", "diagnostic"}: - classify["question_type"] = "metric" - classify["needs_snapshot"] = True - if "ready" in lowered_question and classify.get("question_type") not in {"metric", "diagnostic"}: - classify["question_type"] = "diagnostic" - hottest_terms = ("hottest", "highest", "lowest", "most") - metric_terms = ("cpu", "ram", "memory", "net", "network", "io", "disk", "load", "usage", "pod", "pods", "namespace") - if any(term in lowered_question for term in hottest_terms) and any(term in lowered_question for term in metric_terms): - classify["question_type"] = "metric" - baseline_terms = ("baseline", "delta", "trend", "increase", "decrease", "drop", "spike", "regression", "change") - if any(term in lowered_question for term in baseline_terms) and any(term in lowered_question for term in metric_terms): - classify["question_type"] = "metric" - classify["needs_snapshot"] = True - - if not classify.get("follow_up") and state and state.claims: - follow_terms = ("there", "that", "those", "these", "it", "them", "that one", "this", "former", "latter") - is_metric_query = force_metric or classify.get("question_type") in {"metric", "diagnostic"} - if not is_metric_query: - if any(term in lowered_question for term in follow_terms): - classify["follow_up"] = True - elif len(normalized.split()) <= FOLLOWUP_SHORT_WORDS and not has_cluster_terms: - classify["follow_up"] = True - - if classify.get("follow_up") and state and state.claims: - if observer: - observer("followup", "answering follow-up") - reply = await self._answer_followup(question, state, summary, classify, plan, call_llm) - scores = await self._score_answer(question, reply, plan, call_llm) - 
meta = _build_meta( - mode, - call_count, - call_cap, - limit_hit, - time_budget_hit, - time_budget_sec, - classify, - tool_hint, - started, - ) - return AnswerResult(reply, scores, meta) - - if observer: - observer("decompose", "decomposing") - decompose_prompt = prompts.DECOMPOSE_PROMPT.format(max_parts=plan.max_subquestions * 2) - decompose_raw = await call_llm( - prompts.DECOMPOSE_SYSTEM, - decompose_prompt + "\nQuestion: " + normalized, - context=lexicon_ctx, - model=plan.fast_model if mode == "quick" else plan.model, - tag="decompose", - ) - parts = _parse_json_list(decompose_raw) - sub_questions = _select_subquestions(parts, normalized, plan.max_subquestions) - _debug_log("decompose_parsed", {"sub_questions": sub_questions}) - keyword_tokens = _extract_keywords(question, normalized, sub_questions=sub_questions, keywords=keywords) - focus_entity = str(classify.get("focus_entity") or "unknown").lower() - focus_metric = str(classify.get("focus_metric") or "unknown").lower() - lowered_q = f"{question} {normalized}".lower() - if "node" in lowered_q: - focus_entity = "node" - - snapshot_context = "" - signal_tokens: list[str] = [] - if classify.get("needs_snapshot"): - if observer: - observer("retrieve", "scoring chunks") - chunks = _chunk_lines(summary_lines, plan.chunk_lines) - if plan.use_raw_snapshot: - raw_chunks = _raw_snapshot_chunks(snapshot_used) - if raw_chunks: - chunks.extend(raw_chunks) - kb_lines = self._kb.chunk_lines(max_files=plan.kb_max_files, max_chars=plan.kb_max_chars) if self._kb else [] - if kb_lines: - kb_chunks = _chunk_lines(kb_lines, plan.chunk_lines) - for idx, chunk in enumerate(kb_chunks): - chunk["id"] = f"k{idx}" - chunks.extend(kb_chunks) - metric_keys: list[str] = [] - must_chunk_ids: list[str] = [] - metric_task = None - if (classify.get("question_type") in {"metric", "diagnostic"} or force_metric) and summary_lines: - metric_ctx = { - "question": normalized, - "sub_questions": sub_questions, - "keywords": keywords, - "keyword_tokens": keyword_tokens, - "summary_lines": summary_lines, - } - metric_task = asyncio.create_task(_select_metric_chunks(call_llm, metric_ctx, chunks, plan)) - scored_task = asyncio.create_task(_score_chunks(call_llm, chunks, normalized, sub_questions, plan)) - if metric_task: - metric_keys, must_chunk_ids = await metric_task - scored = await scored_task - selected = _select_chunks(chunks, scored, plan, keyword_tokens, must_chunk_ids) - fact_candidates = _collect_fact_candidates(selected, limit=plan.max_subquestions * 12) - key_facts = await _select_fact_lines( - call_llm, - normalized, - fact_candidates, - plan, - max_lines=max(4, plan.max_subquestions * 2), - ) - metric_facts: list[str] = [] - if classify.get("question_type") in {"metric", "diagnostic"} or force_metric: - global_metric_facts: list[str] = [] - if global_facts: - global_metric_facts = await _select_fact_lines( - call_llm, - normalized, - global_facts, - plan, - max_lines=min(2, max(1, plan.max_subquestions)), - ) - if not global_metric_facts and (keyword_tokens or question_tokens): - tokens = set(keyword_tokens or question_tokens) - tokens = {tok for tok in tokens if tok and tok not in GENERIC_METRIC_TOKENS} - global_metric_facts = _rank_metric_lines(global_facts, tokens, max_lines=2) - if global_metric_facts: - key_facts = _merge_fact_lines(global_metric_facts, key_facts) - all_tokens = _merge_tokens(signal_tokens, keyword_tokens, question_tokens) - if plan.use_deep_retrieval: - if observer: - observer("retrieve", "extracting fact types") - fact_types = await 
_extract_fact_types( - call_llm, - normalized, - keyword_tokens, - plan, - ) - if observer: - observer("retrieve", "deriving signals") - signals = await _derive_signals( - call_llm, - normalized, - fact_types, - plan, - ) - if isinstance(signals, list): - signal_tokens = [str(item) for item in signals if item] - all_tokens = _merge_tokens(signal_tokens, keyword_tokens, question_tokens) - if observer: - observer("retrieve", "scanning chunks") - candidate_lines: list[str] = [] - if signals: - for chunk in selected: - chunk_lines = chunk["text"].splitlines() - if not chunk_lines: - continue - hits = await _scan_chunk_for_signals( - call_llm, - normalized, - signals, - chunk_lines, - plan, - ) - if hits: - candidate_lines.extend(hits) - candidate_lines = list(dict.fromkeys(candidate_lines)) - if candidate_lines: - if observer: - observer("retrieve", "pruning candidates") - metric_facts = await _prune_metric_candidates( - call_llm, - normalized, - candidate_lines, - plan, - plan.metric_retries, - ) - if metric_facts: - key_facts = _merge_fact_lines(metric_facts, key_facts) - if self._settings.debug_pipeline: - _debug_log("metric_facts_selected", {"facts": metric_facts}) - if not metric_facts: - if observer: - observer("retrieve", "fallback metric selection") - token_set = {tok for tok in all_tokens if tok and tok not in GENERIC_METRIC_TOKENS} - fallback_candidates = _rank_metric_lines(summary_lines, token_set, max_lines=200) - if fallback_candidates: - metric_facts = await _select_fact_lines( - call_llm, - normalized, - fallback_candidates, - plan, - max_lines=max(2, plan.max_subquestions), - ) - if not metric_facts and fallback_candidates: - metric_facts = fallback_candidates[: max(2, plan.max_subquestions)] - if metric_keys: - key_lines = _lines_for_metric_keys(summary_lines, metric_keys, max_lines=plan.max_subquestions * 3) - if key_lines: - metric_facts = _merge_fact_lines(key_lines, metric_facts) - if metric_facts: - metric_cover_tokens = [tok for tok in keyword_tokens if tok and tok not in GENERIC_METRIC_TOKENS] - if not metric_cover_tokens: - metric_cover_tokens = [tok for tok in question_tokens if tok and tok not in GENERIC_METRIC_TOKENS] - metric_facts = _ensure_token_coverage( - metric_facts, - metric_cover_tokens or all_tokens, - summary_lines, - max_add=plan.max_subquestions, - ) - if metric_cover_tokens: - ranked_metric_lines = _rank_metric_lines( - summary_lines, - set(metric_cover_tokens), - max_lines=max(1, plan.max_subquestions), - ) - if ranked_metric_lines: - metric_facts = _merge_fact_lines(ranked_metric_lines, metric_facts) - if metric_facts and not _has_keyword_overlap(metric_facts, keyword_tokens): - best_line = _best_keyword_line(summary_lines, keyword_tokens) - if best_line: - metric_facts = _merge_fact_lines([best_line], metric_facts) - if metric_facts: - key_facts = _merge_fact_lines(metric_facts, key_facts) - if global_metric_facts: - metric_facts = _merge_fact_lines(global_metric_facts, metric_facts) - if (classify.get("question_type") in {"metric", "diagnostic"} or force_metric) and not metric_facts and key_facts: - metric_facts = key_facts - if key_facts: - key_facts = _ensure_token_coverage( - key_facts, - all_tokens, - summary_lines, - max_add=plan.max_subquestions, - ) - if self._settings.debug_pipeline: - scored_preview = sorted( - [{"id": c["id"], "score": scored.get(c["id"], 0.0), "summary": c["summary"]} for c in chunks], - key=lambda item: item["score"], - reverse=True, - )[: min(len(chunks), max(plan.chunk_top, 6))] - _debug_log( - "chunk_selected", - { - 
"selected_ids": [item["id"] for item in selected], - "top_scored": scored_preview, - "metric_keys": metric_keys, - "forced_chunks": must_chunk_ids, - }, - ) - facts_used = list(dict.fromkeys(key_facts)) if key_facts else list(dict.fromkeys(metric_facts)) - snapshot_context = "ClusterSnapshot:\n" + "\n".join([chunk["text"] for chunk in selected]) - combined_facts = key_facts - if global_facts: - combined_facts = _merge_fact_lines(global_facts, key_facts) - if combined_facts: - snapshot_context = "KeyFacts:\n" + "\n".join(combined_facts) + "\n\n" + snapshot_context - - context = _join_context( - [kb_summary, _format_runbooks(runbooks), snapshot_context, history_ctx if classify.get("follow_up") else ""] - ) - - if plan.use_tool and classify.get("needs_tool"): - if observer: - observer("tool", "suggesting tools") - tool_prompt = prompts.TOOL_PROMPT + "\nQuestion: " + normalized - tool_raw = await call_llm(prompts.TOOL_SYSTEM, tool_prompt, context=context, model=plan.fast_model, tag="tool") - tool_hint = _parse_json_block(tool_raw, fallback={}) - - if observer: - observer("subanswers", "drafting subanswers") - subanswers: list[str] = [] - async def _subanswer_for(subq: str) -> str: - sub_prompt = prompts.SUBANSWER_PROMPT + "\nQuestion: " + subq - if plan.subanswer_retries > 1: - candidates = await _gather_limited( - [ - call_llm( - prompts.ANSWER_SYSTEM, - sub_prompt, - context=context, - model=plan.model, - tag="subanswer", - ) - for _ in range(plan.subanswer_retries) - ], - plan.parallelism, - ) - best_idx = await _select_best_candidate(call_llm, subq, candidates, plan, "subanswer_select") - return candidates[best_idx] - return await call_llm( - prompts.ANSWER_SYSTEM, - sub_prompt, - context=context, - model=plan.model, - tag="subanswer", - ) - - if plan.parallelism > 1 and len(sub_questions) > 1: - subanswers = await _gather_limited( - [_subanswer_for(subq) for subq in sub_questions], - plan.parallelism, - ) - else: - for subq in sub_questions: - subanswers.append(await _subanswer_for(subq)) - - if observer: - observer("synthesize", "synthesizing") - reply = await self._synthesize_answer(normalized, subanswers, context, classify, plan, call_llm) - - unknown_nodes = _find_unknown_nodes(reply, allowed_nodes) - unknown_namespaces = _find_unknown_namespaces(reply, allowed_namespaces) - runbook_fix = _needs_runbook_fix(reply, runbook_paths) - runbook_needed = _needs_runbook_reference(normalized, runbook_paths, reply) - needs_evidence = _needs_evidence_fix(reply, classify) - hardware_terms = ("rpi", "raspberry", "jetson", "amd64", "arm64", "hardware") - hardware_line = _line_starting_with(summary_lines, "hardware_nodes:") - if any(term in lowered_question for term in hardware_terms) and hardware_line: - needs_evidence = True - if metric_facts and (classify.get("question_type") in {"metric", "diagnostic"} or force_metric): - if not _reply_matches_metric_facts(reply, metric_facts, all_tokens): - needs_evidence = True - if classify.get("question_type") in {"open_ended", "planning"} and metric_facts: - needs_evidence = True - resolved_runbook = None - if runbook_paths and (runbook_fix or runbook_needed): - resolver_prompt = prompts.RUNBOOK_SELECT_PROMPT + "\nQuestion: " + normalized - resolver_raw = await call_llm( - prompts.RUNBOOK_SELECT_SYSTEM, - resolver_prompt, - context="AllowedRunbooks:\n" + "\n".join(runbook_paths), - model=plan.fast_model, - tag="runbook_select", - ) - resolver = _parse_json_block(resolver_raw, fallback={}) - candidate = resolver.get("path") if 
isinstance(resolver.get("path"), str) else None - if candidate and candidate in runbook_paths: - resolved_runbook = candidate - if (snapshot_context and needs_evidence) or unknown_nodes or unknown_namespaces or runbook_fix or runbook_needed: - if observer: - observer("evidence_fix", "repairing missing evidence") - extra_bits = [] - if unknown_nodes: - extra_bits.append("UnknownNodes: " + ", ".join(sorted(unknown_nodes))) - if unknown_namespaces: - extra_bits.append("UnknownNamespaces: " + ", ".join(sorted(unknown_namespaces))) - if runbook_paths: - extra_bits.append("AllowedRunbooks: " + ", ".join(runbook_paths)) - if resolved_runbook: - extra_bits.append("ResolvedRunbook: " + resolved_runbook) - if metric_facts: - extra_bits.append("MustUseFacts: " + "; ".join(metric_facts[:4])) - if hardware_line: - extra_bits.append("HardwareNodes: " + hardware_line) - if allowed_nodes: - extra_bits.append("AllowedNodes: " + ", ".join(allowed_nodes)) - if allowed_namespaces: - extra_bits.append("AllowedNamespaces: " + ", ".join(allowed_namespaces)) - fix_prompt = ( - prompts.EVIDENCE_FIX_PROMPT - + "\nQuestion: " - + normalized - + "\nDraft: " - + reply - + ("\n" + "\n".join(extra_bits) if extra_bits else "") - ) - reply = await call_llm( - prompts.EVIDENCE_FIX_SYSTEM, - fix_prompt, - context=context, - model=plan.model, - tag="evidence_fix", - ) - if metric_facts and not _reply_matches_metric_facts(reply, metric_facts, all_tokens): - enforce_prompt = ( - prompts.EVIDENCE_FIX_PROMPT - + "\nQuestion: " - + normalized - + "\nDraft: " - + reply - + "\nMustIncludeFacts: " - + "; ".join(metric_facts[:6]) - + "\nInstruction: The answer must include all MustIncludeFacts items." - ) - reply = await call_llm( - prompts.EVIDENCE_FIX_SYSTEM, - enforce_prompt, - context=context, - model=plan.model, - tag="evidence_fix_enforce", - ) - if metric_facts and not _reply_matches_metric_facts(reply, metric_facts, all_tokens): - direct_candidates = [] - if metric_keys: - direct_candidates = _lines_for_metric_keys( - summary_lines, - metric_keys, - max_lines=plan.max_subquestions * 3, - ) - if not direct_candidates: - direct_candidates = summary_lines - direct_line = _select_metric_line(direct_candidates, normalized, all_tokens) - if direct_line: - direct_prompt = f"Question: {normalized}\nFact: {direct_line}\nAnswer using the fact." 
- reply = await call_llm( - prompts.ANSWER_SYSTEM, - direct_prompt, - context="", - model=plan.fast_model, - tag="metric_direct", - ) - if mode == "quick" and any(term in normalized.lower() for term in ("how many", "count", "total")): - reply = _format_direct_metric_line(direct_line) - elif not _reply_matches_metric_facts(reply, [direct_line], all_tokens): - reply = _format_direct_metric_line(direct_line) - - if "raspberry" in lowered_question and "not" in lowered_question: - non_rpi = _non_rpi_nodes(summary) - if non_rpi: - reply = _format_hardware_groups(non_rpi, "Non-Raspberry Pi nodes") - if unknown_nodes or unknown_namespaces: - refreshed_nodes = _find_unknown_nodes(reply, allowed_nodes) - refreshed_namespaces = _find_unknown_namespaces(reply, allowed_namespaces) - if refreshed_nodes or refreshed_namespaces: - reply = _strip_unknown_entities(reply, refreshed_nodes, refreshed_namespaces) - if runbook_paths and resolved_runbook and _needs_runbook_reference(normalized, runbook_paths, reply): - if observer: - observer("runbook_enforce", "enforcing runbook path") - enforce_prompt = prompts.RUNBOOK_ENFORCE_PROMPT.format(path=resolved_runbook) - reply = await call_llm( - prompts.RUNBOOK_ENFORCE_SYSTEM, - enforce_prompt + "\nAnswer: " + reply, - context=context, - model=plan.model, - tag="runbook_enforce", - ) - if runbook_paths: - invalid = [ - token - for token in re.findall(r"runbooks/[A-Za-z0-9._-]+", reply) - if token.lower() not in {p.lower() for p in runbook_paths} - ] - if invalid: - if observer: - observer("runbook_enforce", "replacing invalid runbook path") - resolver_prompt = prompts.RUNBOOK_SELECT_PROMPT + "\nQuestion: " + normalized - resolver_raw = await call_llm( - prompts.RUNBOOK_SELECT_SYSTEM, - resolver_prompt, - context="AllowedRunbooks:\n" + "\n".join(runbook_paths), - model=plan.fast_model, - tag="runbook_select", - ) - resolver = _parse_json_block(resolver_raw, fallback={}) - candidate = resolver.get("path") if isinstance(resolver.get("path"), str) else None - if not (candidate and candidate in runbook_paths): - candidate = _best_runbook_match(invalid[0], runbook_paths) - if candidate and candidate in runbook_paths: - enforce_prompt = prompts.RUNBOOK_ENFORCE_PROMPT.format(path=candidate) - reply = await call_llm( - prompts.RUNBOOK_ENFORCE_SYSTEM, - enforce_prompt + "\nAnswer: " + reply, - context=context, - model=plan.model, - tag="runbook_enforce", - ) - reply = _strip_unknown_entities(reply, unknown_nodes, unknown_namespaces) - - if facts_used and _needs_evidence_guard(reply, facts_used): - if observer: - observer("evidence_guard", "tightening unsupported claims") - use_guard = True - if mode in {"smart", "genius"}: - decision = await _contradiction_decision( - ContradictionContext(call_llm, normalized, reply, facts_used, plan), - attempts=3 if mode == "genius" else 1, - ) - use_guard = decision.get("use_facts", True) - if use_guard: - guard_prompt = ( - prompts.EVIDENCE_GUARD_PROMPT - + "\nQuestion: " - + normalized - + "\nDraft: " - + reply - + "\nFactsUsed:\n" - + "\n".join(facts_used) - ) - reply = await call_llm( - prompts.EVIDENCE_GUARD_SYSTEM, - guard_prompt, - context=context, - model=plan.model, - tag="evidence_guard", - ) - - if _needs_focus_fix(normalized, reply, classify): - if observer: - observer("focus_fix", "tightening answer") - reply = await call_llm( - prompts.EVIDENCE_FIX_SYSTEM, - prompts.FOCUS_FIX_PROMPT + "\nQuestion: " + normalized + "\nDraft: " + reply, - context=context, - model=plan.model, - tag="focus_fix", - ) - if not metric_facts or not 
_has_keyword_overlap(metric_facts, keyword_tokens): - best_line = _best_keyword_line(summary_lines, keyword_tokens) - if best_line: - reply = f"Latest metrics: {best_line}." - if (classify.get("question_type") in {"metric", "diagnostic"} or force_metric) and metric_facts: - best_line = None - lowered_keywords = [kw.lower() for kw in keyword_tokens if kw] - for line in metric_facts: - line_lower = line.lower() - if any(kw in line_lower for kw in lowered_keywords): - best_line = line - break - best_line = best_line or metric_facts[0] - reply_numbers = set(re.findall(r"\d+(?:\.\d+)?", reply)) - fact_numbers = set(re.findall(r"\d+(?:\.\d+)?", " ".join(metric_facts))) - if not reply_numbers or (fact_numbers and not (reply_numbers & fact_numbers)): - reply = f"Latest metrics: {best_line}." - - if _should_use_insight_guard(classify): - if observer: - observer("insight_guard", "checking for concrete signals") - reply = await _apply_insight_guard( - InsightGuardInput( - question=normalized, - reply=reply, - classify=classify, - context=context, - plan=plan, - call_llm=call_llm, - facts=metric_facts or key_facts, - ) - ) - - if plan.use_critic: - if observer: - observer("critic", "reviewing") - critic_prompt = prompts.CRITIC_PROMPT + "\nQuestion: " + normalized + "\nAnswer: " + reply - critic_raw = await call_llm(prompts.CRITIC_SYSTEM, critic_prompt, context=context, model=plan.model, tag="critic") - critic = _parse_json_block(critic_raw, fallback={}) - if critic.get("issues"): - revise_prompt = ( - prompts.REVISION_PROMPT - + "\nQuestion: " - + normalized - + "\nDraft: " - + reply - + "\nCritique: " - + json.dumps(critic) - ) - reply = await call_llm(prompts.REVISION_SYSTEM, revise_prompt, context=context, model=plan.model, tag="revise") - - if plan.use_gap: - if observer: - observer("gap", "checking gaps") - gap_prompt = prompts.EVIDENCE_GAP_PROMPT + "\nQuestion: " + normalized + "\nAnswer: " + reply - gap_raw = await call_llm(prompts.GAP_SYSTEM, gap_prompt, context=context, model=plan.fast_model, tag="gap") - gap = _parse_json_block(gap_raw, fallback={}) - note = str(gap.get("note") or "").strip() - if note: - reply = f"{reply}\n\n{note}" - - - reply = await self._dedup_reply(reply, plan, call_llm, tag="dedup") - - scores = await self._score_answer(normalized, reply, plan, call_llm) - claims = await self._extract_claims(normalized, reply, summary, facts_used, call_llm) - except LLMTimeBudgetExceeded: - time_budget_hit = True - if not reply: - budget = max(1, int(round(time_budget_sec))) if time_budget_sec > 0 else 0 - if mode in {"quick", "fast"}: - budget_text = f"{budget}s" if budget else "its configured" - reply = ( - f"Quick mode hit {budget_text} time budget before finishing. " - "Try atlas-smart for a deeper answer." - ) - elif mode == "smart": - budget_text = f"{budget}s" if budget else "its configured" - reply = ( - f"Smart mode hit {budget_text} time budget before finishing. " - "Try atlas-genius or ask a narrower follow-up." - ) - else: - reply = "I ran out of time before I could finish this answer." - scores = _default_scores() - except LLMLimitReached: - if not reply: - reply = "I started working on this but hit my reasoning limit. Ask again with 'Run limitless' for a deeper pass." 
- scores = _default_scores() - finally: - elapsed = round(time.monotonic() - started, 2) - log.info( - "atlasbot_answer", - extra={ - "extra": { - "mode": mode, - "seconds": elapsed, - "llm_calls": call_count, - "limit": call_cap, - "limit_hit": limit_hit, - "time_budget_sec": time_budget_sec, - "time_budget_hit": time_budget_hit, - } - }, - ) - - if limit_hit and "run limitless" not in reply.lower(): - reply = reply.rstrip() + "\n\nNote: I hit my reasoning limit. Ask again with 'Run limitless' for a deeper pass." - - if conversation_id and claims: - self._store_state(conversation_id, claims, summary, snapshot_used, pin_snapshot) - - meta = _build_meta( - mode, - call_count, - call_cap, - limit_hit, - time_budget_hit, - time_budget_sec, - classify, - tool_hint, - started, - ) - return AnswerResult(reply, scores, meta) - - async def _answer_stock(self, question: str) -> AnswerResult: - messages = build_messages(prompts.STOCK_SYSTEM, question) - reply = await self._llm.chat(messages, model=self._settings.ollama_model) - return AnswerResult(reply, _default_scores(), {"mode": "stock"}) - - async def _synthesize_answer( # noqa: PLR0913 - self, - question: str, - subanswers: list[str], - context: str, - classify: dict[str, Any], - plan: ModePlan, - call_llm: Callable[..., Any], - ) -> str: - style_hint = _style_hint(classify) - if not subanswers: - prompt = ( - prompts.SYNTHESIZE_PROMPT - + "\nQuestion: " - + question - + "\nStyle: " - + style_hint - + "\nQuestionType: " - + (classify.get("question_type") or "unknown") - ) - return await call_llm(prompts.SYNTHESIZE_SYSTEM, prompt, context=context, model=plan.model, tag="synth") - draft_prompts = [] - for idx in range(plan.drafts): - draft_prompts.append( - prompts.SYNTHESIZE_PROMPT - + "\nQuestion: " - + question - + "\nStyle: " - + style_hint - + "\nQuestionType: " - + (classify.get("question_type") or "unknown") - + "\nSubanswers:\n" - + "\n".join([f"- {item}" for item in subanswers]) - + f"\nDraftIndex: {idx + 1}" - ) - drafts: list[str] = [] - if plan.parallelism > 1 and len(draft_prompts) > 1: - drafts = await _gather_limited( - [ - call_llm( - prompts.SYNTHESIZE_SYSTEM, - prompt, - context=context, - model=plan.model, - tag="synth", - ) - for prompt in draft_prompts - ], - plan.parallelism, - ) - else: - for prompt in draft_prompts: - drafts.append( - await call_llm( - prompts.SYNTHESIZE_SYSTEM, - prompt, - context=context, - model=plan.model, - tag="synth", - ) - ) - if len(drafts) == 1: - return drafts[0] - select_prompt = ( - prompts.DRAFT_SELECT_PROMPT - + "\nQuestion: " - + question - + "\nDrafts:\n" - + "\n\n".join([f"Draft {idx + 1}: {text}" for idx, text in enumerate(drafts)]) - ) - select_raw = await call_llm(prompts.CRITIC_SYSTEM, select_prompt, context=context, model=plan.fast_model, tag="draft_select") - selection = _parse_json_block(select_raw, fallback={}) - idx = int(selection.get("best", 1)) - 1 - if 0 <= idx < len(drafts): - return drafts[idx] - return drafts[0] - - async def _score_answer( - self, - question: str, - reply: str, - plan: ModePlan, - call_llm: Callable[..., Any], - ) -> AnswerScores: - if not plan.use_scores: - return _default_scores() - prompt = prompts.SCORE_PROMPT + "\nQuestion: " + question + "\nAnswer: " + reply - raw = await call_llm(prompts.SCORE_SYSTEM, prompt, model=plan.fast_model, tag="score") - data = _parse_json_block(raw, fallback={}) - return _scores_from_json(data) - - async def _extract_claims( - self, - question: str, - reply: str, - summary: dict[str, Any], - facts_used: list[str], - 
call_llm: Callable[..., Any], - ) -> list[ClaimItem]: - if not reply or not summary: - return [] - summary_json = _json_excerpt(summary) - facts_used = [line.strip() for line in (facts_used or []) if line and line.strip()] - facts_block = "" - if facts_used: - facts_block = "\nFactsUsed:\n" + "\n".join([f"- {line}" for line in facts_used[:12]]) - prompt = prompts.CLAIM_MAP_PROMPT + "\nQuestion: " + question + "\nAnswer: " + reply + facts_block - raw = await call_llm( - prompts.CLAIM_SYSTEM, - prompt, - context=f"SnapshotSummaryJson:{summary_json}", - model=self._settings.ollama_model_fast, - tag="claim_map", - ) - data = _parse_json_block(raw, fallback={}) - claims_raw = data.get("claims") if isinstance(data, dict) else None - claims: list[ClaimItem] = [] - if isinstance(claims_raw, list): - for entry in claims_raw: - if not isinstance(entry, dict): - continue - claim_text = str(entry.get("claim") or "").strip() - claim_id = str(entry.get("id") or "").strip() or f"c{len(claims)+1}" - evidence_items: list[EvidenceItem] = [] - for ev in entry.get("evidence") or []: - if not isinstance(ev, dict): - continue - path = str(ev.get("path") or "").strip() - if not path: - continue - reason = str(ev.get("reason") or "").strip() - value = _resolve_path(summary, path) - evidence_items.append(EvidenceItem(path=path, reason=reason, value=value, value_at_claim=value)) - if claim_text and evidence_items: - claims.append(ClaimItem(id=claim_id, claim=claim_text, evidence=evidence_items)) - return claims - - async def _dedup_reply( - self, - reply: str, - plan: ModePlan, - call_llm: Callable[..., Any], - tag: str, - ) -> str: - if not _needs_dedup(reply): - return reply - dedup_prompt = prompts.DEDUP_PROMPT + "\nDraft: " + reply - return await call_llm(prompts.DEDUP_SYSTEM, dedup_prompt, model=plan.fast_model, tag=tag) - - async def _answer_followup( # noqa: C901, PLR0913 - self, - question: str, - state: ConversationState, - summary: dict[str, Any], - classify: dict[str, Any], - plan: ModePlan, - call_llm: Callable[..., Any], - ) -> str: - claim_ids = await self._select_claims(question, state.claims, plan, call_llm) - selected = [claim for claim in state.claims if claim.id in claim_ids] if claim_ids else state.claims[:2] - evidence_lines = [] - lowered = question.lower() - for claim in selected: - evidence_lines.append(f"Claim: {claim.claim}") - for ev in claim.evidence: - current = _resolve_path(summary, ev.path) - ev.value = current - delta_note = "" - if ev.value_at_claim is not None and current is not None and current != ev.value_at_claim: - delta_note = f" (now {current})" - evidence_lines.append(f"- {ev.path}: {ev.value_at_claim}{delta_note}") - if any(term in lowered for term in ("hotspot", "hot spot", "hottest", "jetson", "rpi", "amd64", "arm64", "hardware", "class")): - hotspot_lines = _hotspot_evidence(summary) - if hotspot_lines: - evidence_lines.append("HotspotSummary:") - evidence_lines.extend(hotspot_lines) - evidence_ctx = "\n".join(evidence_lines) - prompt = prompts.FOLLOWUP_PROMPT + "\nFollow-up: " + question + "\nEvidence:\n" + evidence_ctx - reply = await call_llm(prompts.FOLLOWUP_SYSTEM, prompt, model=plan.model, tag="followup") - allowed_nodes = _allowed_nodes(summary) - allowed_namespaces = _allowed_namespaces(summary) - unknown_nodes = _find_unknown_nodes(reply, allowed_nodes) - unknown_namespaces = _find_unknown_namespaces(reply, allowed_namespaces) - extra_bits = [] - if unknown_nodes: - extra_bits.append("UnknownNodes: " + ", ".join(sorted(unknown_nodes))) - if unknown_namespaces: 
- extra_bits.append("UnknownNamespaces: " + ", ".join(sorted(unknown_namespaces))) - if allowed_nodes: - extra_bits.append("AllowedNodes: " + ", ".join(allowed_nodes)) - if allowed_namespaces: - extra_bits.append("AllowedNamespaces: " + ", ".join(allowed_namespaces)) - if extra_bits: - fix_prompt = ( - prompts.EVIDENCE_FIX_PROMPT - + "\nQuestion: " - + question - + "\nDraft: " - + reply - + "\n" - + "\n".join(extra_bits) - ) - reply = await call_llm( - prompts.EVIDENCE_FIX_SYSTEM, - fix_prompt, - context="Evidence:\n" + evidence_ctx, - model=plan.model, - tag="followup_fix", - ) - reply = await self._dedup_reply(reply, plan, call_llm, tag="dedup_followup") - reply = _strip_followup_meta(reply) - return reply - - async def _select_claims( - self, - question: str, - claims: list[ClaimItem], - plan: ModePlan, - call_llm: Callable[..., Any], - ) -> list[str]: - if not claims: - return [] - claims_brief = [{"id": claim.id, "claim": claim.claim} for claim in claims] - prompt = prompts.SELECT_CLAIMS_PROMPT + "\nFollow-up: " + question + "\nClaims: " + json.dumps(claims_brief) - raw = await call_llm(prompts.FOLLOWUP_SYSTEM, prompt, model=plan.fast_model, tag="select_claims") - data = _parse_json_block(raw, fallback={}) - ids = data.get("claim_ids") if isinstance(data, dict) else [] - if isinstance(ids, list): - return [str(item) for item in ids if item] - return [] - - def _get_state(self, conversation_id: str | None) -> ConversationState | None: - if not conversation_id: - return None - state_payload = self._store.get(conversation_id) - return _state_from_payload(state_payload) if state_payload else None - - def _store_state( - self, - conversation_id: str, - claims: list[ClaimItem], - summary: dict[str, Any], - snapshot: dict[str, Any] | None, - pin_snapshot: bool, - ) -> None: - snapshot_id = _snapshot_id(summary) - pinned_snapshot = snapshot if pin_snapshot else None - payload = { - "updated_at": time.monotonic(), - "claims": _claims_to_payload(claims), - "snapshot_id": snapshot_id, - "snapshot": pinned_snapshot, - } - self._store.set(conversation_id, payload) - - def _cleanup_state(self) -> None: - self._store.cleanup() - - -def _strip_followup_meta(reply: str) -> str: - cleaned = reply.strip() - if not cleaned: - return cleaned - prefixes = [ - "The draft is correct based on the provided context.", - "The draft is correct based on the context.", - "The draft is correct based on the provided evidence.", - "The draft is correct.", - "Based on the provided context,", - "Based on the context,", - "Based on the provided evidence,", - ] - for prefix in prefixes: - if cleaned.lower().startswith(prefix.lower()): - cleaned = cleaned[len(prefix) :].lstrip(" .") - break - return cleaned - - -def _build_meta( # noqa: PLR0913 - mode: str, - call_count: int, - call_cap: int, - limit_hit: bool, - time_budget_hit: bool, - time_budget_sec: float, - classify: dict[str, Any], - tool_hint: dict[str, Any] | None, - started: float, -) -> dict[str, Any]: - return { - "mode": mode, - "llm_calls": call_count, - "llm_limit": call_cap, - "llm_limit_hit": limit_hit, - "time_budget_sec": time_budget_sec, - "time_budget_hit": time_budget_hit, - "classify": classify, - "tool_hint": tool_hint, - "elapsed_sec": round(time.monotonic() - started, 2), - } - - -def _mode_plan(settings: Settings, mode: str) -> ModePlan: - if mode == "genius": - return ModePlan( - model=settings.ollama_model_genius, - fast_model=settings.ollama_model_fast, - max_subquestions=6, - chunk_lines=6, - chunk_top=10, - chunk_group=4, - 
kb_max_chars=200000, - kb_max_files=200, - use_raw_snapshot=True, - parallelism=4, - score_retries=3, - use_deep_retrieval=True, - use_tool=True, - use_critic=True, - use_gap=True, - use_scores=True, - drafts=2, - metric_retries=3, - subanswer_retries=3, - ) - if mode == "smart": - return ModePlan( - model=settings.ollama_model_smart, - fast_model=settings.ollama_model_fast, - max_subquestions=4, - chunk_lines=8, - chunk_top=8, - chunk_group=4, - kb_max_chars=3000, - kb_max_files=12, - use_raw_snapshot=False, - parallelism=2, - score_retries=2, - use_deep_retrieval=True, - use_tool=True, - use_critic=True, - use_gap=True, - use_scores=True, - drafts=1, - metric_retries=2, - subanswer_retries=2, - ) - return ModePlan( - model=settings.ollama_model_fast, - fast_model=settings.ollama_model_fast, - max_subquestions=1, - chunk_lines=16, - chunk_top=3, - chunk_group=5, - kb_max_chars=800, - kb_max_files=4, - use_raw_snapshot=False, - parallelism=1, - score_retries=1, - use_deep_retrieval=False, - use_tool=False, - use_critic=False, - use_gap=False, - use_scores=False, - drafts=1, - metric_retries=1, - subanswer_retries=1, - ) - - -def _llm_call_limit(settings: Settings, mode: str) -> int: - if mode == "genius": - return settings.genius_llm_calls_max - if mode == "smart": - return settings.smart_llm_calls_max - return settings.fast_llm_calls_max - - -def _mode_time_budget(settings: Settings, mode: str) -> float: - if mode == "genius": - return max(0.0, settings.genius_time_budget_sec) - if mode == "smart": - return max(0.0, settings.smart_time_budget_sec) - return max(0.0, settings.quick_time_budget_sec) - - -def _select_subquestions(parts: list[dict[str, Any]], fallback: str, limit: int) -> list[str]: - if not parts: - return [fallback] - ranked = [] - for entry in parts: - if not isinstance(entry, dict): - continue - question = str(entry.get("question") or "").strip() - if not question: - continue - priority = entry.get("priority") - try: - weight = float(priority) - except (TypeError, ValueError): - weight = 1.0 - ranked.append((weight, question)) - ranked.sort(key=lambda item: item[0], reverse=True) - questions = [item[1] for item in ranked][:limit] - return questions or [fallback] - - -def _chunk_lines(lines: list[str], lines_per_chunk: int) -> list[dict[str, Any]]: - chunks: list[dict[str, Any]] = [] - if not lines: - return chunks - for idx in range(0, len(lines), lines_per_chunk): - chunk_lines = lines[idx : idx + lines_per_chunk] - text = "\n".join(chunk_lines) - summary = " | ".join(chunk_lines[:4]) - chunks.append({"id": f"c{idx//lines_per_chunk}", "text": text, "summary": summary}) - return chunks - - -def _raw_snapshot_chunks(snapshot: dict[str, Any] | None) -> list[dict[str, Any]]: - if not isinstance(snapshot, dict) or not snapshot: - return [] - chunks: list[dict[str, Any]] = [] - for key, value in snapshot.items(): - try: - payload = json.dumps({key: value}, indent=2) - except Exception: - continue - summary = f"raw:{key}" - chunks.append({"id": f"r{key}", "text": payload, "summary": summary}) - return chunks - - -def _build_chunk_groups(chunks: list[dict[str, Any]], group_size: int) -> list[list[dict[str, Any]]]: - groups: list[list[dict[str, Any]]] = [] - group: list[dict[str, Any]] = [] - for chunk in chunks: - group.append({"id": chunk["id"], "summary": chunk["summary"]}) - if len(group) >= group_size: - groups.append(group) - group = [] - if group: - groups.append(group) - return groups - - -async def _score_chunks( - call_llm: Callable[..., Any], - chunks: list[dict[str, 
Any]], - question: str, - sub_questions: list[str], - plan: ModePlan, -) -> dict[str, float]: - scores: dict[str, float] = {chunk["id"]: 0.0 for chunk in chunks} - if not chunks: - return scores - groups = _build_chunk_groups(chunks, plan.chunk_group) - ctx = ScoreContext( - question=question, - sub_questions=sub_questions, - retries=max(1, plan.score_retries), - parallelism=plan.parallelism, - select_best=plan.score_retries > 1, - fast_model=plan.fast_model, - ) - if ctx.parallelism <= 1 or len(groups) * ctx.retries <= 1: - return await _score_groups_serial(call_llm, groups, ctx) - return await _score_groups_parallel(call_llm, groups, ctx) - - -async def _score_groups_serial( - call_llm: Callable[..., Any], - groups: list[list[dict[str, Any]]], - ctx: ScoreContext, -) -> dict[str, float]: - scores: dict[str, float] = {} - for grp in groups: - runs = [await _score_chunk_group(call_llm, grp, ctx.question, ctx.sub_questions) for _ in range(ctx.retries)] - if ctx.select_best and len(runs) > 1: - best = await _select_best_score_run(call_llm, grp, runs, ctx) - scores.update(best) - else: - scores.update(_merge_score_runs(runs)) - return scores - - -async def _score_groups_parallel( - call_llm: Callable[..., Any], - groups: list[list[dict[str, Any]]], - ctx: ScoreContext, -) -> dict[str, float]: - coros: list[Awaitable[tuple[int, dict[str, float]]]] = [] - for idx, grp in enumerate(groups): - for _ in range(ctx.retries): - coros.append(_score_chunk_group_run(call_llm, idx, grp, ctx.question, ctx.sub_questions)) - results = await _gather_limited(coros, ctx.parallelism) - grouped: dict[int, list[dict[str, float]]] = {} - for idx, result in results: - grouped.setdefault(idx, []).append(result) - scores: dict[str, float] = {} - for idx, runs in grouped.items(): - if ctx.select_best and len(runs) > 1: - group = groups[idx] - best = await _select_best_score_run(call_llm, group, runs, ctx) - scores.update(best) - else: - scores.update(_merge_score_runs(runs)) - return scores - - -async def _score_chunk_group( - call_llm: Callable[..., Any], - group: list[dict[str, Any]], - question: str, - sub_questions: list[str], -) -> dict[str, float]: - prompt = ( - prompts.CHUNK_SCORE_PROMPT - + "\nQuestion: " - + question - + "\nSubQuestions: " - + json.dumps(sub_questions) - + "\nChunks: " - + json.dumps(group) - ) - raw = await call_llm(prompts.RETRIEVER_SYSTEM, prompt, model=None, tag="chunk_score") - data = _parse_json_list(raw) - scored: dict[str, float] = {} - for entry in data: - if not isinstance(entry, dict): - continue - cid = str(entry.get("id") or "").strip() - if not cid: - continue - try: - score = float(entry.get("score") or 0) - except (TypeError, ValueError): - score = 0.0 - scored[cid] = score - return scored - - -async def _score_chunk_group_run( - call_llm: Callable[..., Any], - idx: int, - group: list[dict[str, Any]], - question: str, - sub_questions: list[str], -) -> tuple[int, dict[str, float]]: - return idx, await _score_chunk_group(call_llm, group, question, sub_questions) - - -def _merge_score_runs(runs: list[dict[str, float]]) -> dict[str, float]: - if not runs: - return {} - totals: dict[str, float] = {} - counts: dict[str, int] = {} - for run in runs: - for key, value in run.items(): - totals[key] = totals.get(key, 0.0) + float(value) - counts[key] = counts.get(key, 0) + 1 - return {key: totals[key] / counts[key] for key in totals} - - -async def _select_best_score_run( - call_llm: Callable[..., Any], - group: list[dict[str, Any]], - runs: list[dict[str, float]], - ctx: ScoreContext, 
-) -> dict[str, float]: - if not runs: - return {} - prompt = ( - prompts.RETRIEVER_SELECT_PROMPT - + "\nQuestion: " - + ctx.question - + "\nSubQuestions: " - + json.dumps(ctx.sub_questions) - + "\nChunks: " - + json.dumps(group) - + "\nRuns: " - + json.dumps(runs) - ) - raw = await call_llm(prompts.RETRIEVER_SELECT_SYSTEM, prompt, model=ctx.fast_model, tag="chunk_select") - data = parse_json(raw) - idx = 0 - if isinstance(data, dict): - try: - idx = int(data.get("selected_index") or 0) - except (TypeError, ValueError): - idx = 0 - if idx < 0 or idx >= len(runs): - idx = 0 - return runs[idx] - - -def _keyword_hits( - ranked: list[dict[str, Any]], - head: dict[str, Any], - keywords: list[str] | None, -) -> list[dict[str, Any]]: - if not keywords: - return [] - lowered = [kw.lower() for kw in keywords if isinstance(kw, str) and kw.strip()] - if not lowered: - return [] - hits: list[dict[str, Any]] = [] - for item in ranked: - if item is head: - continue - text = str(item.get("text") or "").lower() - if any(kw in text for kw in lowered): - hits.append(item) - return hits - - -def _select_chunks( - chunks: list[dict[str, Any]], - scores: dict[str, float], - plan: ModePlan, - keywords: list[str] | None = None, - must_ids: list[str] | None = None, -) -> list[dict[str, Any]]: - if not chunks: - return [] - ranked = sorted(chunks, key=lambda item: scores.get(item["id"], 0.0), reverse=True) - selected: list[dict[str, Any]] = [chunks[0]] - if _append_must_chunks(chunks, selected, must_ids, plan.chunk_top): - return selected - if _append_keyword_chunks(ranked, selected, keywords, plan.chunk_top): - return selected - _append_ranked_chunks(ranked, selected, plan.chunk_top) - return selected - - -def _append_must_chunks( - chunks: list[dict[str, Any]], - selected: list[dict[str, Any]], - must_ids: list[str] | None, - limit: int, -) -> bool: - if not must_ids: - return False - id_map = {item["id"]: item for item in chunks} - for cid in must_ids: - item = id_map.get(cid) - if item and item not in selected: - selected.append(item) - if len(selected) >= limit: - return True - return False - - -def _append_keyword_chunks( - ranked: list[dict[str, Any]], - selected: list[dict[str, Any]], - keywords: list[str] | None, - limit: int, -) -> bool: - if not ranked: - return False - head = ranked[0] - for item in _keyword_hits(ranked, head, keywords): - if item not in selected: - selected.append(item) - if len(selected) >= limit: - return True - return False - - -def _append_ranked_chunks( - ranked: list[dict[str, Any]], - selected: list[dict[str, Any]], - limit: int, -) -> None: - for item in ranked: - if len(selected) >= limit: - break - if item not in selected: - selected.append(item) - - -def _format_runbooks(runbooks: list[str]) -> str: - if not runbooks: - return "" - return "Relevant runbooks:\n" + "\n".join([f"- {item}" for item in runbooks]) - - -def _join_context(parts: list[str]) -> str: - text = "\n".join([part for part in parts if part]) - return text.strip() - - -def _format_history(history: list[dict[str, str]] | None) -> str: - if not history: - return "" - lines = ["Recent conversation (non-authoritative):"] - for entry in history[-4:]: - if not isinstance(entry, dict): - continue - question = entry.get("q") - answer = entry.get("a") - role = entry.get("role") - content = entry.get("content") - if question: - lines.append(f"Q: {question}") - if answer: - lines.append(f"A: {answer}") - if role and content: - prefix = "Q" if role == "user" else "A" - lines.append(f"{prefix}: {content}") - return 
"\n".join(lines) - - -def _summary_lines(snapshot: dict[str, Any] | None) -> list[str]: - text = summary_text(snapshot) - if not text: - return [] - return [line for line in text.splitlines() if line.strip()] - - -def _line_starting_with(lines: list[str], prefix: str) -> str | None: - if not lines: - return None - for line in lines: - if line.lower().startswith(prefix.lower()): - return line - return None - - -def _spine_lines(lines: list[str]) -> dict[str, str]: - spine: dict[str, str] = {} - _spine_nodes(lines, spine) - _spine_hardware(lines, spine) - _spine_hottest(lines, spine) - _spine_postgres(lines, spine) - _spine_namespaces(lines, spine) - _spine_pressure(lines, spine) - return spine - - -def _spine_nodes(lines: list[str], spine: dict[str, str]) -> None: - nodes_line = _line_starting_with(lines, "nodes:") - if nodes_line: - spine["nodes_count"] = nodes_line - spine["nodes_ready"] = nodes_line - return - nodes_total = _line_starting_with(lines, "nodes_total:") - nodes_ready = _line_starting_with(lines, "nodes_ready:") - if nodes_total: - spine["nodes_count"] = nodes_total - if nodes_ready: - spine["nodes_ready"] = nodes_ready - - -def _spine_hardware(lines: list[str], spine: dict[str, str]) -> None: - hardware_line = _line_starting_with(lines, "hardware_nodes:") - if not hardware_line: - hardware_line = _line_starting_with(lines, "hardware:") - if hardware_line: - spine["nodes_non_rpi"] = hardware_line - - -def _spine_hottest(lines: list[str], spine: dict[str, str]) -> None: - hottest_line = _line_starting_with(lines, "hottest:") - if not hottest_line: - return - for key in ("hottest_cpu", "hottest_ram", "hottest_net", "hottest_io", "hottest_disk"): - spine[key] = hottest_line - - -def _spine_postgres(lines: list[str], spine: dict[str, str]) -> None: - postgres_total = _line_starting_with(lines, "postgres_connections_total:") - if postgres_total: - spine["postgres_connections"] = postgres_total - postgres_line = _line_starting_with(lines, "postgres:") - if postgres_line: - spine["postgres_hottest"] = postgres_line - - -def _spine_namespaces(lines: list[str], spine: dict[str, str]) -> None: - namespaces_top = _line_starting_with(lines, "namespaces_top:") - if namespaces_top: - spine["namespace_most_pods"] = namespaces_top - - -def _spine_pressure(lines: list[str], spine: dict[str, str]) -> None: - pressure_line = _line_starting_with(lines, "pressure_nodes:") - if pressure_line: - spine["pressure_summary"] = pressure_line - return - load_line = _line_starting_with(lines, "node_load_top:") - if load_line: - spine["pressure_summary"] = load_line - - -def _parse_group_line(line: str) -> dict[str, list[str]]: - groups: dict[str, list[str]] = {} - if not line: - return groups - payload = line.split(":", 1)[1] if ":" in line else line - for part in payload.split(";"): - part = part.strip() - if not part or "=" not in part: - continue - key, value = part.split("=", 1) - value = value.strip() - nodes: list[str] = [] - if "(" in value and ")" in value: - inner = value[value.find("(") + 1 : value.rfind(")")] - nodes = [item.strip() for item in inner.split(",") if item.strip()] - if not nodes: - cleaned = re.sub(r"^[0-9]+", "", value).strip() - nodes = [item.strip() for item in cleaned.split(",") if item.strip()] - groups[key.strip()] = nodes - return groups - - -def _parse_hottest(line: str, metric: str) -> str | None: - if not line: - return None - payload = line.split(":", 1)[1] if ":" in line else line - for part in payload.split(";"): - part = part.strip() - if 
part.startswith(f"{metric}="): - return part - return None - - -def _spine_answer(intent: IntentMatch, spine_line: str | None) -> str | None: - if not spine_line: - return None - handlers = { - "nodes_count": _spine_nodes_answer, - "nodes_ready": _spine_nodes_answer, - "nodes_non_rpi": _spine_non_rpi_answer, - "hardware_mix": _spine_hardware_answer, - "postgres_connections": _spine_postgres_answer, - "postgres_hottest": _spine_postgres_answer, - "namespace_most_pods": _spine_namespace_answer, - "pressure_summary": _spine_pressure_answer, - } - kind = intent.kind - if kind.startswith("hottest_"): - return _spine_hottest_answer(kind, spine_line) - handler = handlers.get(kind) - if handler: - return handler(spine_line) - return spine_line - - -def _spine_nodes_answer(line: str) -> str: - return line - - -def _spine_non_rpi_answer(line: str) -> str: - groups = _parse_group_line(line) - non_rpi: list[str] = [] - for key, nodes in groups.items(): - if key.lower().startswith("rpi"): - continue - non_rpi.extend(nodes) - if non_rpi: - return "Non‑Raspberry Pi nodes: " + ", ".join(non_rpi) + "." - return line - - -def _spine_hardware_answer(line: str) -> str: - return line - - -def _spine_hottest_answer(kind: str, line: str) -> str: - metric = kind.split("_", 1)[1] - hottest = _parse_hottest(line, metric) - if hottest: - return hottest - return line - - -def _spine_postgres_answer(line: str) -> str: - return line - - -def _spine_namespace_answer(line: str) -> str: - payload = line.split(":", 1)[1] if ":" in line else line - top = payload.split(";")[0].strip() - if top: - return f"Namespace with most pods: {top}." - return line - - -def _spine_pressure_answer(line: str) -> str: - return line - - -def _spine_from_summary(summary: dict[str, Any]) -> dict[str, str]: - if not isinstance(summary, dict) or not summary: - return {} - spine: dict[str, str] = {} - spine.update(_spine_from_counts(summary)) - spine.update(_spine_from_hardware(summary)) - spine.update(_spine_from_hottest(summary)) - spine.update(_spine_from_postgres(summary)) - spine.update(_spine_from_namespace_pods(summary)) - spine.update(_spine_from_pressure(summary)) - return spine - - -def _spine_from_counts(summary: dict[str, Any]) -> dict[str, str]: - counts = summary.get("counts") if isinstance(summary.get("counts"), dict) else {} - inventory = summary.get("inventory") if isinstance(summary.get("inventory"), dict) else {} - nodes = summary.get("nodes") if isinstance(summary.get("nodes"), dict) else {} - workers = inventory.get("workers") if isinstance(inventory.get("workers"), dict) else {} - total = nodes.get("total") - ready = nodes.get("ready") - not_ready = nodes.get("not_ready") - if total is None: - total = counts.get("nodes_total") - if ready is None: - ready = counts.get("nodes_ready") - if not_ready is None and isinstance(inventory.get("not_ready_names"), list): - not_ready = len(inventory.get("not_ready_names") or []) - workers_ready = workers.get("ready") - workers_total = workers.get("total") - if total is None and ready is None and not_ready is None: - return {} - parts = [] - if total is not None: - parts.append(f"total={int(total)}") - if ready is not None: - parts.append(f"ready={int(ready)}") - if not_ready is not None: - parts.append(f"not_ready={int(not_ready)}") - if workers_total is not None and workers_ready is not None: - parts.append(f"workers_ready={int(workers_ready)}/{int(workers_total)}") - line = "nodes: " + ", ".join(parts) - return {"nodes_count": line, "nodes_ready": line} - - -def 
_spine_from_hardware(summary: dict[str, Any]) -> dict[str, str]: - hardware = summary.get("hardware") if isinstance(summary.get("hardware"), dict) else {} - if not hardware: - return {} - parts = [] - for key, nodes in hardware.items(): - if not isinstance(nodes, list): - continue - node_list = ", ".join(str(n) for n in nodes if n) - if node_list: - parts.append(f"{key} ({node_list})") - if not parts: - return {} - return {"nodes_non_rpi": "hardware: " + "; ".join(parts)} - - -def _spine_from_hottest(summary: dict[str, Any]) -> dict[str, str]: - hottest = summary.get("hottest") if isinstance(summary.get("hottest"), dict) else {} - top = summary.get("top") if isinstance(summary.get("top"), dict) else {} - top_hottest = top.get("node_hottest") if isinstance(top.get("node_hottest"), dict) else {} - if not hottest and top_hottest: - hottest = top_hottest - elif top_hottest: - for key, value in top_hottest.items(): - if key not in hottest and value is not None: - hottest[key] = value - if not hottest: - return {} - mapping = {} - for key in ("cpu", "ram", "net", "io", "disk"): - entry = hottest.get(key) - if not isinstance(entry, dict): - continue - node = entry.get("node") or entry.get("label") or "" - value = entry.get("value") - if node: - mapping[f"hottest_{key}"] = f"{key}={node} ({_format_metric_value(value)})" - if not mapping: - return {} - return mapping - - -def _spine_from_postgres(summary: dict[str, Any]) -> dict[str, str]: - postgres = summary.get("postgres") if isinstance(summary.get("postgres"), dict) else {} - if not postgres: - top = summary.get("top") if isinstance(summary.get("top"), dict) else {} - postgres = top.get("postgres") if isinstance(top.get("postgres"), dict) else {} - if not postgres: - return {} - used = postgres.get("used") - max_conn = postgres.get("max") - hottest = postgres.get("hottest_db") if isinstance(postgres.get("hottest_db"), dict) else {} - hottest_label = hottest.get("label") or "" - facts: dict[str, str] = {} - if used is not None and max_conn is not None: - facts["postgres_connections"] = f"postgres_connections_total: used={int(used)}, max={int(max_conn)}" - if hottest_label: - facts["postgres_hottest"] = f"postgres_hottest_db: {hottest_label}" - return facts - - -def _spine_from_namespace_pods(summary: dict[str, Any]) -> dict[str, str]: - pods = summary.get("namespace_pods") if isinstance(summary.get("namespace_pods"), list) else [] - if not pods: - top = summary.get("top") if isinstance(summary.get("top"), dict) else {} - pods = top.get("namespace_pods") if isinstance(top.get("namespace_pods"), list) else [] - if not pods: - return {} - best_name = "" - best_value = None - for entry in pods: - if not isinstance(entry, dict): - continue - name = entry.get("namespace") or entry.get("name") or entry.get("label") or "" - value = entry.get("pods") or entry.get("value") - try: - numeric = float(value) - except (TypeError, ValueError): - numeric = None - if name and numeric is not None and (best_value is None or numeric > best_value): - best_name = name - best_value = numeric - if best_name: - return {"namespace_most_pods": f"namespace_most_pods: {best_name} ({int(best_value or 0)} pods)"} - return {} - - -def _spine_from_pressure(summary: dict[str, Any]) -> dict[str, str]: - pressure = summary.get("pressure_summary") if isinstance(summary.get("pressure_summary"), dict) else {} - if not pressure: - return {} - total = pressure.get("total") - unsched = pressure.get("unschedulable") - parts = [] - if total is not None: - 
parts.append(f"total={int(total)}") - if unsched is not None: - parts.append(f"unschedulable={int(unsched)}") - if parts: - return {"pressure_summary": "pressure_nodes: " + ", ".join(parts)} - return {} - - -def _spine_fallback(intent: IntentMatch, lines: list[str]) -> str | None: - if not lines: - return None - keywords = { - "postgres_hottest": ("postgres_hottest", "hottest_db", "postgres"), - "namespace_most_pods": ("namespace", "pods", "namespaces_top"), - "pressure_summary": ("pressure", "node_load_top"), - } - for token in keywords.get(intent.kind, ("",)): - if not token: - continue - for line in lines: - if token in line: - return line - return None - - -def _format_metric_value(value: Any) -> str: - try: - num = float(value) - except (TypeError, ValueError): - return str(value) - if num >= BYTES_MB: - return f"{num / BYTES_MB:.2f} MB/s" - if num >= BYTES_KB: - return f"{num / BYTES_KB:.2f} KB/s" - if num >= 1: - return f"{num:.2f}" - return f"{num:.4f}" - - -async def _select_metric_chunks( - call_llm: Callable[..., Awaitable[str]], - ctx: dict[str, Any], - chunks: list[dict[str, Any]], - plan: ModePlan, -) -> tuple[list[str], list[str]]: - summary_lines, question, sub_questions, keywords, token_set = _metric_ctx_values(ctx) - if not summary_lines or not chunks: - return [], [] - keys = _extract_metric_keys(summary_lines) - if not keys: - return [], [] - max_keys = max(4, plan.max_subquestions * 2) - candidate_keys = _filter_metric_keys(keys, token_set) - available_keys = candidate_keys or keys - prompt = prompts.METRIC_KEYS_PROMPT.format(available="\n".join(available_keys), max_keys=max_keys) - raw = await call_llm( - prompts.METRIC_KEYS_SYSTEM, - prompt + "\nQuestion: " + str(question) + "\nSubQuestions:\n" + "\n".join([str(item) for item in sub_questions]), - context="Keywords:\n" + ", ".join([str(item) for item in keywords if item]), - model=plan.fast_model, - tag="metric_keys", - ) - selected = _parse_key_list(raw, available_keys, max_keys) - if candidate_keys: - selected = _merge_metric_keys(selected, candidate_keys, max_keys) - if selected and candidate_keys and not _metric_key_overlap(selected, token_set): - selected = candidate_keys[:max_keys] - if not selected and candidate_keys: - selected = candidate_keys[:max_keys] - if available_keys: - missing = await _validate_metric_keys( - call_llm, - { - "question": question, - "sub_questions": sub_questions, - "selected": selected, - }, - available_keys, - plan, - ) - if missing: - selected = _merge_metric_keys(selected, missing, max_keys) - if not selected: - return [], [] - ids = _chunk_ids_for_keys(chunks, selected) - return selected, ids - - -async def _validate_metric_keys( - call_llm: Callable[..., Awaitable[str]], - ctx: dict[str, Any], - available: list[str], - plan: ModePlan, -) -> list[str]: - if not available: - return [] - question = str(ctx.get("question") or "") - sub_questions = ctx.get("sub_questions") if isinstance(ctx.get("sub_questions"), list) else [] - selected = ctx.get("selected") if isinstance(ctx.get("selected"), list) else [] - cap = max(12, plan.max_subquestions * 4) - available_list = available[:cap] - prompt = prompts.METRIC_KEYS_VALIDATE_PROMPT.format( - question=question, - sub_questions=json.dumps(sub_questions), - selected=json.dumps(selected), - available="\n".join(available_list), - ) - raw = await call_llm( - prompts.METRIC_KEYS_VALIDATE_SYSTEM, - prompt, - model=plan.fast_model, - tag="metric_keys_validate", - ) - parsed = _parse_json_block(raw, fallback={}) - items = parsed.get("missing") if 
isinstance(parsed, dict) else [] - if not isinstance(items, list): - return [] - available_set = set(available_list) - out: list[str] = [] - for item in items: - if isinstance(item, str) and item in available_set and item not in out: - out.append(item) - return out - - -async def _gather_limited(coros: list[Awaitable[Any]], limit: int) -> list[Any]: - if not coros: - return [] - semaphore = asyncio.Semaphore(max(1, limit)) - - async def runner(coro: Awaitable[Any]) -> Any: - async with semaphore: - return await coro - - return await asyncio.gather(*(runner(coro) for coro in coros)) - - -def _metric_ctx_values(ctx: dict[str, Any]) -> tuple[list[str], str, list[str], list[str], set[str]]: - summary_lines = ctx.get("summary_lines") if isinstance(ctx, dict) else None - if not isinstance(summary_lines, list): - return [], "", [], [], set() - question = ctx.get("question") if isinstance(ctx, dict) else "" - sub_questions = ctx.get("sub_questions") if isinstance(ctx, dict) else [] - keywords = ctx.get("keywords") if isinstance(ctx, dict) else [] - keyword_tokens = ctx.get("keyword_tokens") if isinstance(ctx, dict) else [] - token_set = {str(token).lower() for token in keyword_tokens if token} - token_set |= {token.lower() for token in _extract_keywords(str(question), str(question), sub_questions=sub_questions, keywords=keywords)} - token_set = _token_variants(token_set) - return summary_lines, str(question), sub_questions, keywords, token_set - - -def _extract_metric_keys(lines: list[str]) -> list[str]: - keys: list[str] = [] - for line in lines: - if ":" not in line: - continue - key = line.split(":", 1)[0].strip() - if not key or " " in key: - continue - if key not in keys: - keys.append(key) - return keys - - -def _token_variants(tokens: set[str]) -> set[str]: - if not tokens: - return set() - variants = set(tokens) - for token in list(tokens): - if len(token) <= TOKEN_MIN_LEN: - continue - if token.endswith("ies") and len(token) > TOKEN_MIN_LEN: - variants.add(token[:-3] + "y") - if token.endswith("es") and len(token) > TOKEN_MIN_LEN: - variants.add(token[:-2]) - if token.endswith("s") and len(token) > TOKEN_MIN_LEN: - variants.add(token[:-1]) - return variants - - -def _parse_key_list(raw: str, allowed: list[str], max_keys: int) -> list[str]: - parsed = _parse_json_block(raw, fallback={}) - if isinstance(parsed, list): - items = parsed - else: - items = parsed.get("keys") if isinstance(parsed, dict) else [] - if not isinstance(items, list): - return [] - allowed_set = set(allowed) - out: list[str] = [] - for item in items: - if not isinstance(item, str): - continue - if item in allowed_set and item not in out: - out.append(item) - if len(out) >= max_keys: - break - return out - - -def _chunk_ids_for_keys(chunks: list[dict[str, Any]], keys: list[str]) -> list[str]: - if not keys: - return [] - ids: list[str] = [] - key_set = {f"{key}:" for key in keys} - for chunk in chunks: - text = str(chunk.get("text") or "") - if not text: - continue - for line in text.splitlines(): - for key in key_set: - if line.startswith(key): - cid = chunk.get("id") - if cid and cid not in ids: - ids.append(cid) - break - return ids - - -def _filter_metric_keys(keys: list[str], tokens: set[str]) -> list[str]: - if not keys or not tokens: - return [] - lowered_tokens = {token.lower() for token in tokens if token and len(token) >= TOKEN_MIN_LEN} - ranked: list[tuple[int, str]] = [] - for key in keys: - parts = [part for part in re.split(r"[_\\W]+", key.lower()) if part] - if not parts: - continue - hits = 
len(set(parts) & lowered_tokens)
-        if hits:
-            ranked.append((hits, key))
-    ranked.sort(key=lambda item: (-item[0], item[1]))
-    return [item[1] for item in ranked]
-
-
-def _metric_key_overlap(keys: list[str], tokens: set[str]) -> bool:
-    if not keys or not tokens:
-        return False
-    lowered_tokens = {token.lower() for token in tokens if token and len(token) >= TOKEN_MIN_LEN}
-    for key in keys:
-        parts = [part for part in re.split(r"[_\W]+", key.lower()) if part]
-        if set(parts) & lowered_tokens:
-            return True
-    return False
-
-
-def _lines_for_metric_keys(lines: list[str], keys: list[str], max_lines: int = 0) -> list[str]:
-    if not lines or not keys:
-        return []
-    prefixes = {f"{key}:" for key in keys}
-    selected: list[str] = []
-    for line in lines:
-        for prefix in prefixes:
-            if prefix in line:
-                selected.append(line)
-                break
-        if max_lines and len(selected) >= max_lines:
-            break
-    return selected
-
-
-def _merge_metric_keys(current: list[str], candidates: list[str], max_keys: int) -> list[str]:
-    merged: list[str] = []
-    seen = set()
-    for key in current:
-        if key and key not in seen:
-            merged.append(key)
-            seen.add(key)
-    for key in candidates:
-        if key and key not in seen:
-            merged.append(key)
-            seen.add(key)
-        if len(merged) >= max_keys:
-            break
-    return merged[:max_keys]
-
-
-def _merge_fact_lines(primary: list[str], fallback: list[str]) -> list[str]:
-    seen = set()
-    merged: list[str] = []
-    for line in primary + fallback:
-        if line in seen:
-            continue
-        seen.add(line)
-        merged.append(line)
-    return merged
-
-
-def _expand_hottest_line(line: str) -> list[str]:
-    if not line:
-        return []
-    if not line.lower().startswith("hottest:"):
-        return []
-    expanded: list[str] = []
-    payload = line.split("hottest:", 1)[1]
-    for part in payload.split(";"):
-        part = part.strip()
-        if not part or "=" not in part:
-            continue
-        metric, rest = part.split("=", 1)
-        metric = metric.strip()
-        match = re.search(r"(?P<node>[^\s\[]+).*\((?P<value>[^)]+)\)", rest)
-        if not match:
-            continue
-        node = match.group("node").strip()
-        value = match.group("value").strip()
-        class_match = re.search(r"\[(?P<class>[^\]]+)\]", rest)
-        node_class = class_match.group("class").strip() if class_match else ""
-        if node_class:
-            expanded.append(f"hottest_{metric}_node: {node} [{node_class}] ({value})")
-        else:
-            expanded.append(f"hottest_{metric}_node: {node} ({value})")
-    return expanded
-
-
-def _has_token(text: str, token: str) -> bool:
-    if not text or not token:
-        return False
-    if token == "io":
-        return "i/o" in text or re.search(r"\bio\b", text) is not None
-    return re.search(rf"\b{re.escape(token)}\b", text) is not None
-
-
-def _hotspot_evidence(summary: dict[str, Any]) -> list[str]:
-    hottest = summary.get("hottest") if isinstance(summary.get("hottest"), dict) else {}
-    if not hottest:
-        return []
-    hardware_by_node = summary.get("hardware_by_node") if isinstance(summary.get("hardware_by_node"), dict) else {}
-    node_pods_top = summary.get("node_pods_top") if isinstance(summary.get("node_pods_top"), list) else []
-    ns_map = {}
-    for item in node_pods_top:
-        if not isinstance(item, dict):
-            continue
-        node = item.get("node")
-        namespaces_top = item.get("namespaces_top") if isinstance(item.get("namespaces_top"), list) else []
-        ns_map[node] = namespaces_top
-    lines: list[str] = []
-    for metric, info in hottest.items():
-        if not isinstance(info, dict):
-            continue
-        node = info.get("node")
-        value = info.get("value")
-        if not node:
-            continue
-        node_class = hardware_by_node.get(node)
-        ns_parts = []
-        for entry in ns_map.get(node,
[])[:3]:
-            if isinstance(entry, (list, tuple)) and len(entry) >= NS_ENTRY_MIN_LEN:
-                ns_parts.append(f"{entry[0]}={entry[1]}")
-        ns_text = ", ".join(ns_parts)
-        value_text = f"{value:.2f}" if isinstance(value, (int, float)) else str(value)
-        line = f"hotspot.{metric}: node={node} class={node_class or 'unknown'} value={value_text}"
-        if ns_text:
-            line += f" namespaces_top={ns_text}"
-        lines.append(line)
-    return lines
-
-
-def _metric_key_tokens(summary_lines: list[str]) -> set[str]:
-    tokens: set[str] = set()
-    for line in summary_lines:
-        if not isinstance(line, str) or ":" not in line:
-            continue
-        key = line.split(":", 1)[0].strip().lower()
-        if not key:
-            continue
-        tokens.add(key)
-        for part in re.split(r"[_\s]+", key):
-            if part:
-                tokens.add(part)
-    return tokens
-
-
-async def _select_best_candidate(
-    call_llm: Callable[..., Any],
-    question: str,
-    candidates: list[str],
-    plan: ModePlan,
-    tag: str,
-) -> int:
-    if len(candidates) <= 1:
-        return 0
-    prompt = (
-        prompts.CANDIDATE_SELECT_PROMPT
-        + "\nQuestion: "
-        + question
-        + "\nCandidates:\n"
-        + "\n".join([f"{idx+1}) {cand}" for idx, cand in enumerate(candidates)])
-    )
-    raw = await call_llm(prompts.CANDIDATE_SELECT_SYSTEM, prompt, model=plan.model, tag=tag)
-    data = _parse_json_block(raw, fallback={})
-    best = data.get("best") if isinstance(data, dict) else None
-    if isinstance(best, int) and 1 <= best <= len(candidates):
-        return best - 1
-    return 0
-
-
-def _dedupe_lines(lines: list[str], limit: int | None = None) -> list[str]:
-    seen: set[str] = set()
-    cleaned: list[str] = []
-    for line in lines:
-        value = (line or "").strip()
-        if not value or value in seen:
-            continue
-        if value.lower().startswith("lexicon_") or value.lower().startswith("units:"):
-            continue
-        cleaned.append(value)
-        seen.add(value)
-        if limit and len(cleaned) >= limit:
-            break
-    return cleaned
-
-
-def _collect_fact_candidates(selected: list[dict[str, Any]], limit: int) -> list[str]:
-    lines: list[str] = []
-    for chunk in selected:
-        text = chunk.get("text") if isinstance(chunk, dict) else None
-        if not isinstance(text, str):
-            continue
-        lines.extend([line for line in text.splitlines() if line.strip()])
-    return _dedupe_lines(lines, limit=limit)
-
-
-async def _select_best_list(
-    call_llm: Callable[..., Any],
-    question: str,
-    candidates: list[list[str]],
-    plan: ModePlan,
-    tag: str,
-) -> list[str]:
-    if not candidates:
-        return []
-    if len(candidates) == 1:
-        return candidates[0]
-    render = ["; ".join(items) for items in candidates]
-    best_idx = await _select_best_candidate(call_llm, question, render, plan, tag)
-    chosen = candidates[best_idx] if 0 <= best_idx < len(candidates) else candidates[0]
-    if not chosen:
-        merged: list[str] = []
-        for entry in candidates:
-            for item in entry:
-                if item not in merged:
-                    merged.append(item)
-        chosen = merged
-    return chosen
-
-
-async def _extract_fact_types(
-    call_llm: Callable[..., Any],
-    question: str,
-    keywords: list[str],
-    plan: ModePlan,
-) -> list[str]:
-    prompt = prompts.FACT_TYPES_PROMPT + "\nQuestion: " + question
-    if keywords:
-        prompt += "\nKeywords: " + ", ".join(keywords)
-    candidates: list[list[str]] = []
-    attempts = max(plan.metric_retries, 1)
-    for _ in range(attempts):
-        raw = await call_llm(prompts.FACT_TYPES_SYSTEM, prompt, model=plan.fast_model, tag="fact_types")
-        data = _parse_json_block(raw, fallback={})
-        items = data.get("fact_types") if isinstance(data, dict) else None
-        if not isinstance(items, list):
-            continue
-        cleaned = _dedupe_lines([str(item) for item in items
if isinstance(item, (str, int, float))], limit=10) - if cleaned: - candidates.append(cleaned) - chosen = await _select_best_list(call_llm, question, candidates, plan, "fact_types_select") - return chosen[:10] - - -async def _derive_signals( - call_llm: Callable[..., Any], - question: str, - fact_types: list[str], - plan: ModePlan, -) -> list[str]: - if not fact_types: - return [] - prompt = prompts.SIGNAL_PROMPT.format(question=question, fact_types="; ".join(fact_types)) - candidates: list[list[str]] = [] - attempts = max(plan.metric_retries, 1) - for _ in range(attempts): - raw = await call_llm(prompts.SIGNAL_SYSTEM, prompt, model=plan.fast_model, tag="signals") - data = _parse_json_block(raw, fallback={}) - items = data.get("signals") if isinstance(data, dict) else None - if not isinstance(items, list): - continue - cleaned = _dedupe_lines([str(item) for item in items if isinstance(item, (str, int, float))], limit=12) - if cleaned: - candidates.append(cleaned) - chosen = await _select_best_list(call_llm, question, candidates, plan, "signals_select") - return chosen[:12] - - -async def _scan_chunk_for_signals( - call_llm: Callable[..., Any], - question: str, - signals: list[str], - chunk_lines: list[str], - plan: ModePlan, -) -> list[str]: - if not signals or not chunk_lines: - return [] - prompt = prompts.CHUNK_SCAN_PROMPT.format( - signals="; ".join(signals), - lines="\n".join(chunk_lines), - ) - attempts = max(1, min(plan.metric_retries, 2)) - candidates: list[list[str]] = [] - for _ in range(attempts): - raw = await call_llm(prompts.CHUNK_SCAN_SYSTEM, prompt, model=plan.fast_model, tag="chunk_scan") - data = _parse_json_block(raw, fallback={}) - items = data.get("lines") if isinstance(data, dict) else None - if not isinstance(items, list): - continue - cleaned = [line for line in chunk_lines if line in items] - cleaned = _dedupe_lines(cleaned, limit=15) - if cleaned: - candidates.append(cleaned) - chosen = await _select_best_list(call_llm, question, candidates, plan, "chunk_scan_select") - return chosen[:15] - - -async def _prune_metric_candidates( - call_llm: Callable[..., Any], - question: str, - candidates: list[str], - plan: ModePlan, - attempts: int, -) -> list[str]: - if not candidates: - return [] - prompt = prompts.FACT_PRUNE_PROMPT.format(question=question, candidates="\n".join(candidates), max_lines=6) - picks: list[list[str]] = [] - for _ in range(max(attempts, 1)): - raw = await call_llm(prompts.FACT_PRUNE_SYSTEM, prompt, model=plan.fast_model, tag="fact_prune") - data = _parse_json_block(raw, fallback={}) - items = data.get("lines") if isinstance(data, dict) else None - if not isinstance(items, list): - continue - cleaned = [line for line in candidates if line in items] - cleaned = _dedupe_lines(cleaned, limit=6) - if cleaned: - picks.append(cleaned) - chosen = await _select_best_list(call_llm, question, picks, plan, "fact_prune_select") - return chosen[:6] - - -async def _select_fact_lines( - call_llm: Callable[..., Any], - question: str, - candidates: list[str], - plan: ModePlan, - max_lines: int, -) -> list[str]: - if not candidates: - return [] - prompt = prompts.FACT_PRUNE_PROMPT.format(question=question, candidates="\n".join(candidates), max_lines=max_lines) - picks: list[list[str]] = [] - attempts = max(plan.metric_retries, 1) - for _ in range(attempts): - raw = await call_llm(prompts.FACT_PRUNE_SYSTEM, prompt, model=plan.fast_model, tag="fact_select") - data = _parse_json_block(raw, fallback={}) - items = data.get("lines") if isinstance(data, dict) else None - if 
not isinstance(items, list): - continue - cleaned = [line for line in candidates if line in items] - cleaned = _dedupe_lines(cleaned, limit=max_lines) - if cleaned: - picks.append(cleaned) - chosen = await _select_best_list(call_llm, question, picks, plan, "fact_select_best") - return chosen[:max_lines] - - -def _strip_unknown_entities(reply: str, unknown_nodes: list[str], unknown_namespaces: list[str]) -> str: - if not reply: - return reply - if not unknown_nodes and not unknown_namespaces: - return reply - sentences = [s.strip() for s in re.split(r"(?<=[.!?])\\s+", reply) if s.strip()] - if not sentences: - return reply - lowered_nodes = [node.lower() for node in unknown_nodes] - lowered_namespaces = [ns.lower() for ns in unknown_namespaces] - kept: list[str] = [] - for sent in sentences: - lower = sent.lower() - if lowered_nodes and any(node in lower for node in lowered_nodes): - continue - if lowered_namespaces and any(f"namespace {ns}" in lower for ns in lowered_namespaces): - continue - kept.append(sent) - cleaned = " ".join(kept).strip() - return cleaned or reply - - -def _needs_evidence_guard(reply: str, facts: list[str]) -> bool: - if not reply or not facts: - return False - lower_reply = reply.lower() - fact_text = " ".join(facts).lower() - node_pattern = re.compile(r"\b(titan-[0-9a-z]+|node-?\d+)\b", re.IGNORECASE) - nodes = {m.group(1).lower() for m in node_pattern.finditer(reply)} - if nodes: - missing = [node for node in nodes if node not in fact_text] - if missing: - return True - pressure_terms = ("pressure", "diskpressure", "memorypressure", "pidpressure", "headroom") - if any(term in lower_reply for term in pressure_terms) and not any(term in fact_text for term in pressure_terms): - return True - arch_terms = ("amd64", "arm64", "rpi", "rpi4", "rpi5", "jetson") - if any(term in lower_reply for term in arch_terms) and not any(term in fact_text for term in arch_terms): - return True - return False - - -async def _contradiction_decision( - ctx: ContradictionContext, - attempts: int = 1, -) -> dict[str, Any]: - best = {"use_facts": True, "confidence": 50} - facts_block = "\n".join(ctx.facts[:12]) - for idx in range(max(1, attempts)): - variant = f"Variant: {idx + 1}" if attempts > 1 else "" - prompt = ( - prompts.CONTRADICTION_PROMPT.format(question=ctx.question, draft=ctx.reply, facts=facts_block) - + ("\n" + variant if variant else "") - ) - raw = await ctx.call_llm( - prompts.CONTRADICTION_SYSTEM, - prompt, - model=ctx.plan.fast_model, - tag="contradiction", - ) - data = _parse_json_block(raw, fallback={}) - try: - confidence = int(data.get("confidence", 50)) - except Exception: - confidence = 50 - use_facts = bool(data.get("use_facts", True)) - if confidence >= best.get("confidence", 0): - best = {"use_facts": use_facts, "confidence": confidence} - return best - - -def _filter_lines_by_keywords(lines: list[str], keywords: list[str], max_lines: int) -> list[str]: - if not lines: - return [] - tokens = _expand_tokens(keywords) - if not tokens: - return lines[:max_lines] - filtered = [line for line in lines if any(tok in line.lower() for tok in tokens)] - return (filtered or lines)[:max_lines] - - -def _rank_metric_lines(lines: list[str], tokens: set[str], max_lines: int) -> list[str]: - if not lines or not tokens: - return [] - ranked: list[tuple[int, int, str]] = [] - for line in lines: - lower = line.lower() - hits = sum(1 for tok in tokens if tok in lower) - if not hits: - continue - has_number = 1 if re.search(r"\d", line) else 0 - ranked.append((has_number, hits, line)) 
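-    # Rank order: lines containing digits first, then more keyword hits, then alphabetical.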
-    ranked.sort(key=lambda item: (-item[0], -item[1], item[2]))
-    return [item[2] for item in ranked[:max_lines]]
-
-
-def _select_metric_line(lines: list[str], question: str, tokens: list[str] | set[str]) -> str | None:
-    if not lines or not tokens:
-        return None
-    token_set = {str(tok).lower() for tok in tokens if tok}
-    ranked = _rank_metric_lines(lines, token_set, max_lines=6)
-    if not ranked:
-        return None
-    question_lower = (question or "").lower()
-    if any(term in question_lower for term in ("how many", "count", "total")):
-        for line in ranked:
-            lower = line.lower()
-            if "total" in lower or "count" in lower:
-                return line
-    return ranked[0]
-
-
-def _format_direct_metric_line(line: str) -> str:
-    if not line:
-        return ""
-    if ":" in line:
-        formatted = _format_colon_metric(line)
-        if formatted:
-            return formatted
-    if "=" in line:
-        formatted = _format_equals_metric(line)
-        if formatted:
-            return formatted
-    return line
-
-
-def _format_colon_metric(line: str) -> str | None:
-    key, value = line.split(":", 1)
-    key = key.strip().replace("_", " ")
-    value = value.strip()
-    if not value:
-        return None
-    if key == "nodes":
-        formatted = _format_nodes_value(value)
-        if formatted:
-            return formatted
-    if key in {"nodes total", "nodes_total"}:
-        return f"Atlas has {value} total nodes."
-    return f"{key} is {value}."
-
-
-def _format_equals_metric(line: str) -> str | None:
-    pairs: list[str] = []
-    for part in line.split(","):
-        if "=" not in part:
-            continue
-        key, value = part.split("=", 1)
-        key = key.strip().replace("_", " ")
-        value = value.strip()
-        if not value:
-            continue
-        if key in {"nodes total", "nodes_total"}:
-            return f"Atlas has {value} total nodes."
-        pairs.append(f"{key} is {value}")
-    if not pairs:
-        return None
-    if len(pairs) == 1:
-        return f"{pairs[0]}."
-    return "; ".join(pairs) + "."
-
-
-def _format_nodes_value(value: str) -> str | None:
-    parts = [p.strip() for p in value.split(",") if p.strip()]
-    total = None
-    rest: list[str] = []
-    for part in parts:
-        if part.startswith("total="):
-            total = part.split("=", 1)[1]
-        else:
-            rest.append(part.replace("_", " "))
-    if not total:
-        return None
-    if rest:
-        return f"Atlas has {total} total nodes ({'; '.join(rest)})."
-    return f"Atlas has {total} total nodes."
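These formatting helpers turn terse `key: value` snapshot lines into sentences before they reach the user. A minimal, self-contained sketch of the colon-style rule, with invented example lines (not real snapshot output):

    def format_colon_metric(line: str) -> str:
        # Mirrors _format_colon_metric above: "key: value" becomes a sentence,
        # with the nodes_total key special-cased into a friendlier phrasing.
        key, value = line.split(":", 1)
        key = key.strip().replace("_", " ")
        value = value.strip()
        if key in {"nodes total", "nodes_total"}:
            return f"Atlas has {value} total nodes."
        return f"{key} is {value}."

    assert format_colon_metric("nodes_total: 21") == "Atlas has 21 total nodes."
    assert format_colon_metric("postgres_connections_total: 87") == "postgres connections total is 87."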
- - -def _global_facts(lines: list[str]) -> list[str]: - if not lines: - return [] - wanted = ("nodes_total", "nodes_ready", "cluster_name", "cluster", "nodes_not_ready") - facts: list[str] = [] - for line in lines: - lower = line.lower() - if any(key in lower for key in wanted): - facts.append(line) - return _dedupe_lines(facts, limit=6) - - -def _has_keyword_overlap(lines: list[str], keywords: list[str]) -> bool: - if not lines or not keywords: - return False - tokens = _expand_tokens(keywords) - if not tokens: - return False - for line in lines: - lower = line.lower() - if any(tok in lower for tok in tokens): - return True - return False - - -def _merge_tokens(primary: list[str], secondary: list[str], third: list[str] | None = None) -> list[str]: - merged: list[str] = [] - for token in primary + secondary + (third or []): - if not token: - continue - if token not in merged: - merged.append(token) - return merged - - -def _extract_question_tokens(question: str) -> list[str]: - if not question: - return [] - tokens: list[str] = [] - for part in re.split(r"[^a-zA-Z0-9_-]+", question.lower()): - if len(part) < TOKEN_MIN_LEN: - continue - if part not in tokens: - tokens.append(part) - return tokens - - -def _expand_tokens(tokens: list[str]) -> list[str]: - if not tokens: - return [] - expanded: list[str] = [] - for token in tokens: - if not isinstance(token, str): - continue - for part in re.split(r"[^a-zA-Z0-9_-]+", token.lower()): - if len(part) < TOKEN_MIN_LEN: - continue - if part not in expanded: - expanded.append(part) - return expanded - - -def _ensure_token_coverage( - lines: list[str], - tokens: list[str], - summary_lines: list[str], - max_add: int = 4, -) -> list[str]: - if not lines or not tokens or not summary_lines: - return lines - hay = " ".join(lines).lower() - missing = [tok for tok in tokens if tok and tok.lower() not in hay] - if not missing: - return lines - added: list[str] = [] - for token in missing: - token_lower = token.lower() - for line in summary_lines: - if token_lower in line.lower() and line not in lines and line not in added: - added.append(line) - break - if len(added) >= max_add: - break - if not added: - return lines - return _merge_fact_lines(added, lines) - - -def _best_keyword_line(lines: list[str], keywords: list[str]) -> str | None: - if not lines or not keywords: - return None - tokens = _expand_tokens(keywords) - if not tokens: - return None - best = None - best_score = 0 - for line in lines: - lower = line.lower() - score = sum(1 for tok in tokens if tok in lower) - if score > best_score: - best_score = score - best = line - return best if best_score > 0 else None - - -def _line_starting_with(lines: list[str], prefix: str) -> str | None: - if not lines or not prefix: - return None - lower_prefix = prefix.lower() - for line in lines: - if str(line).lower().startswith(lower_prefix): - return line - return None - - -def _non_rpi_nodes(summary: dict[str, Any]) -> dict[str, list[str]]: - hardware = summary.get("hardware_by_node") if isinstance(summary, dict) else None - if not isinstance(hardware, dict): - return {} - grouped: dict[str, list[str]] = {} - for node, hw in hardware.items(): - if not isinstance(node, str) or not isinstance(hw, str): - continue - if hw.startswith("rpi"): - continue - grouped.setdefault(hw, []).append(node) - for nodes in grouped.values(): - nodes.sort() - return grouped - - -def _format_hardware_groups(groups: dict[str, list[str]], label: str) -> str: - if not groups: - return "" - parts = [] - for hw, nodes in 
sorted(groups.items()): - parts.append(f"{hw} ({', '.join(nodes)})") - return f"{label}: " + "; ".join(parts) + "." - - -def _lexicon_context(summary: dict[str, Any]) -> str: # noqa: C901 - if not isinstance(summary, dict): - return "" - lexicon = summary.get("lexicon") - if not isinstance(lexicon, dict): - return "" - terms = lexicon.get("terms") - aliases = lexicon.get("aliases") - lines: list[str] = [] - if isinstance(terms, list): - for entry in terms[:8]: - if not isinstance(entry, dict): - continue - term = entry.get("term") - meaning = entry.get("meaning") - if term and meaning: - lines.append(f"{term}: {meaning}") - if isinstance(aliases, dict): - for key, value in list(aliases.items())[:6]: - if key and value: - lines.append(f"alias {key} -> {value}") - if not lines: - return "" - return "Lexicon:\n" + "\n".join(lines) - - -def _parse_json_block(text: str, *, fallback: dict[str, Any]) -> dict[str, Any]: - raw = text.strip() - match = re.search(r"\{.*\}", raw, flags=re.S) - if match: - return parse_json(match.group(0), fallback=fallback) - return parse_json(raw, fallback=fallback) - - -def _parse_json_list(text: str) -> list[dict[str, Any]]: - raw = text.strip() - match = re.search(r"\[.*\]", raw, flags=re.S) - data = parse_json(match.group(0), fallback={}) if match else parse_json(raw, fallback={}) - if isinstance(data, list): - return [entry for entry in data if isinstance(entry, dict)] - return [] - - -def _scores_from_json(data: dict[str, Any]) -> AnswerScores: - return AnswerScores( - confidence=_coerce_int(data.get("confidence"), 60), - relevance=_coerce_int(data.get("relevance"), 60), - satisfaction=_coerce_int(data.get("satisfaction"), 60), - hallucination_risk=str(data.get("hallucination_risk") or "medium"), - ) - - -def _coerce_int(value: Any, default: int) -> int: - try: - return int(float(value)) - except (TypeError, ValueError): - return default - - -def _default_scores() -> AnswerScores: - return AnswerScores(confidence=60, relevance=60, satisfaction=60, hallucination_risk="medium") - - -def _style_hint(classify: dict[str, Any]) -> str: - style = (classify.get("answer_style") or "").strip().lower() - qtype = (classify.get("question_type") or "").strip().lower() - if style == "insightful" or qtype in {"open_ended", "planning"}: - return "insightful" - return "direct" - - -def _needs_evidence_fix(reply: str, classify: dict[str, Any]) -> bool: - if not reply: - return False - lowered = reply.lower() - missing_markers = ( - "don't have", - "do not have", - "don't know", - "cannot", - "can't", - "need to", - "would need", - "does not provide", - "does not mention", - "not mention", - "not provided", - "not in context", - "not referenced", - "missing", - "no specific", - "no information", - ) - if classify.get("needs_snapshot") and any(marker in lowered for marker in missing_markers): - return True - if classify.get("question_type") in {"metric", "diagnostic"} and not re.search(r"\d", reply): - return True - return False - - -def _should_use_insight_guard(classify: dict[str, Any]) -> bool: - style = (classify.get("answer_style") or "").strip().lower() - qtype = (classify.get("question_type") or "").strip().lower() - return style == "insightful" or qtype in {"open_ended", "planning"} - - -async def _apply_insight_guard(inputs: InsightGuardInput) -> str: - if not inputs.reply or not _should_use_insight_guard(inputs.classify): - return inputs.reply - guard_prompt = prompts.INSIGHT_GUARD_PROMPT.format(question=inputs.question, answer=inputs.reply) - guard_raw = await 
inputs.call_llm( - prompts.INSIGHT_GUARD_SYSTEM, - guard_prompt, - context=inputs.context, - model=inputs.plan.fast_model, - tag="insight_guard", - ) - guard = _parse_json_block(guard_raw, fallback={}) - if guard.get("ok") is True: - return inputs.reply - fix_prompt = prompts.INSIGHT_FIX_PROMPT.format(question=inputs.question, answer=inputs.reply) - if inputs.facts: - fix_prompt = fix_prompt + "\nFacts:\n" + "\n".join(inputs.facts[:6]) - return await inputs.call_llm( - prompts.INSIGHT_FIX_SYSTEM, - fix_prompt, - context=inputs.context, - model=inputs.plan.model, - tag="insight_fix", - ) - - -def _reply_matches_metric_facts(reply: str, metric_facts: list[str], tokens: list[str] | set[str] | None = None) -> bool: - if not reply or not metric_facts: - return True - reply_numbers = set(re.findall(r"\d+(?:\\.\d+)?", reply)) - if not reply_numbers: - return False - fact_numbers: set[str] = set() - value_pattern = re.compile(r"(?:>=|<=|=|:)\\s*(\\d+(?:\\.\\d+)?)") - filtered = metric_facts - if tokens: - token_set = {str(tok).lower() for tok in tokens if tok} - focused = [] - for line in metric_facts: - key = line.split(":", 1)[0].lower() - if any(tok in key for tok in token_set): - focused.append(line) - if focused: - filtered = focused - for line in filtered: - for match in value_pattern.findall(line): - fact_numbers.add(match) - if not fact_numbers: - return False - return bool(reply_numbers & fact_numbers) - - -def _needs_dedup(reply: str) -> bool: - if not reply: - return False - sentences = [s.strip() for s in re.split(r"(?<=[.!?])\\s+", reply) if s.strip()] - if len(sentences) < DEDUP_MIN_SENTENCES: - return False - seen = set() - for sent in sentences: - norm = re.sub(r"\\s+", " ", sent.lower()) - if norm in seen: - return True - seen.add(norm) - return False - - -def _needs_focus_fix(question: str, reply: str, classify: dict[str, Any]) -> bool: - if not reply: - return False - q_lower = (question or "").lower() - if classify.get("question_type") not in {"metric", "diagnostic"} and not re.search(r"\b(how many|list|count)\b", q_lower): - return False - missing_markers = ( - "does not provide", - "does not specify", - "not available", - "not provided", - "cannot determine", - "don't have", - "do not have", - "insufficient", - "no data", - ) - if any(marker in reply.lower() for marker in missing_markers): - return True - if reply.count(".") <= 1: - return False - extra_markers = ("for more", "if you need", "additional", "based on") - return any(marker in reply.lower() for marker in extra_markers) - - -def _extract_keywords( - raw_question: str, - normalized: str, - sub_questions: list[str], - keywords: list[Any] | None, -) -> list[str]: - stopwords = { - "the", - "and", - "for", - "with", - "that", - "this", - "what", - "which", - "when", - "where", - "who", - "why", - "how", - "tell", - "show", - "list", - "give", - "about", - "right", - "now", - } - tokens: list[str] = [] - for source in [raw_question, normalized, *sub_questions]: - for part in re.split(r"[^a-zA-Z0-9_-]+", source.lower()): - if len(part) < TOKEN_MIN_LEN or part in stopwords: - continue - tokens.append(part) - if keywords: - for kw in keywords: - if isinstance(kw, str): - part = kw.strip().lower() - if part and part not in stopwords and part not in tokens: - tokens.append(part) - return list(dict.fromkeys(tokens))[:12] - - -def _allowed_nodes(summary: dict[str, Any]) -> list[str]: - hardware = summary.get("hardware_by_node") if isinstance(summary.get("hardware_by_node"), dict) else {} - if hardware: - return sorted([node 
for node in hardware.keys() if isinstance(node, str)]) - return [] - - -def _allowed_namespaces(summary: dict[str, Any]) -> list[str]: - namespaces: list[str] = [] - for entry in summary.get("namespace_pods") or []: - if isinstance(entry, dict): - name = entry.get("namespace") - if name: - namespaces.append(str(name)) - return sorted(set(namespaces)) - - -def _find_unknown_nodes(reply: str, allowed: list[str]) -> list[str]: - if not reply or not allowed: - return [] - pattern = re.compile(r"\b(titan-[0-9a-z]+|node-?\d+)\b", re.IGNORECASE) - found = {m.group(1) for m in pattern.finditer(reply)} - if not found: - return [] - allowed_set = {a.lower() for a in allowed} - return sorted({item for item in found if item.lower() not in allowed_set}) - - -def _find_unknown_namespaces(reply: str, allowed: list[str]) -> list[str]: - if not reply or not allowed: - return [] - pattern = re.compile(r"\bnamespace\s+([a-z0-9-]+)\b", re.IGNORECASE) - found = {m.group(1) for m in pattern.finditer(reply)} - if not found: - return [] - allowed_set = {a.lower() for a in allowed} - return sorted({item for item in found if item.lower() not in allowed_set}) - - -def _needs_runbook_fix(reply: str, allowed: list[str]) -> bool: - if not reply or not allowed: - return False - paths = set(re.findall(r"runbooks/[A-Za-z0-9._-]+", reply)) - if not paths: - return False - allowed_set = {p.lower() for p in allowed} - return any(path.lower() not in allowed_set for path in paths) - - -def _needs_runbook_reference(question: str, allowed: list[str], reply: str) -> bool: - if not allowed or not question: - return False - lowered = question.lower() - cues = ("runbook", "checklist", "documented", "documentation", "where", "guide") - if not any(cue in lowered for cue in cues): - return False - if not reply: - return True - for token in re.findall(r"runbooks/[A-Za-z0-9._-]+", reply): - if token.lower() in {p.lower() for p in allowed}: - return False - return True - - -def _best_runbook_match(candidate: str, allowed: list[str]) -> str | None: - if not candidate or not allowed: - return None - best = None - best_score = 0.0 - for path in allowed: - score = difflib.SequenceMatcher(a=candidate.lower(), b=path.lower()).ratio() - if score > best_score: - best_score = score - best = path - return best if best_score >= RUNBOOK_SIMILARITY_THRESHOLD else None - - -def _resolve_path(data: Any, path: str) -> Any | None: - if path.startswith("line:"): - return path.split("line:", 1)[1].strip() - cursor = data - for part in re.split(r"\.(?![^\[]*\])", path): - if not part: - continue - match = re.match(r"^(\w+)(?:\[(\d+)\])?$", part) - if not match: - return None - key = match.group(1) - index = match.group(2) - if isinstance(cursor, dict): - cursor = cursor.get(key) - else: - return None - if index is not None: - try: - idx = int(index) - if isinstance(cursor, list) and 0 <= idx < len(cursor): - cursor = cursor[idx] - else: - return None - except ValueError: - return None - return cursor - - -def _snapshot_id(summary: dict[str, Any]) -> str | None: - if not summary: - return None - for key in ("generated_at", "snapshot_ts", "snapshot_id"): - value = summary.get(key) - if isinstance(value, str) and value: - return value - return None - - -def _claims_to_payload(claims: list[ClaimItem]) -> list[dict[str, Any]]: - output: list[dict[str, Any]] = [] - for claim in claims: - evidence = [] - for ev in claim.evidence: - evidence.append( - { - "path": ev.path, - "reason": ev.reason, - "value_at_claim": ev.value_at_claim, - } - ) - output.append({"id": 
claim.id, "claim": claim.claim, "evidence": evidence}) - return output - - -def _state_from_payload(payload: dict[str, Any] | None) -> ConversationState | None: - if not payload: - return None - claims_raw = payload.get("claims") if isinstance(payload, dict) else None - claims: list[ClaimItem] = [] - if isinstance(claims_raw, list): - for entry in claims_raw: - if not isinstance(entry, dict): - continue - claim_text = str(entry.get("claim") or "").strip() - claim_id = str(entry.get("id") or "").strip() - if not claim_text or not claim_id: - continue - evidence_items: list[EvidenceItem] = [] - for ev in entry.get("evidence") or []: - if not isinstance(ev, dict): - continue - path = str(ev.get("path") or "").strip() - if not path: - continue - reason = str(ev.get("reason") or "").strip() - value_at_claim = ev.get("value_at_claim") - evidence_items.append(EvidenceItem(path=path, reason=reason, value_at_claim=value_at_claim)) - if evidence_items: - claims.append(ClaimItem(id=claim_id, claim=claim_text, evidence=evidence_items)) - return ConversationState( - updated_at=float(payload.get("updated_at") or time.monotonic()), - claims=claims, - snapshot_id=payload.get("snapshot_id"), - snapshot=payload.get("snapshot"), - ) - - -def _factsheet_kb_chars(mode: str, default_chars: int) -> int: - if mode == "genius": - return min(max(default_chars, 4000), 6000) - if mode == "smart": - return min(max(default_chars, 3000), 4500) - return max(1200, default_chars) - - -def _factsheet_line_limit(mode: str) -> int: - if mode == "genius": - return 30 - if mode == "smart": - return 22 - return 14 - - -def _factsheet_instruction(mode: str) -> str: - if mode == "genius": - return ( - "Start with a direct conclusion, then include the strongest supporting facts and one caveat. " - "Keep it to 4-8 sentences. If data is missing, name the missing metric explicitly." - ) - if mode == "smart": - return ( - "Start with a direct conclusion and support it with key facts. Keep it to 2-5 sentences. " - "If data is missing, say exactly what is missing and suggest atlas-genius." - ) - return "Keep it to 1-3 sentences. If key data is missing, say what is missing and suggest atlas-smart." 
- - -def _factsheet_model(mode: str, plan: ModePlan) -> str: - if mode in {"quick", "fast"}: - return plan.fast_model - return plan.model - - -def _is_plain_math_question(question: str) -> bool: - lowered = question.lower().strip() - if not lowered: - return False - cluster_markers = ( - "titan", - "atlas", - "cluster", - "node", - "pod", - "namespace", - "workload", - "grafana", - "alert", - "k8s", - "kubernetes", - "rpi", - "longhorn", - "postgres", - "victoria", - "ollama", - ) - if any(token in lowered for token in cluster_markers): - return False - if re.fullmatch(r"[0-9\s+\-*/().=]+", lowered): - return True - if re.search(r"\bwhat(?:'s| is)\s+\d+\s*[-+*/]\s*\d+\b", lowered): - return True - return False - - -def _quick_fact_sheet_lines( - question: str, - summary_lines: list[str], - kb_lines: list[str], - *, - limit: int, -) -> list[str]: - tokens = { - token - for token in re.findall(r"[a-z0-9][a-z0-9_-]{2,}", question.lower()) - if token not in GENERIC_METRIC_TOKENS - } - priority_markers = ( - "snapshot:", - "nodes_total", - "nodes_ready", - "nodes_not_ready", - "workers_ready", - "workers_not_ready", - "control_plane", - "worker_nodes", - "hottest", - "postgres", - "pods", - "longhorn", - "titan-", - "rpi5", - "rpi4", - "jetson", - "amd64", - ) - scored: list[tuple[int, str]] = [] - for raw in summary_lines: - line = raw.strip() - if not line: - continue - lowered = line.lower() - score = 0 - if any(marker in lowered for marker in priority_markers): - score += 4 - overlap = sum(1 for token in tokens if token in lowered) - score += overlap * 3 - if len(line) <= 180: - score += 1 - if score > 0: - scored.append((score, line)) - - scored.sort(key=lambda item: item[0], reverse=True) - selected = [line for _, line in scored[:limit]] - if not selected: - selected = [line.strip() for line in summary_lines if line.strip()][:limit] - - kb_selected: list[str] = [] - for raw in kb_lines: - line = raw.strip() - if not line or len(line) > 220: - continue - lowered = line.lower() - if "kb file:" in lowered or "kb: atlas.json" in lowered: - continue - overlap = sum(1 for token in tokens if token in lowered) - if overlap > 0: - kb_selected.append(line) - elif any(marker in lowered for marker in ("runbook", "titan-", "rpi5", "rpi4", "amd64", "jetson")): - kb_selected.append(line) - if len(kb_selected) >= max(4, limit // 3): - break - - merged = [] - seen: set[str] = set() - for line in selected + kb_selected: - if line not in seen: - seen.add(line) - merged.append(line) - if len(merged) >= limit: - break - return merged - - -def _quick_fact_sheet_text(lines: list[str]) -> str: - if not lines: - return "Fact Sheet:\n- No snapshot facts available." - body = "\n".join([f"- {line}" for line in lines]) - return "Fact Sheet:\n" + body - - -def _quick_fact_sheet_heuristic_answer(question: str, fact_lines: list[str]) -> str: - lowered = question.lower() - if ( - any(token in lowered for token in ("placement", "schedule", "last resort", "last-resort")) - and any(token in lowered for token in ("node", "workload", "worker", "titan")) - ): - return ( - "General workload placement is: prefer rpi5 workers first, then rpi4 workers. " - "titan-22 is the last-resort general compute node, and titan-24 is the absolute last resort " - "reserved for heavy one-offs." 
- ) - - for line in fact_lines: - compact = line.replace(" ", "") - match = re.search(r"nodes_total[:=](\d+),ready[:=](\d+),not_ready[:=](\d+)", compact) - if not match: - continue - total = match.group(1) - ready = match.group(2) - not_ready = match.group(3) - if "how many" in lowered and "ready" in lowered and "node" in lowered: - return f"The latest snapshot shows {ready} ready nodes out of {total} total ({not_ready} not ready)." - if ("not ready" in lowered or "unready" in lowered) and "node" in lowered: - return f"The latest snapshot shows {not_ready} not-ready nodes ({ready} ready out of {total} total)." - return "" - - -def _json_excerpt(summary: dict[str, Any], max_chars: int = 12000) -> str: - raw = json.dumps(summary, ensure_ascii=False) - return raw[:max_chars] diff --git a/atlasbot/engine/answerer/__init__.py b/atlasbot/engine/answerer/__init__.py new file mode 100644 index 0000000..676058c --- /dev/null +++ b/atlasbot/engine/answerer/__init__.py @@ -0,0 +1,12 @@ +"""Answer engine package.""" + +from ._base import * +from .common import * +from .engine import * +from .factsheet import * +from .post import * +from .post_ext import * +from .retrieval import * +from .retrieval_ext import * +from .spine import * +from .workflow import * diff --git a/atlasbot/engine/answerer/_base.py b/atlasbot/engine/answerer/_base.py new file mode 100644 index 0000000..30d0fdd --- /dev/null +++ b/atlasbot/engine/answerer/_base.py @@ -0,0 +1,116 @@ +from __future__ import annotations + +import logging +from collections.abc import Awaitable, Callable +from dataclasses import dataclass +from typing import Any + +log = logging.getLogger(__name__) + +FOLLOWUP_SHORT_WORDS = 6 +TOKEN_MIN_LEN = 3 +GENERIC_METRIC_TOKENS = {"atlas", "cluster", "kubernetes", "k8s", "titan", "lab"} +NS_ENTRY_MIN_LEN = 2 +DEDUP_MIN_SENTENCES = 3 +RUNBOOK_SIMILARITY_THRESHOLD = 0.4 +BYTES_KB = 1024 +BYTES_MB = 1024 * 1024 + + +class LLMLimitReached(RuntimeError): + pass + + +class LLMTimeBudgetExceeded(RuntimeError): + pass + + +@dataclass +class AnswerScores: + confidence: int + relevance: int + satisfaction: int + hallucination_risk: str + + +@dataclass +class AnswerResult: + reply: str + scores: AnswerScores + meta: dict[str, Any] + + +@dataclass(frozen=True) +class InsightGuardInput: + question: str + reply: str + classify: dict[str, Any] + context: str + plan: ModePlan + call_llm: Callable[..., Awaitable[str]] + facts: list[str] + + +@dataclass +class ContradictionContext: + call_llm: Callable[..., Awaitable[str]] + question: str + reply: str + facts: list[str] + plan: ModePlan + + +@dataclass +class EvidenceItem: + path: str + reason: str + value: Any | None = None + value_at_claim: Any | None = None + + +@dataclass +class ClaimItem: + id: str + claim: str + evidence: list[EvidenceItem] + + +@dataclass +class ConversationState: + updated_at: float + claims: list[ClaimItem] + snapshot_id: str | None = None + snapshot: dict[str, Any] | None = None + + +@dataclass +class ModePlan: + model: str + fast_model: str + max_subquestions: int + chunk_lines: int + chunk_top: int + chunk_group: int + kb_max_chars: int + kb_max_files: int + use_raw_snapshot: bool + parallelism: int + score_retries: int + use_deep_retrieval: bool + use_tool: bool + use_critic: bool + use_gap: bool + use_scores: bool + drafts: int + metric_retries: int + subanswer_retries: int + + +@dataclass +class ScoreContext: + question: str + sub_questions: list[str] + retries: int + parallelism: int + select_best: bool + fast_model: str diff --git 
a/atlasbot/engine/answerer/common.py b/atlasbot/engine/answerer/common.py new file mode 100644 index 0000000..ce09fde --- /dev/null +++ b/atlasbot/engine/answerer/common.py @@ -0,0 +1,395 @@ +from __future__ import annotations + +import json +import time +from collections.abc import Awaitable, Callable +from typing import Any + +from atlasbot.config import Settings +from atlasbot.llm import prompts +from atlasbot.llm.client import parse_json + +from ._base import * +from .factsheet import * +from .post import * +from .post_ext import * +from .retrieval import _gather_limited +from .retrieval_ext import * +from .spine import * + + +def _strip_followup_meta(reply: str) -> str: + cleaned = reply.strip() + if not cleaned: + return cleaned + prefixes = [ + "The draft is correct based on the provided context.", + "The draft is correct based on the context.", + "The draft is correct based on the provided evidence.", + "The draft is correct.", + "Based on the provided context,", + "Based on the context,", + "Based on the provided evidence,", + ] + for prefix in prefixes: + if cleaned.lower().startswith(prefix.lower()): + cleaned = cleaned[len(prefix) :].lstrip(" .") + break + return cleaned + + +def _build_meta(mode: str, call_count: int, call_cap: int, limit_hit: bool, time_budget_hit: bool, time_budget_sec: float, classify: dict[str, Any], tool_hint: dict[str, Any] | None, started: float) -> dict[str, Any]: + return { + "mode": mode, + "llm_calls": call_count, + "llm_limit": call_cap, + "llm_limit_hit": limit_hit, + "time_budget_sec": time_budget_sec, + "time_budget_hit": time_budget_hit, + "classify": classify, + "tool_hint": tool_hint, + "elapsed_sec": round(time.monotonic() - started, 2), + } + + +def _debug_pipeline_log(settings: Settings, name: str, payload: Any) -> None: + """Write a structured debug event when pipeline tracing is enabled.""" + + if not settings.debug_pipeline: + return + log.info("atlasbot_debug", extra={"extra": {"name": name, "payload": payload}}) + + +def _mode_plan(settings: Settings, mode: str) -> ModePlan: + if mode == "genius": + return ModePlan( + model=settings.ollama_model_genius, + fast_model=settings.ollama_model_fast, + max_subquestions=6, + chunk_lines=6, + chunk_top=10, + chunk_group=4, + kb_max_chars=200000, + kb_max_files=200, + use_raw_snapshot=True, + parallelism=4, + score_retries=3, + use_deep_retrieval=True, + use_tool=True, + use_critic=True, + use_gap=True, + use_scores=True, + drafts=2, + metric_retries=3, + subanswer_retries=3, + ) + if mode == "smart": + return ModePlan( + model=settings.ollama_model_smart, + fast_model=settings.ollama_model_fast, + max_subquestions=4, + chunk_lines=8, + chunk_top=8, + chunk_group=4, + kb_max_chars=3000, + kb_max_files=12, + use_raw_snapshot=False, + parallelism=2, + score_retries=2, + use_deep_retrieval=True, + use_tool=True, + use_critic=True, + use_gap=True, + use_scores=True, + drafts=1, + metric_retries=2, + subanswer_retries=2, + ) + return ModePlan( + model=settings.ollama_model_fast, + fast_model=settings.ollama_model_fast, + max_subquestions=1, + chunk_lines=16, + chunk_top=3, + chunk_group=5, + kb_max_chars=800, + kb_max_files=4, + use_raw_snapshot=False, + parallelism=1, + score_retries=1, + use_deep_retrieval=False, + use_tool=False, + use_critic=False, + use_gap=False, + use_scores=False, + drafts=1, + metric_retries=1, + subanswer_retries=1, + ) + + +def _llm_call_limit(settings: Settings, mode: str) -> int: + if mode == "genius": + return settings.genius_llm_calls_max + if mode == "smart": + 
return settings.smart_llm_calls_max + return settings.fast_llm_calls_max + + +def _mode_time_budget(settings: Settings, mode: str) -> float: + if mode == "genius": + return max(0.0, settings.genius_time_budget_sec) + if mode == "smart": + return max(0.0, settings.smart_time_budget_sec) + return max(0.0, settings.quick_time_budget_sec) + + +def _select_subquestions(parts: list[dict[str, Any]], fallback: str, limit: int) -> list[str]: + if not parts: + return [fallback] + ranked = [] + for entry in parts: + if not isinstance(entry, dict): + continue + question = str(entry.get("question") or "").strip() + if not question: + continue + priority = entry.get("priority") + try: + weight = float(priority) + except (TypeError, ValueError): + weight = 1.0 + ranked.append((weight, question)) + ranked.sort(key=lambda item: item[0], reverse=True) + questions = [item[1] for item in ranked][:limit] + return questions or [fallback] + + +def _chunk_lines(lines: list[str], lines_per_chunk: int) -> list[dict[str, Any]]: + chunks: list[dict[str, Any]] = [] + if not lines: + return chunks + for idx in range(0, len(lines), lines_per_chunk): + chunk_lines = lines[idx : idx + lines_per_chunk] + text = "\n".join(chunk_lines) + summary = " | ".join(chunk_lines[:4]) + chunks.append({"id": f"c{idx//lines_per_chunk}", "text": text, "summary": summary}) + return chunks + + +def _raw_snapshot_chunks(snapshot: dict[str, Any] | None) -> list[dict[str, Any]]: + if not isinstance(snapshot, dict) or not snapshot: + return [] + chunks: list[dict[str, Any]] = [] + for key, value in snapshot.items(): + try: + payload = json.dumps({key: value}, indent=2) + except Exception: + continue + summary = f"raw:{key}" + chunks.append({"id": f"r{key}", "text": payload, "summary": summary}) + return chunks + + +def _build_chunk_groups(chunks: list[dict[str, Any]], group_size: int) -> list[list[dict[str, Any]]]: + groups: list[list[dict[str, Any]]] = [] + group: list[dict[str, Any]] = [] + for chunk in chunks: + group.append({"id": chunk["id"], "summary": chunk["summary"]}) + if len(group) >= group_size: + groups.append(group) + group = [] + if group: + groups.append(group) + return groups + + +async def _score_chunks(call_llm: Callable[..., Any], chunks: list[dict[str, Any]], question: str, sub_questions: list[str], plan: ModePlan) -> dict[str, float]: + scores: dict[str, float] = {chunk["id"]: 0.0 for chunk in chunks} + if not chunks: + return scores + groups = _build_chunk_groups(chunks, plan.chunk_group) + ctx = ScoreContext( + question=question, + sub_questions=sub_questions, + retries=max(1, plan.score_retries), + parallelism=plan.parallelism, + select_best=plan.score_retries > 1, + fast_model=plan.fast_model, + ) + if ctx.parallelism <= 1 or len(groups) * ctx.retries <= 1: + return await _score_groups_serial(call_llm, groups, ctx) + return await _score_groups_parallel(call_llm, groups, ctx) + + +async def _score_groups_serial(call_llm: Callable[..., Any], groups: list[list[dict[str, Any]]], ctx: ScoreContext) -> dict[str, float]: + scores: dict[str, float] = {} + for grp in groups: + runs = [await _score_chunk_group(call_llm, grp, ctx.question, ctx.sub_questions) for _ in range(ctx.retries)] + if ctx.select_best and len(runs) > 1: + best = await _select_best_score_run(call_llm, grp, runs, ctx) + scores.update(best) + else: + scores.update(_merge_score_runs(runs)) + return scores + + +async def _score_groups_parallel(call_llm: Callable[..., Any], groups: list[list[dict[str, Any]]], ctx: ScoreContext) -> dict[str, float]: + coros: 
list[Awaitable[tuple[int, dict[str, float]]]] = [] + for idx, grp in enumerate(groups): + for _ in range(ctx.retries): + coros.append(_score_chunk_group_run(call_llm, idx, grp, ctx.question, ctx.sub_questions)) + results = await _gather_limited(coros, ctx.parallelism) + grouped: dict[int, list[dict[str, float]]] = {} + for idx, result in results: + grouped.setdefault(idx, []).append(result) + scores: dict[str, float] = {} + for idx, runs in grouped.items(): + if ctx.select_best and len(runs) > 1: + group = groups[idx] + best = await _select_best_score_run(call_llm, group, runs, ctx) + scores.update(best) + else: + scores.update(_merge_score_runs(runs)) + return scores + + +async def _score_chunk_group(call_llm: Callable[..., Any], group: list[dict[str, Any]], question: str, sub_questions: list[str]) -> dict[str, float]: + prompt = ( + prompts.CHUNK_SCORE_PROMPT + + "\nQuestion: " + + question + + "\nSubQuestions: " + + json.dumps(sub_questions) + + "\nChunks: " + + json.dumps(group) + ) + raw = await call_llm(prompts.RETRIEVER_SYSTEM, prompt, model=None, tag="chunk_score") + data = _parse_json_list(raw) + scored: dict[str, float] = {} + for entry in data: + if not isinstance(entry, dict): + continue + cid = str(entry.get("id") or "").strip() + if not cid: + continue + try: + score = float(entry.get("score") or 0) + except (TypeError, ValueError): + score = 0.0 + scored[cid] = score + return scored + + +async def _score_chunk_group_run(call_llm: Callable[..., Any], idx: int, group: list[dict[str, Any]], question: str, sub_questions: list[str]) -> tuple[int, dict[str, float]]: + return idx, await _score_chunk_group(call_llm, group, question, sub_questions) + + +def _merge_score_runs(runs: list[dict[str, float]]) -> dict[str, float]: + if not runs: + return {} + totals: dict[str, float] = {} + counts: dict[str, int] = {} + for run in runs: + for key, value in run.items(): + totals[key] = totals.get(key, 0.0) + float(value) + counts[key] = counts.get(key, 0) + 1 + return {key: totals[key] / counts[key] for key in totals} + + +async def _select_best_score_run(call_llm: Callable[..., Any], group: list[dict[str, Any]], runs: list[dict[str, float]], ctx: ScoreContext) -> dict[str, float]: + if not runs: + return {} + prompt = ( + prompts.RETRIEVER_SELECT_PROMPT + + "\nQuestion: " + + ctx.question + + "\nSubQuestions: " + + json.dumps(ctx.sub_questions) + + "\nChunks: " + + json.dumps(group) + + "\nRuns: " + + json.dumps(runs) + ) + raw = await call_llm(prompts.RETRIEVER_SELECT_SYSTEM, prompt, model=ctx.fast_model, tag="chunk_select") + data = parse_json(raw) + idx = 0 + if isinstance(data, dict): + try: + idx = int(data.get("selected_index") or 0) + except (TypeError, ValueError): + idx = 0 + if idx < 0 or idx >= len(runs): + idx = 0 + return runs[idx] + + +def _keyword_hits(ranked: list[dict[str, Any]], head: dict[str, Any], keywords: list[str] | None) -> list[dict[str, Any]]: + if not keywords: + return [] + lowered = [kw.lower() for kw in keywords if isinstance(kw, str) and kw.strip()] + if not lowered: + return [] + hits: list[dict[str, Any]] = [] + for item in ranked: + if item is head: + continue + text = str(item.get("text") or "").lower() + if any(kw in text for kw in lowered): + hits.append(item) + return hits + + +def _select_chunks(chunks: list[dict[str, Any]], scores: dict[str, float], plan: ModePlan, keywords: list[str] | None = None, must_ids: list[str] | None = None) -> list[dict[str, Any]]: + if not chunks: + return [] + ranked = sorted(chunks, key=lambda item: 
scores.get(item["id"], 0.0), reverse=True) + selected: list[dict[str, Any]] = [chunks[0]] + if _append_must_chunks(chunks, selected, must_ids, plan.chunk_top): + return selected + if _append_keyword_chunks(ranked, selected, keywords, plan.chunk_top): + return selected + _append_ranked_chunks(ranked, selected, plan.chunk_top) + return selected + + +def _append_must_chunks(chunks: list[dict[str, Any]], selected: list[dict[str, Any]], must_ids: list[str] | None, limit: int) -> bool: + if not must_ids: + return False + id_map = {item["id"]: item for item in chunks} + for cid in must_ids: + item = id_map.get(cid) + if item and item not in selected: + selected.append(item) + if len(selected) >= limit: + return True + return False + + +def _append_keyword_chunks(ranked: list[dict[str, Any]], selected: list[dict[str, Any]], keywords: list[str] | None, limit: int) -> bool: + if not ranked: + return False + head = ranked[0] + for item in _keyword_hits(ranked, head, keywords): + if item not in selected: + selected.append(item) + if len(selected) >= limit: + return True + return False + + +def _append_ranked_chunks(ranked: list[dict[str, Any]], selected: list[dict[str, Any]], limit: int) -> None: + for item in ranked: + if len(selected) >= limit: + break + if item not in selected: + selected.append(item) + + +def _format_runbooks(runbooks: list[str]) -> str: + if not runbooks: + return "" + return "Relevant runbooks:\n" + "\n".join([f"- {item}" for item in runbooks]) + + +__all__ = [name for name in globals() if name.startswith("_") and not name.startswith("__")] diff --git a/atlasbot/engine/answerer/engine.py b/atlasbot/engine/answerer/engine.py new file mode 100644 index 0000000..d01b925 --- /dev/null +++ b/atlasbot/engine/answerer/engine.py @@ -0,0 +1,267 @@ +from __future__ import annotations + +from collections.abc import Callable +import json +import time +from typing import Any + +from atlasbot.config import Settings +from atlasbot.knowledge.loader import KnowledgeBase +from atlasbot.llm import prompts +from atlasbot.llm.client import LLMClient, build_messages +from atlasbot.snapshot.builder import SnapshotProvider +from atlasbot.state.store import ClaimStore + +from ._base import * +from .common import * +from .factsheet import * +from .post import * +from .post_ext import * +from .retrieval import * +from .retrieval_ext import * +from .spine import * +from .workflow import run_answer + + +class AnswerEngine: + """Coordinate Atlas question answering across snapshots, KB, and LLMs. + + Why: + - keep the public answer surface in one place while the retrieval and + post-processing helpers stay split across smaller modules. 
+ """ + + def __init__(self, settings: Settings, llm: LLMClient, kb: KnowledgeBase, snapshot: SnapshotProvider) -> None: + self._settings = settings + self._llm = llm + self._kb = kb + self._snapshot = snapshot + self._store = ClaimStore(settings.state_db_path, settings.conversation_ttl_sec) + + async def answer(self, question: str, *, mode: str, history: list[dict[str, str]] | None = None, observer: Callable[[str, str], None] | None = None, conversation_id: str | None = None, snapshot_pin: bool | None = None) -> AnswerResult: + """Answer a question by delegating to the staged workflow.""" + + return await run_answer( + self, + question, + mode=mode, + history=history, + observer=observer, + conversation_id=conversation_id, + snapshot_pin=snapshot_pin, + ) + + async def _answer_stock(self, question: str) -> AnswerResult: + messages = build_messages(prompts.STOCK_SYSTEM, question) + reply = await self._llm.chat(messages, model=self._settings.ollama_model) + return AnswerResult(reply, _default_scores(), {"mode": "stock"}) + + async def _synthesize_answer(self, question: str, subanswers: list[str], context: str, classify: dict[str, Any], plan: ModePlan, call_llm: Callable[..., Any]) -> str: + style_hint = _style_hint(classify) + if not subanswers: + prompt = ( + prompts.SYNTHESIZE_PROMPT + + "\nQuestion: " + + question + + "\nStyle: " + + style_hint + + "\nQuestionType: " + + (classify.get("question_type") or "unknown") + ) + return await call_llm(prompts.SYNTHESIZE_SYSTEM, prompt, context=context, model=plan.model, tag="synth") + draft_prompts = [] + for idx in range(plan.drafts): + draft_prompts.append( + prompts.SYNTHESIZE_PROMPT + + "\nQuestion: " + + question + + "\nStyle: " + + style_hint + + "\nQuestionType: " + + (classify.get("question_type") or "unknown") + + "\nSubanswers:\n" + + "\n".join([f"- {item}" for item in subanswers]) + + f"\nDraftIndex: {idx + 1}" + ) + drafts: list[str] = [] + if plan.parallelism > 1 and len(draft_prompts) > 1: + drafts = await _gather_limited( + [ + call_llm( + prompts.SYNTHESIZE_SYSTEM, + prompt, + context=context, + model=plan.model, + tag="synth", + ) + for prompt in draft_prompts + ], + plan.parallelism, + ) + else: + for prompt in draft_prompts: + drafts.append( + await call_llm( + prompts.SYNTHESIZE_SYSTEM, + prompt, + context=context, + model=plan.model, + tag="synth", + ) + ) + if len(drafts) == 1: + return drafts[0] + select_prompt = ( + prompts.DRAFT_SELECT_PROMPT + + "\nQuestion: " + + question + + "\nDrafts:\n" + + "\n\n".join([f"Draft {idx + 1}: {text}" for idx, text in enumerate(drafts)]) + ) + select_raw = await call_llm(prompts.CRITIC_SYSTEM, select_prompt, context=context, model=plan.fast_model, tag="draft_select") + selection = _parse_json_block(select_raw, fallback={}) + idx = int(selection.get("best", 1)) - 1 + if 0 <= idx < len(drafts): + return drafts[idx] + return drafts[0] + + async def _score_answer(self, question: str, reply: str, plan: ModePlan, call_llm: Callable[..., Any]) -> AnswerScores: + if not plan.use_scores: + return _default_scores() + prompt = prompts.SCORE_PROMPT + "\nQuestion: " + question + "\nAnswer: " + reply + raw = await call_llm(prompts.SCORE_SYSTEM, prompt, model=plan.fast_model, tag="score") + data = _parse_json_block(raw, fallback={}) + return _scores_from_json(data) + + async def _extract_claims(self, question: str, reply: str, summary: dict[str, Any], facts_used: list[str], call_llm: Callable[..., Any]) -> list[ClaimItem]: + if not reply or not summary: + return [] + summary_json = 
_json_excerpt(summary) + facts_used = [line.strip() for line in (facts_used or []) if line and line.strip()] + facts_block = "" + if facts_used: + facts_block = "\nFactsUsed:\n" + "\n".join([f"- {line}" for line in facts_used[:12]]) + prompt = prompts.CLAIM_MAP_PROMPT + "\nQuestion: " + question + "\nAnswer: " + reply + facts_block + raw = await call_llm( + prompts.CLAIM_SYSTEM, + prompt, + context=f"SnapshotSummaryJson:{summary_json}", + model=self._settings.ollama_model_fast, + tag="claim_map", + ) + data = _parse_json_block(raw, fallback={}) + claims_raw = data.get("claims") if isinstance(data, dict) else None + claims: list[ClaimItem] = [] + if isinstance(claims_raw, list): + for entry in claims_raw: + if not isinstance(entry, dict): + continue + claim_text = str(entry.get("claim") or "").strip() + claim_id = str(entry.get("id") or "").strip() or f"c{len(claims)+1}" + evidence_items: list[EvidenceItem] = [] + for ev in entry.get("evidence") or []: + if not isinstance(ev, dict): + continue + path = str(ev.get("path") or "").strip() + if not path: + continue + reason = str(ev.get("reason") or "").strip() + value = _resolve_path(summary, path) + evidence_items.append(EvidenceItem(path=path, reason=reason, value=value, value_at_claim=value)) + if claim_text and evidence_items: + claims.append(ClaimItem(id=claim_id, claim=claim_text, evidence=evidence_items)) + return claims + + async def _dedup_reply(self, reply: str, plan: ModePlan, call_llm: Callable[..., Any], tag: str) -> str: + if not _needs_dedup(reply): + return reply + dedup_prompt = prompts.DEDUP_PROMPT + "\nDraft: " + reply + return await call_llm(prompts.DEDUP_SYSTEM, dedup_prompt, model=plan.fast_model, tag=tag) + + async def _answer_followup(self, question: str, state: ConversationState, summary: dict[str, Any], classify: dict[str, Any], plan: ModePlan, call_llm: Callable[..., Any]) -> str: # noqa: C901, ARG002 + claim_ids = await self._select_claims(question, state.claims, plan, call_llm) + selected = [claim for claim in state.claims if claim.id in claim_ids] if claim_ids else state.claims[:2] + evidence_lines = [] + lowered = question.lower() + for claim in selected: + evidence_lines.append(f"Claim: {claim.claim}") + for ev in claim.evidence: + current = _resolve_path(summary, ev.path) + ev.value = current + delta_note = "" + if ev.value_at_claim is not None and current is not None and current != ev.value_at_claim: + delta_note = f" (now {current})" + evidence_lines.append(f"- {ev.path}: {ev.value_at_claim}{delta_note}") + if any(term in lowered for term in ("hotspot", "hot spot", "hottest", "jetson", "rpi", "amd64", "arm64", "hardware", "class")): + hotspot_lines = _hotspot_evidence(summary) + if hotspot_lines: + evidence_lines.append("HotspotSummary:") + evidence_lines.extend(hotspot_lines) + evidence_ctx = "\n".join(evidence_lines) + prompt = prompts.FOLLOWUP_PROMPT + "\nFollow-up: " + question + "\nEvidence:\n" + evidence_ctx + reply = await call_llm(prompts.FOLLOWUP_SYSTEM, prompt, model=plan.model, tag="followup") + allowed_nodes = _allowed_nodes(summary) + allowed_namespaces = _allowed_namespaces(summary) + unknown_nodes = _find_unknown_nodes(reply, allowed_nodes) + unknown_namespaces = _find_unknown_namespaces(reply, allowed_namespaces) + extra_bits = [] + if unknown_nodes: + extra_bits.append("UnknownNodes: " + ", ".join(sorted(unknown_nodes))) + if unknown_namespaces: + extra_bits.append("UnknownNamespaces: " + ", ".join(sorted(unknown_namespaces))) + if allowed_nodes: + extra_bits.append("AllowedNodes: " + ", 
".join(allowed_nodes)) + if allowed_namespaces: + extra_bits.append("AllowedNamespaces: " + ", ".join(allowed_namespaces)) + if extra_bits: + fix_prompt = ( + prompts.EVIDENCE_FIX_PROMPT + + "\nQuestion: " + + question + + "\nDraft: " + + reply + + "\n" + + "\n".join(extra_bits) + ) + reply = await call_llm( + prompts.EVIDENCE_FIX_SYSTEM, + fix_prompt, + context="Evidence:\n" + evidence_ctx, + model=plan.model, + tag="followup_fix", + ) + reply = await self._dedup_reply(reply, plan, call_llm, tag="dedup_followup") + reply = _strip_followup_meta(reply) + return reply + + async def _select_claims(self, question: str, claims: list[ClaimItem], plan: ModePlan, call_llm: Callable[..., Any]) -> list[str]: + if not claims: + return [] + claims_brief = [{"id": claim.id, "claim": claim.claim} for claim in claims] + prompt = prompts.SELECT_CLAIMS_PROMPT + "\nFollow-up: " + question + "\nClaims: " + json.dumps(claims_brief) + raw = await call_llm(prompts.FOLLOWUP_SYSTEM, prompt, model=plan.fast_model, tag="select_claims") + data = _parse_json_block(raw, fallback={}) + ids = data.get("claim_ids") if isinstance(data, dict) else [] + if isinstance(ids, list): + return [str(item) for item in ids if item] + return [] + + def _get_state(self, conversation_id: str | None) -> ConversationState | None: + if not conversation_id: + return None + state_payload = self._store.get(conversation_id) + return _state_from_payload(state_payload) if state_payload else None + + def _store_state(self, conversation_id: str, claims: list[ClaimItem], summary: dict[str, Any], snapshot: dict[str, Any] | None, pin_snapshot: bool) -> None: + snapshot_id = _snapshot_id(summary) + pinned_snapshot = snapshot if pin_snapshot else None + payload = { + "updated_at": time.monotonic(), + "claims": _claims_to_payload(claims), + "snapshot_id": snapshot_id, + "snapshot": pinned_snapshot, + } + self._store.set(conversation_id, payload) + + def _cleanup_state(self) -> None: + self._store.cleanup() diff --git a/atlasbot/engine/answerer/factsheet.py b/atlasbot/engine/answerer/factsheet.py new file mode 100644 index 0000000..089b955 --- /dev/null +++ b/atlasbot/engine/answerer/factsheet.py @@ -0,0 +1,189 @@ +from __future__ import annotations + +import json +import re +from typing import Any + +from ._base import * + +MAX_FACT_LINE_CHARS = 180 +MAX_KB_LINE_CHARS = 220 + + +def _factsheet_kb_chars(mode: str, default_chars: int) -> int: + if mode == "genius": + return min(max(default_chars, 4000), 6000) + if mode == "smart": + return min(max(default_chars, 3000), 4500) + return max(1200, default_chars) + + +def _factsheet_line_limit(mode: str) -> int: + if mode == "genius": + return 30 + if mode == "smart": + return 22 + return 14 + + +def _factsheet_instruction(mode: str) -> str: + if mode == "genius": + return ( + "Start with a direct conclusion, then include the strongest supporting facts and one caveat. " + "Keep it to 4-8 sentences. If data is missing, name the missing metric explicitly." + ) + if mode == "smart": + return ( + "Start with a direct conclusion and support it with key facts. Keep it to 2-5 sentences. " + "If data is missing, say exactly what is missing and suggest atlas-genius." + ) + return "Keep it to 1-3 sentences. If key data is missing, say what is missing and suggest atlas-smart." 
+ + +def _factsheet_model(mode: str, plan: ModePlan) -> str: + if mode in {"quick", "fast"}: + return plan.fast_model + return plan.model + + +def _is_plain_math_question(question: str) -> bool: + lowered = question.lower().strip() + if not lowered: + return False + cluster_markers = ( + "titan", + "atlas", + "cluster", + "node", + "pod", + "namespace", + "workload", + "grafana", + "alert", + "k8s", + "kubernetes", + "rpi", + "longhorn", + "postgres", + "victoria", + "ollama", + ) + if any(token in lowered for token in cluster_markers): + return False + return bool( + re.fullmatch(r"[0-9\s+\-*/().=]+", lowered) + or re.search(r"\bwhat(?:'s| is)\s+\d+\s*[-+*/]\s*\d+\b", lowered) + ) + + +def _quick_fact_sheet_lines(question: str, summary_lines: list[str], kb_lines: list[str], *, limit: int) -> list[str]: # noqa: C901 + tokens = { + token + for token in re.findall(r"[a-z0-9][a-z0-9_-]{2,}", question.lower()) + if token not in GENERIC_METRIC_TOKENS + } + priority_markers = ( + "snapshot:", + "nodes_total", + "nodes_ready", + "nodes_not_ready", + "workers_ready", + "workers_not_ready", + "control_plane", + "worker_nodes", + "hottest", + "postgres", + "pods", + "longhorn", + "titan-", + "rpi5", + "rpi4", + "jetson", + "amd64", + ) + scored: list[tuple[int, str]] = [] + for raw in summary_lines: + line = raw.strip() + if not line: + continue + lowered = line.lower() + score = 0 + if any(marker in lowered for marker in priority_markers): + score += 4 + overlap = sum(1 for token in tokens if token in lowered) + score += overlap * 3 + if len(line) <= MAX_FACT_LINE_CHARS: + score += 1 + if score > 0: + scored.append((score, line)) + + scored.sort(key=lambda item: item[0], reverse=True) + selected = [line for _, line in scored[:limit]] + if not selected: + selected = [line.strip() for line in summary_lines if line.strip()][:limit] + + kb_selected: list[str] = [] + for raw in kb_lines: + line = raw.strip() + if not line or len(line) > MAX_KB_LINE_CHARS: + continue + lowered = line.lower() + if "kb file:" in lowered or "kb: atlas.json" in lowered: + continue + overlap = sum(1 for token in tokens if token in lowered) + if overlap > 0 or any(marker in lowered for marker in ("runbook", "titan-", "rpi5", "rpi4", "amd64", "jetson")): + kb_selected.append(line) + if len(kb_selected) >= max(4, limit // 3): + break + + merged = [] + seen: set[str] = set() + for line in selected + kb_selected: + if line not in seen: + seen.add(line) + merged.append(line) + if len(merged) >= limit: + break + return merged + + +def _quick_fact_sheet_text(lines: list[str]) -> str: + if not lines: + return "Fact Sheet:\n- No snapshot facts available." + body = "\n".join([f"- {line}" for line in lines]) + return "Fact Sheet:\n" + body + + +def _quick_fact_sheet_heuristic_answer(question: str, fact_lines: list[str]) -> str: + lowered = question.lower() + if ( + any(token in lowered for token in ("placement", "schedule", "last resort", "last-resort")) + and any(token in lowered for token in ("node", "workload", "worker", "titan")) + ): + return ( + "General workload placement is: prefer rpi5 workers first, then rpi4 workers. " + "titan-22 is the last-resort general compute node, and titan-24 is the absolute last resort " + "reserved for heavy one-offs." 
+ ) + + for line in fact_lines: + compact = line.replace(" ", "") + match = re.search(r"nodes_total[:=](\d+),ready[:=](\d+),not_ready[:=](\d+)", compact) + if not match: + continue + total = match.group(1) + ready = match.group(2) + not_ready = match.group(3) + if "how many" in lowered and "ready" in lowered and "node" in lowered: + return f"The latest snapshot shows {ready} ready nodes out of {total} total ({not_ready} not ready)." + if ("not ready" in lowered or "unready" in lowered) and "node" in lowered: + return f"The latest snapshot shows {not_ready} not-ready nodes ({ready} ready out of {total} total)." + return "" + + +def _json_excerpt(summary: dict[str, Any], max_chars: int = 12000) -> str: + raw = json.dumps(summary, ensure_ascii=False) + return raw[:max_chars] + + +__all__ = [name for name in globals() if name.startswith("_") and not name.startswith("__")] diff --git a/atlasbot/engine/answerer/post.py b/atlasbot/engine/answerer/post.py new file mode 100644 index 0000000..a3d90f7 --- /dev/null +++ b/atlasbot/engine/answerer/post.py @@ -0,0 +1,459 @@ +from __future__ import annotations + +import re +from typing import Any + +from atlasbot.llm import prompts +from atlasbot.llm.client import parse_json + +from ._base import * +from .retrieval_ext import _dedupe_lines + + +def _merge_fact_lines(primary: list[str], fallback: list[str]) -> list[str]: + merged: list[str] = [] + for line in primary + fallback: + value = (line or "").strip() + if value and value not in merged: + merged.append(value) + return merged + + +def _strip_unknown_entities(reply: str, unknown_nodes: list[str], unknown_namespaces: list[str]) -> str: + if not reply: + return reply + if not unknown_nodes and not unknown_namespaces: + return reply + sentences = [s.strip() for s in re.split(r"(?<=[.!?])\s+", reply) if s.strip()] + if not sentences: + return reply + lowered_nodes = [node.lower() for node in unknown_nodes] + lowered_namespaces = [ns.lower() for ns in unknown_namespaces] + kept: list[str] = [] + for sent in sentences: + lower = sent.lower() + if lowered_nodes and any(node in lower for node in lowered_nodes): + continue + if lowered_namespaces and any(f"namespace {ns}" in lower for ns in lowered_namespaces): + continue + kept.append(sent) + cleaned = " ".join(kept).strip() + return cleaned or reply + + +def _needs_evidence_guard(reply: str, facts: list[str]) -> bool: + if not reply or not facts: + return False + lower_reply = reply.lower() + fact_text = " ".join(facts).lower() + node_pattern = re.compile(r"\b(titan-[0-9a-z]+|node-?\d+)\b", re.IGNORECASE) + nodes = {m.group(1).lower() for m in node_pattern.finditer(reply)} + if nodes: + missing = [node for node in nodes if node not in fact_text] + if missing: + return True + pressure_terms = ("pressure", "diskpressure", "memorypressure", "pidpressure", "headroom") + if any(term in lower_reply for term in pressure_terms) and not any(term in fact_text for term in pressure_terms): + return True + arch_terms = ("amd64", "arm64", "rpi", "rpi4", "rpi5", "jetson") + return any(term in lower_reply for term in arch_terms) and not any(term in fact_text for term in arch_terms) + + +async def _contradiction_decision(ctx: ContradictionContext, attempts: int = 1) -> dict[str, Any]: + best = {"use_facts": True, "confidence": 50} + facts_block = "\n".join(ctx.facts[:12]) + for idx in range(max(1, attempts)): + variant = f"Variant: {idx + 1}" if attempts > 1 else "" + prompt = ( + prompts.CONTRADICTION_PROMPT.format(question=ctx.question, draft=ctx.reply, 
facts=facts_block) + + ("\n" + variant if variant else "") + ) + raw = await ctx.call_llm( + prompts.CONTRADICTION_SYSTEM, + prompt, + model=ctx.plan.fast_model, + tag="contradiction", + ) + data = _parse_json_block(raw, fallback={}) + try: + confidence = int(data.get("confidence", 50)) + except Exception: + confidence = 50 + use_facts = bool(data.get("use_facts", True)) + if confidence >= best.get("confidence", 0): + best = {"use_facts": use_facts, "confidence": confidence} + return best + + +def _filter_lines_by_keywords(lines: list[str], keywords: list[str], max_lines: int) -> list[str]: + if not lines: + return [] + tokens = _expand_tokens(keywords) + if not tokens: + return lines[:max_lines] + filtered = [line for line in lines if any(tok in line.lower() for tok in tokens)] + return (filtered or lines)[:max_lines] + + +def _rank_metric_lines(lines: list[str], tokens: set[str], max_lines: int) -> list[str]: + if not lines or not tokens: + return [] + ranked: list[tuple[int, int, str]] = [] + for line in lines: + lower = line.lower() + hits = sum(1 for tok in tokens if tok in lower) + if not hits: + continue + has_number = 1 if re.search(r"\d", line) else 0 + ranked.append((has_number, hits, line)) + ranked.sort(key=lambda item: (-item[0], -item[1], item[2])) + return [item[2] for item in ranked[:max_lines]] + + +def _select_metric_line(lines: list[str], question: str, tokens: list[str] | set[str]) -> str | None: + if not lines or not tokens: + return None + token_set = {str(tok).lower() for tok in tokens if tok} + ranked = _rank_metric_lines(lines, token_set, max_lines=6) + if not ranked: + return None + question_lower = (question or "").lower() + if any(term in question_lower for term in ("how many", "count", "total")): + for line in ranked: + lower = line.lower() + if "total" in lower or "count" in lower: + return line + return ranked[0] + + +def _format_direct_metric_line(line: str) -> str: + if not line: + return "" + if ":" in line: + formatted = _format_colon_metric(line) + if formatted: + return formatted + if "=" in line: + formatted = _format_equals_metric(line) + if formatted: + return formatted + return line + + +def _format_colon_metric(line: str) -> str | None: + key, value = line.split(":", 1) + key = key.strip().replace("_", " ") + value = value.strip() + if not value: + return None + if key == "nodes": + formatted = _format_nodes_value(value) + if formatted: + return formatted + if key in {"nodes total", "nodes_total"}: + return f"Atlas has {value} total nodes." + return f"{key} is {value}." + + +def _format_equals_metric(line: str) -> str | None: + pairs: list[str] = [] + for part in line.split(","): + if "=" not in part: + continue + key, value = part.split("=", 1) + key = key.strip().replace("_", " ") + value = value.strip() + if not value: + continue + if key in {"nodes total", "nodes_total"}: + return f"Atlas has {value} total nodes." + pairs.append(f"{key} is {value}") + if not pairs: + return None + if len(pairs) == 1: + return f"{pairs[0]}." + return "; ".join(pairs) + "." + + +def _format_nodes_value(value: str) -> str | None: + parts = [p.strip() for p in value.split(",") if p.strip()] + total = None + rest: list[str] = [] + for part in parts: + if part.startswith("total="): + total = part.split("=", 1)[1] + else: + rest.append(part.replace("_", " ")) + if not total: + return None + if rest: + return f"Atlas has {total} total nodes ({'; '.join(rest)})." + return f"Atlas has {total} total nodes." 
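The formatter chain above turns terse snapshot metric lines into sentences. A usage sketch, assuming the import path this patch introduces (the sample metric lines are made up):

    from atlasbot.engine.answerer.post import _format_direct_metric_line

    print(_format_direct_metric_line("nodes: total=12, ready=11, not_ready=1"))
    # Atlas has 12 total nodes (ready=11; not ready=1).
    print(_format_direct_metric_line("workers_ready=9, workers_not_ready=1"))
    # workers ready is 9; workers not ready is 1.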
+ + +def _global_facts(lines: list[str]) -> list[str]: + if not lines: + return [] + wanted = ("nodes_total", "nodes_ready", "cluster_name", "cluster", "nodes_not_ready") + facts: list[str] = [] + for line in lines: + lower = line.lower() + if any(key in lower for key in wanted): + facts.append(line) + return _dedupe_lines(facts, limit=6) + + +def _has_keyword_overlap(lines: list[str], keywords: list[str]) -> bool: + if not lines or not keywords: + return False + tokens = _expand_tokens(keywords) + if not tokens: + return False + for line in lines: + lower = line.lower() + if any(tok in lower for tok in tokens): + return True + return False + + +def _merge_tokens(primary: list[str], secondary: list[str], third: list[str] | None = None) -> list[str]: + merged: list[str] = [] + for token in primary + secondary + (third or []): + if not token: + continue + if token not in merged: + merged.append(token) + return merged + + +def _extract_question_tokens(question: str) -> list[str]: + if not question: + return [] + tokens: list[str] = [] + for part in re.split(r"[^a-zA-Z0-9_-]+", question.lower()): + if len(part) < TOKEN_MIN_LEN: + continue + if part not in tokens: + tokens.append(part) + return tokens + + +def _expand_tokens(tokens: list[str]) -> list[str]: + if not tokens: + return [] + expanded: list[str] = [] + for token in tokens: + if not isinstance(token, str): + continue + for part in re.split(r"[^a-zA-Z0-9_-]+", token.lower()): + if len(part) < TOKEN_MIN_LEN: + continue + if part not in expanded: + expanded.append(part) + return expanded + + +def _ensure_token_coverage(lines: list[str], tokens: list[str], summary_lines: list[str], max_add: int = 4) -> list[str]: + if not lines or not tokens or not summary_lines: + return lines + hay = " ".join(lines).lower() + missing = [tok for tok in tokens if tok and tok.lower() not in hay] + if not missing: + return lines + added: list[str] = [] + for token in missing: + token_lower = token.lower() + for line in summary_lines: + if token_lower in line.lower() and line not in lines and line not in added: + added.append(line) + break + if len(added) >= max_add: + break + if not added: + return lines + return _merge_fact_lines(added, lines) + + +def _best_keyword_line(lines: list[str], keywords: list[str]) -> str | None: + if not lines or not keywords: + return None + tokens = _expand_tokens(keywords) + if not tokens: + return None + best = None + best_score = 0 + for line in lines: + lower = line.lower() + score = sum(1 for tok in tokens if tok in lower) + if score > best_score: + best_score = score + best = line + return best if best_score > 0 else None + + +def _line_starting_with(lines: list[str], prefix: str) -> str | None: + if not lines or not prefix: + return None + lower_prefix = prefix.lower() + for line in lines: + if str(line).lower().startswith(lower_prefix): + return line + return None + + +def _non_rpi_nodes(summary: dict[str, Any]) -> dict[str, list[str]]: + hardware = summary.get("hardware_by_node") if isinstance(summary, dict) else None + if not isinstance(hardware, dict): + return {} + grouped: dict[str, list[str]] = {} + for node, hw in hardware.items(): + if not isinstance(node, str) or not isinstance(hw, str): + continue + if hw.startswith("rpi"): + continue + grouped.setdefault(hw, []).append(node) + for nodes in grouped.values(): + nodes.sort() + return grouped + + +def _format_hardware_groups(groups: dict[str, list[str]], label: str) -> str: + if not groups: + return "" + parts = [] + for hw, nodes in sorted(groups.items()): + 
parts.append(f"{hw} ({', '.join(nodes)})") + return f"{label}: " + "; ".join(parts) + "." + + +def _lexicon_context(summary: dict[str, Any]) -> str: # noqa: C901 + if not isinstance(summary, dict): + return "" + lexicon = summary.get("lexicon") + if not isinstance(lexicon, dict): + return "" + terms = lexicon.get("terms") + aliases = lexicon.get("aliases") + lines: list[str] = [] + if isinstance(terms, list): + for entry in terms[:8]: + if not isinstance(entry, dict): + continue + term = entry.get("term") + meaning = entry.get("meaning") + if term and meaning: + lines.append(f"{term}: {meaning}") + if isinstance(aliases, dict): + for key, value in list(aliases.items())[:6]: + if key and value: + lines.append(f"alias {key} -> {value}") + if not lines: + return "" + return "Lexicon:\n" + "\n".join(lines) + + +def _parse_json_block(text: str, *, fallback: dict[str, Any]) -> dict[str, Any]: + raw = text.strip() + match = re.search(r"\{.*\}", raw, flags=re.S) + if match: + return parse_json(match.group(0), fallback=fallback) + return parse_json(raw, fallback=fallback) + + +def _parse_json_list(text: str) -> list[dict[str, Any]]: + raw = text.strip() + match = re.search(r"\[.*\]", raw, flags=re.S) + data = parse_json(match.group(0), fallback={}) if match else parse_json(raw, fallback={}) + if isinstance(data, list): + return [entry for entry in data if isinstance(entry, dict)] + return [] + + +def _scores_from_json(data: dict[str, Any]) -> AnswerScores: + return AnswerScores( + confidence=_coerce_int(data.get("confidence"), 60), + relevance=_coerce_int(data.get("relevance"), 60), + satisfaction=_coerce_int(data.get("satisfaction"), 60), + hallucination_risk=str(data.get("hallucination_risk") or "medium"), + ) + + +def _coerce_int(value: Any, default: int) -> int: + try: + return int(float(value)) + except (TypeError, ValueError): + return default + + +def _default_scores() -> AnswerScores: + return AnswerScores(confidence=60, relevance=60, satisfaction=60, hallucination_risk="medium") + + +def _style_hint(classify: dict[str, Any]) -> str: + style = (classify.get("answer_style") or "").strip().lower() + qtype = (classify.get("question_type") or "").strip().lower() + if style == "insightful" or qtype in {"open_ended", "planning"}: + return "insightful" + return "direct" + + +def _needs_evidence_fix(reply: str, classify: dict[str, Any]) -> bool: + if not reply: + return False + lowered = reply.lower() + missing_markers = ( + "don't have", + "do not have", + "don't know", + "cannot", + "can't", + "need to", + "would need", + "does not provide", + "does not mention", + "not mention", + "not provided", + "not in context", + "not referenced", + "missing", + "no specific", + "no information", + ) + if classify.get("needs_snapshot") and any(marker in lowered for marker in missing_markers): + return True + return classify.get("question_type") in {"metric", "diagnostic"} and not re.search(r"\d", reply) + + +def _should_use_insight_guard(classify: dict[str, Any]) -> bool: + style = (classify.get("answer_style") or "").strip().lower() + qtype = (classify.get("question_type") or "").strip().lower() + return style == "insightful" or qtype in {"open_ended", "planning"} + + +async def _apply_insight_guard(inputs: InsightGuardInput) -> str: + if not inputs.reply or not _should_use_insight_guard(inputs.classify): + return inputs.reply + guard_prompt = prompts.INSIGHT_GUARD_PROMPT.format(question=inputs.question, answer=inputs.reply) + guard_raw = await inputs.call_llm( + prompts.INSIGHT_GUARD_SYSTEM, + 
guard_prompt,
+        context=inputs.context,
+        model=inputs.plan.fast_model,
+        tag="insight_guard",
+    )
+    guard = _parse_json_block(guard_raw, fallback={})
+    if guard.get("ok") is True:
+        return inputs.reply
+    fix_prompt = prompts.INSIGHT_FIX_PROMPT.format(question=inputs.question, answer=inputs.reply)
+    if inputs.facts:
+        fix_prompt = fix_prompt + "\nFacts:\n" + "\n".join(inputs.facts[:6])
+    return await inputs.call_llm(
+        prompts.INSIGHT_FIX_SYSTEM,
+        fix_prompt,
+        context=inputs.context,
+        model=inputs.plan.model,
+        tag="insight_fix",
+    )
+
+
+__all__ = [name for name in globals() if name.startswith("_") and not name.startswith("__")]
diff --git a/atlasbot/engine/answerer/post_ext.py b/atlasbot/engine/answerer/post_ext.py
new file mode 100644
index 0000000..65f23d0
--- /dev/null
+++ b/atlasbot/engine/answerer/post_ext.py
@@ -0,0 +1,276 @@
+from __future__ import annotations
+
+import difflib
+import re
+import time
+from typing import Any
+
+from ._base import *
+
+
+def _reply_matches_metric_facts(reply: str, metric_facts: list[str], tokens: list[str] | set[str] | None = None) -> bool:
+    if not reply or not metric_facts:
+        return True
+    reply_numbers = set(re.findall(r"\d+(?:\.\d+)?", reply))
+    if not reply_numbers:
+        return False
+    fact_numbers: set[str] = set()
+    value_pattern = re.compile(r"(?:>=|<=|=|:)\s*(\d+(?:\.\d+)?)")
+    filtered = metric_facts
+    if tokens:
+        token_set = {str(tok).lower() for tok in tokens if tok}
+        focused = []
+        for line in metric_facts:
+            key = line.split(":", 1)[0].lower()
+            if any(tok in key for tok in token_set):
+                focused.append(line)
+        if focused:
+            filtered = focused
+    for line in filtered:
+        for match in value_pattern.findall(line):
+            fact_numbers.add(match)
+    if not fact_numbers:
+        return False
+    return bool(reply_numbers & fact_numbers)
+
+
+def _needs_dedup(reply: str) -> bool:
+    if not reply:
+        return False
+    sentences = [s.strip() for s in re.split(r"(?<=[.!?])\s+", reply) if s.strip()]
+    if len(sentences) < DEDUP_MIN_SENTENCES:
+        return False
+    seen = set()
+    for sent in sentences:
+        norm = re.sub(r"\s+", " ", sent.lower())
+        if norm in seen:
+            return True
+        seen.add(norm)
+    return False
+
+
+def _needs_focus_fix(question: str, reply: str, classify: dict[str, Any]) -> bool:
+    if not reply:
+        return False
+    q_lower = (question or "").lower()
+    if classify.get("question_type") not in {"metric", "diagnostic"} and not re.search(r"\b(how many|list|count)\b", q_lower):
+        return False
+    missing_markers = (
+        "does not provide",
+        "does not specify",
+        "not available",
+        "not provided",
+        "cannot determine",
+        "don't have",
+        "do not have",
+        "insufficient",
+        "no data",
+    )
+    if any(marker in reply.lower() for marker in missing_markers):
+        return True
+    if reply.count(".") <= 1:
+        return False
+    extra_markers = ("for more", "if you need", "additional", "based on")
+    return any(marker in reply.lower() for marker in extra_markers)
+
+
+def _extract_keywords(raw_question: str, normalized: str, sub_questions: list[str], keywords: list[Any] | None) -> list[str]:
+    stopwords = {
+        "the",
+        "and",
+        "for",
+        "with",
+        "that",
+        "this",
+        "what",
+        "which",
+        "when",
+        "where",
+        "who",
+        "why",
+        "how",
+        "tell",
+        "show",
+        "list",
+        "give",
+        "about",
+        "right",
+        "now",
+    }
+    tokens: list[str] = []
+    for source in [raw_question, normalized, *sub_questions]:
+        for part in re.split(r"[^a-zA-Z0-9_-]+", source.lower()):
+            if len(part) < TOKEN_MIN_LEN or part in stopwords:
+                continue
+            tokens.append(part)
+    if keywords:
+        for kw in keywords:
+            if isinstance(kw, 
str): + part = kw.strip().lower() + if part and part not in stopwords and part not in tokens: + tokens.append(part) + return list(dict.fromkeys(tokens))[:12] + + +def _allowed_nodes(summary: dict[str, Any]) -> list[str]: + hardware = summary.get("hardware_by_node") if isinstance(summary.get("hardware_by_node"), dict) else {} + if hardware: + return sorted([node for node in hardware if isinstance(node, str)]) + return [] + + +def _allowed_namespaces(summary: dict[str, Any]) -> list[str]: + namespaces: list[str] = [] + for entry in summary.get("namespace_pods") or []: + if isinstance(entry, dict): + name = entry.get("namespace") + if name: + namespaces.append(str(name)) + return sorted(set(namespaces)) + + +def _find_unknown_nodes(reply: str, allowed: list[str]) -> list[str]: + if not reply or not allowed: + return [] + pattern = re.compile(r"\b(titan-[0-9a-z]+|node-?\d+)\b", re.IGNORECASE) + found = {m.group(1) for m in pattern.finditer(reply)} + if not found: + return [] + allowed_set = {a.lower() for a in allowed} + return sorted({item for item in found if item.lower() not in allowed_set}) + + +def _find_unknown_namespaces(reply: str, allowed: list[str]) -> list[str]: + if not reply or not allowed: + return [] + pattern = re.compile(r"\bnamespace\s+([a-z0-9-]+)\b", re.IGNORECASE) + found = {m.group(1) for m in pattern.finditer(reply)} + if not found: + return [] + allowed_set = {a.lower() for a in allowed} + return sorted({item for item in found if item.lower() not in allowed_set}) + + +def _needs_runbook_fix(reply: str, allowed: list[str]) -> bool: + if not reply or not allowed: + return False + paths = set(re.findall(r"runbooks/[A-Za-z0-9._-]+", reply)) + if not paths: + return False + allowed_set = {p.lower() for p in allowed} + return any(path.lower() not in allowed_set for path in paths) + + +def _needs_runbook_reference(question: str, allowed: list[str], reply: str) -> bool: + if not allowed or not question: + return False + lowered = question.lower() + cues = ("runbook", "checklist", "documented", "documentation", "where", "guide") + if not any(cue in lowered for cue in cues): + return False + if not reply: + return True + for token in re.findall(r"runbooks/[A-Za-z0-9._-]+", reply): + if token.lower() in {p.lower() for p in allowed}: + return False + return True + + +def _best_runbook_match(candidate: str, allowed: list[str]) -> str | None: + if not candidate or not allowed: + return None + best = None + best_score = 0.0 + for path in allowed: + score = difflib.SequenceMatcher(a=candidate.lower(), b=path.lower()).ratio() + if score > best_score: + best_score = score + best = path + return best if best_score >= RUNBOOK_SIMILARITY_THRESHOLD else None + + +def _resolve_path(data: Any, path: str) -> Any | None: + if path.startswith("line:"): + return path.split("line:", 1)[1].strip() + cursor = data + for part in re.split(r"\.(?![^\[]*\])", path): + if not part: + continue + match = re.match(r"^(\w+)(?:\[(\d+)\])?$", part) + if not match: + return None + key = match.group(1) + index = match.group(2) + if isinstance(cursor, dict): + cursor = cursor.get(key) + else: + return None + if index is not None: + idx = int(index) + if isinstance(cursor, list) and 0 <= idx < len(cursor): + cursor = cursor[idx] + else: + return None + return cursor + + +def _snapshot_id(summary: dict[str, Any]) -> str | None: + if not summary: + return None + for key in ("generated_at", "snapshot_ts", "snapshot_id"): + value = summary.get(key) + if isinstance(value, str) and value: + return value + return None + + 
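`_resolve_path` is the small path language the claim evidence uses: dotted keys, an optional `[index]`, and a literal `line:` escape. A usage sketch against a toy summary (the data is illustrative; the import path is the one this patch introduces):

    from atlasbot.engine.answerer.post_ext import _resolve_path

    summary = {"nodes": {"ready": 11}, "namespace_pods": [{"namespace": "media", "pods": 7}]}
    print(_resolve_path(summary, "nodes.ready"))             # 11
    print(_resolve_path(summary, "namespace_pods[0].pods"))  # 7
    print(_resolve_path(summary, "line: nodes_ready: 11"))   # nodes_ready: 11 (literal passthrough)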
+def _claims_to_payload(claims: list[ClaimItem]) -> list[dict[str, Any]]: + output: list[dict[str, Any]] = [] + for claim in claims: + evidence = [] + for ev in claim.evidence: + evidence.append( + { + "path": ev.path, + "reason": ev.reason, + "value_at_claim": ev.value_at_claim, + } + ) + output.append({"id": claim.id, "claim": claim.claim, "evidence": evidence}) + return output + + +def _state_from_payload(payload: dict[str, Any] | None) -> ConversationState | None: + if not payload: + return None + claims_raw = payload.get("claims") if isinstance(payload, dict) else None + claims: list[ClaimItem] = [] + if isinstance(claims_raw, list): + for entry in claims_raw: + if not isinstance(entry, dict): + continue + claim_text = str(entry.get("claim") or "").strip() + claim_id = str(entry.get("id") or "").strip() + if not claim_text or not claim_id: + continue + evidence_items: list[EvidenceItem] = [] + for ev in entry.get("evidence") or []: + if not isinstance(ev, dict): + continue + path = str(ev.get("path") or "").strip() + if not path: + continue + reason = str(ev.get("reason") or "").strip() + value_at_claim = ev.get("value_at_claim") + evidence_items.append(EvidenceItem(path=path, reason=reason, value_at_claim=value_at_claim)) + if evidence_items: + claims.append(ClaimItem(id=claim_id, claim=claim_text, evidence=evidence_items)) + return ConversationState( + updated_at=float(payload.get("updated_at") or time.monotonic()), + claims=claims, + snapshot_id=payload.get("snapshot_id"), + snapshot=payload.get("snapshot"), + ) + + +__all__ = [name for name in globals() if name.startswith("_") and not name.startswith("__")] diff --git a/atlasbot/engine/answerer/retrieval.py b/atlasbot/engine/answerer/retrieval.py new file mode 100644 index 0000000..1f01911 --- /dev/null +++ b/atlasbot/engine/answerer/retrieval.py @@ -0,0 +1,344 @@ +from __future__ import annotations + +import asyncio +import json +import re +from collections.abc import Awaitable +from collections.abc import Callable +from typing import Any + +from atlasbot.llm import prompts +from atlasbot.llm.client import parse_json + +from ._base import * +from .post_ext import _extract_keywords + + +def _parse_json_block(text: str, *, fallback: dict[str, Any]) -> dict[str, Any]: + raw = text.strip() + match = re.search(r"\{.*\}", raw, flags=re.S) + if match: + return parse_json(match.group(0), fallback=fallback) + return parse_json(raw, fallback=fallback) + + +async def _select_metric_chunks( + call_llm: Callable[..., Awaitable[str]], + ctx: dict[str, Any], + chunks: list[dict[str, Any]], + plan: ModePlan, +) -> tuple[list[str], list[str]]: + summary_lines, question, sub_questions, keywords, token_set = _metric_ctx_values(ctx) + if not summary_lines or not chunks: + return [], [] + keys = _extract_metric_keys(summary_lines) + if not keys: + return [], [] + max_keys = max(4, plan.max_subquestions * 2) + candidate_keys = _filter_metric_keys(keys, token_set) + available_keys = candidate_keys or keys + prompt = prompts.METRIC_KEYS_PROMPT.format(available="\n".join(available_keys), max_keys=max_keys) + raw = await call_llm( + prompts.METRIC_KEYS_SYSTEM, + prompt + "\nQuestion: " + str(question) + "\nSubQuestions:\n" + "\n".join([str(item) for item in sub_questions]), + context="Keywords:\n" + ", ".join([str(item) for item in keywords if item]), + model=plan.fast_model, + tag="metric_keys", + ) + selected = _parse_key_list(raw, available_keys, max_keys) + if candidate_keys: + selected = _merge_metric_keys(selected, candidate_keys, max_keys) + if 
selected and candidate_keys and not _metric_key_overlap(selected, token_set): + selected = candidate_keys[:max_keys] + if not selected and candidate_keys: + selected = candidate_keys[:max_keys] + if available_keys: + missing = await _validate_metric_keys( + call_llm, + { + "question": question, + "sub_questions": sub_questions, + "selected": selected, + }, + available_keys, + plan, + ) + if missing: + selected = _merge_metric_keys(selected, missing, max_keys) + if not selected: + return [], [] + ids = _chunk_ids_for_keys(chunks, selected) + return selected, ids + + +async def _validate_metric_keys( + call_llm: Callable[..., Awaitable[str]], + ctx: dict[str, Any], + available: list[str], + plan: ModePlan, +) -> list[str]: + if not available: + return [] + question = str(ctx.get("question") or "") + sub_questions = ctx.get("sub_questions") if isinstance(ctx.get("sub_questions"), list) else [] + selected = ctx.get("selected") if isinstance(ctx.get("selected"), list) else [] + cap = max(12, plan.max_subquestions * 4) + available_list = available[:cap] + prompt = prompts.METRIC_KEYS_VALIDATE_PROMPT.format( + question=question, + sub_questions=json.dumps(sub_questions), + selected=json.dumps(selected), + available="\n".join(available_list), + ) + raw = await call_llm( + prompts.METRIC_KEYS_VALIDATE_SYSTEM, + prompt, + model=plan.fast_model, + tag="metric_keys_validate", + ) + parsed = _parse_json_block(raw, fallback={}) + items = parsed.get("missing") if isinstance(parsed, dict) else [] + if not isinstance(items, list): + return [] + available_set = set(available_list) + out: list[str] = [] + for item in items: + if isinstance(item, str) and item in available_set and item not in out: + out.append(item) + return out + + +async def _gather_limited(coros: list[Awaitable[Any]], limit: int) -> list[Any]: + if not coros: + return [] + semaphore = asyncio.Semaphore(max(1, limit)) + + async def runner(coro: Awaitable[Any]) -> Any: + async with semaphore: + return await coro + + return await asyncio.gather(*(runner(coro) for coro in coros)) + + +def _metric_ctx_values(ctx: dict[str, Any]) -> tuple[list[str], str, list[str], list[str], set[str]]: + summary_lines = ctx.get("summary_lines") if isinstance(ctx, dict) else None + if not isinstance(summary_lines, list): + return [], "", [], [], set() + question = ctx.get("question") if isinstance(ctx, dict) else "" + sub_questions = ctx.get("sub_questions") if isinstance(ctx.get("sub_questions"), list) else [] + keywords = ctx.get("keywords") if isinstance(ctx.get("keywords"), list) else [] + keyword_tokens = ctx.get("keyword_tokens") if isinstance(ctx.get("keyword_tokens"), list) else [] + token_set = {str(token).lower() for token in keyword_tokens if token} + token_set |= {token.lower() for token in _extract_keywords(str(question), str(question), sub_questions=sub_questions, keywords=keywords)} + token_set = _token_variants(token_set) + return summary_lines, str(question), sub_questions, keywords, token_set + + +def _extract_metric_keys(lines: list[str]) -> list[str]: + keys: list[str] = [] + for line in lines: + if ":" not in line: + continue + key = line.split(":", 1)[0].strip() + if not key or " " in key: + continue + if key not in keys: + keys.append(key) + return keys + + +def _token_variants(tokens: set[str]) -> set[str]: + if not tokens: + return set() + variants = set(tokens) + for token in list(tokens): + if len(token) <= TOKEN_MIN_LEN: + continue + if token.endswith("ies") and len(token) > TOKEN_MIN_LEN: + variants.add(token[:-3] + "y") + if 
token.endswith("es") and len(token) > TOKEN_MIN_LEN:
+            variants.add(token[:-2])
+        if token.endswith("s") and len(token) > TOKEN_MIN_LEN:
+            variants.add(token[:-1])
+    return variants
+
+
+def _parse_key_list(raw: str, allowed: list[str], max_keys: int) -> list[str]:
+    parsed = _parse_json_block(raw, fallback={})
+    if isinstance(parsed, list):
+        items = parsed
+    else:
+        items = parsed.get("keys") if isinstance(parsed, dict) else []
+    if not isinstance(items, list):
+        return []
+    allowed_set = set(allowed)
+    out: list[str] = []
+    for item in items:
+        if not isinstance(item, str):
+            continue
+        if item in allowed_set and item not in out:
+            out.append(item)
+        if len(out) >= max_keys:
+            break
+    return out
+
+
+def _chunk_ids_for_keys(chunks: list[dict[str, Any]], keys: list[str]) -> list[str]:
+    if not keys:
+        return []
+    ids: list[str] = []
+    key_set = {f"{key}:" for key in keys}
+    for chunk in chunks:
+        text = str(chunk.get("text") or "")
+        if not text:
+            continue
+        for line in text.splitlines():
+            for key in key_set:
+                if line.startswith(key):
+                    cid = chunk.get("id")
+                    if cid and cid not in ids:
+                        ids.append(cid)
+                    break
+    return ids
+
+
+def _filter_metric_keys(keys: list[str], tokens: set[str]) -> list[str]:
+    if not keys or not tokens:
+        return []
+    lowered_tokens = {token.lower() for token in tokens if token and len(token) >= TOKEN_MIN_LEN}
+    ranked: list[tuple[int, str]] = []
+    for key in keys:
+        parts = [part for part in re.split(r"[_\W]+", key.lower()) if part]
+        if not parts:
+            continue
+        hits = len(set(parts) & lowered_tokens)
+        if hits:
+            ranked.append((hits, key))
+    ranked.sort(key=lambda item: (-item[0], item[1]))
+    return [item[1] for item in ranked]
+
+
+def _metric_key_overlap(keys: list[str], tokens: set[str]) -> bool:
+    if not keys or not tokens:
+        return False
+    lowered_tokens = {token.lower() for token in tokens if token and len(token) >= TOKEN_MIN_LEN}
+    for key in keys:
+        parts = [part for part in re.split(r"[_\W]+", key.lower()) if part]
+        if set(parts) & lowered_tokens:
+            return True
+    return False
+
+
+def _lines_for_metric_keys(lines: list[str], keys: list[str], max_lines: int = 0) -> list[str]:
+    if not lines or not keys:
+        return []
+    prefixes = {f"{key}:" for key in keys}
+    selected: list[str] = []
+    for line in lines:
+        for prefix in prefixes:
+            if prefix in line:
+                selected.append(line)
+                break
+        if max_lines and len(selected) >= max_lines:
+            break
+    return selected
+
+
+def _merge_metric_keys(current: list[str], candidates: list[str], max_keys: int) -> list[str]:
+    merged: list[str] = []
+    seen = set()
+    for key in current:
+        if key and key not in seen:
+            merged.append(key)
+            seen.add(key)
+    for key in candidates:
+        if key and key not in seen:
+            merged.append(key)
+            seen.add(key)
+        if len(merged) >= max_keys:
+            break
+    return merged[:max_keys]
+
+
+def _merge_fact_lines(primary: list[str], fallback: list[str]) -> list[str]:
+    seen = set()
+    merged: list[str] = []
+    for line in primary + fallback:
+        if line in seen:
+            continue
+        seen.add(line)
+        merged.append(line)
+    return merged
+
+
+def _expand_hottest_line(line: str) -> list[str]:
+    if not line:
+        return []
+    if not line.lower().startswith("hottest:"):
+        return []
+    expanded: list[str] = []
+    payload = line.split("hottest:", 1)[1]
+    for part in payload.split(";"):
+        part = part.strip()
+        if not part or "=" not in part:
+            continue
+        metric, rest = part.split("=", 1)
+        metric = metric.strip()
+        match = re.search(r"(?P<node>[^\s\[]+).*\((?P<value>[^)]+)\)", rest)
+        if not match:
+            continue
+        node = match.group("node").strip()
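+        # Shape assumed here, e.g. "hottest: cpu=node-a [amd64] (3.2)": for each
+        # "metric=rest" part the regex captures the node token and the value in
+        # parentheses ("node-a" is purely illustrative).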
+        value = match.group("value").strip()
+        class_match = re.search(r"\[(?P<class>[^\]]+)\]", rest)
+        node_class = class_match.group("class").strip() if class_match else ""
+        if node_class:
+            expanded.append(f"hottest_{metric}_node: {node} [{node_class}] ({value})")
+        else:
+            expanded.append(f"hottest_{metric}_node: {node} ({value})")
+    return expanded
+
+
+def _has_token(text: str, token: str) -> bool:
+    if not text or not token:
+        return False
+    if token == "io":
+        return "i/o" in text or re.search(r"\bio\b", text) is not None
+    return re.search(rf"\b{re.escape(token)}\b", text) is not None
+
+
+def _hotspot_evidence(summary: dict[str, Any]) -> list[str]:
+    hottest = summary.get("hottest") if isinstance(summary.get("hottest"), dict) else {}
+    if not hottest:
+        return []
+    hardware_by_node = summary.get("hardware_by_node") if isinstance(summary.get("hardware_by_node"), dict) else {}
+    node_pods_top = summary.get("node_pods_top") if isinstance(summary.get("node_pods_top"), list) else []
+    ns_map = {}
+    for item in node_pods_top:
+        if not isinstance(item, dict):
+            continue
+        node = item.get("node")
+        namespaces_top = item.get("namespaces_top") if isinstance(item.get("namespaces_top"), list) else []
+        ns_map[node] = namespaces_top
+    lines: list[str] = []
+    for metric, info in hottest.items():
+        if not isinstance(info, dict):
+            continue
+        node = info.get("node")
+        value = info.get("value")
+        if not node:
+            continue
+        node_class = hardware_by_node.get(node)
+        ns_parts = []
+        for entry in ns_map.get(node, [])[:3]:
+            if isinstance(entry, (list, tuple)) and len(entry) >= NS_ENTRY_MIN_LEN:
+                ns_parts.append(f"{entry[0]}={entry[1]}")
+        ns_text = ", ".join(ns_parts)
+        value_text = f"{value:.2f}" if isinstance(value, (int, float)) else str(value)
+        line = f"hotspot.{metric}: node={node} class={node_class or 'unknown'} value={value_text}"
+        if ns_text:
+            line += f" namespaces_top={ns_text}"
+        lines.append(line)
+    return lines
+
+
+__all__ = [name for name in globals() if name.startswith("_") and not name.startswith("__")]
diff --git a/atlasbot/engine/answerer/retrieval_ext.py b/atlasbot/engine/answerer/retrieval_ext.py
new file mode 100644
index 0000000..2b02639
--- /dev/null
+++ b/atlasbot/engine/answerer/retrieval_ext.py
@@ -0,0 +1,197 @@
+from __future__ import annotations
+
+import re
+from collections.abc import Callable
+from typing import Any
+
+from atlasbot.llm import prompts
+from atlasbot.llm.client import parse_json
+from ._base import *
+
+
+def _parse_json_block(text: str, *, fallback: dict[str, Any]) -> dict[str, Any]:
+    raw = text.strip()
+    match = re.search(r"\{.*\}", raw, flags=re.S)
+    if match:
+        return parse_json(match.group(0), fallback=fallback)
+    return parse_json(raw, fallback=fallback)
+
+
+def _metric_key_tokens(summary_lines: list[str]) -> set[str]:
+    tokens: set[str] = set()
+    for line in summary_lines:
+        if not isinstance(line, str) or ":" not in line:
+            continue
+        key = line.split(":", 1)[0].strip().lower()
+        if not key:
+            continue
+        tokens.add(key)
+        for part in re.split(r"[_\s]+", key):
+            if part:
+                tokens.add(part)
+    return tokens
+
+
+async def _select_best_candidate(call_llm: Callable[..., Any], question: str, candidates: list[str], plan: ModePlan, tag: str) -> int:
+    if len(candidates) <= 1:
+        return 0
+    prompt = (
+        prompts.CANDIDATE_SELECT_PROMPT
+        + "\nQuestion: "
+        + question
+        + "\nCandidates:\n"
+        + "\n".join([f"{idx+1}) {cand}" for idx, cand in enumerate(candidates)])
+    )
+    raw = await call_llm(prompts.CANDIDATE_SELECT_SYSTEM, prompt, model=plan.model, tag=tag)
+    data 
= _parse_json_block(raw, fallback={}) + best = data.get("best") if isinstance(data, dict) else None + if isinstance(best, int) and 1 <= best <= len(candidates): + return best - 1 + return 0 + + +def _dedupe_lines(lines: list[str], limit: int | None = None) -> list[str]: + seen: set[str] = set() + cleaned: list[str] = [] + for line in lines: + value = (line or "").strip() + if not value or value in seen: + continue + if value.lower().startswith("lexicon_") or value.lower().startswith("units:"): + continue + cleaned.append(value) + seen.add(value) + if limit and len(cleaned) >= limit: + break + return cleaned + + +def _collect_fact_candidates(selected: list[dict[str, Any]], limit: int) -> list[str]: + lines: list[str] = [] + for chunk in selected: + text = chunk.get("text") if isinstance(chunk, dict) else None + if not isinstance(text, str): + continue + lines.extend([line for line in text.splitlines() if line.strip()]) + return _dedupe_lines(lines, limit=limit) + + +async def _select_best_list(call_llm: Callable[..., Any], question: str, candidates: list[list[str]], plan: ModePlan, tag: str) -> list[str]: + if not candidates: + return [] + if len(candidates) == 1: + return candidates[0] + render = ["; ".join(items) for items in candidates] + best_idx = await _select_best_candidate(call_llm, question, render, plan, tag) + chosen = candidates[best_idx] if 0 <= best_idx < len(candidates) else candidates[0] + if not chosen: + merged: list[str] = [] + for entry in candidates: + for item in entry: + if item not in merged: + merged.append(item) + chosen = merged + return chosen + + +async def _extract_fact_types(call_llm: Callable[..., Any], question: str, keywords: list[str], plan: ModePlan) -> list[str]: + prompt = prompts.FACT_TYPES_PROMPT + "\nQuestion: " + question + if keywords: + prompt += "\nKeywords: " + ", ".join(keywords) + candidates: list[list[str]] = [] + attempts = max(plan.metric_retries, 1) + for _ in range(attempts): + raw = await call_llm(prompts.FACT_TYPES_SYSTEM, prompt, model=plan.fast_model, tag="fact_types") + data = _parse_json_block(raw, fallback={}) + items = data.get("fact_types") if isinstance(data, dict) else None + if not isinstance(items, list): + continue + cleaned = _dedupe_lines([str(item) for item in items if isinstance(item, (str, int, float))], limit=10) + if cleaned: + candidates.append(cleaned) + chosen = await _select_best_list(call_llm, question, candidates, plan, "fact_types_select") + return chosen[:10] + + +async def _derive_signals(call_llm: Callable[..., Any], question: str, fact_types: list[str], plan: ModePlan) -> list[str]: + if not fact_types: + return [] + prompt = prompts.SIGNAL_PROMPT.format(question=question, fact_types="; ".join(fact_types)) + candidates: list[list[str]] = [] + attempts = max(plan.metric_retries, 1) + for _ in range(attempts): + raw = await call_llm(prompts.SIGNAL_SYSTEM, prompt, model=plan.fast_model, tag="signals") + data = _parse_json_block(raw, fallback={}) + items = data.get("signals") if isinstance(data, dict) else None + if not isinstance(items, list): + continue + cleaned = _dedupe_lines([str(item) for item in items if isinstance(item, (str, int, float))], limit=12) + if cleaned: + candidates.append(cleaned) + chosen = await _select_best_list(call_llm, question, candidates, plan, "signals_select") + return chosen[:12] + + +async def _scan_chunk_for_signals(call_llm: Callable[..., Any], question: str, signals: list[str], chunk_lines: list[str], plan: ModePlan) -> list[str]: + if not signals or not chunk_lines: + return 
[] + prompt = prompts.CHUNK_SCAN_PROMPT.format( + signals="; ".join(signals), + lines="\n".join(chunk_lines), + ) + attempts = max(1, min(plan.metric_retries, 2)) + candidates: list[list[str]] = [] + for _ in range(attempts): + raw = await call_llm(prompts.CHUNK_SCAN_SYSTEM, prompt, model=plan.fast_model, tag="chunk_scan") + data = _parse_json_block(raw, fallback={}) + items = data.get("lines") if isinstance(data, dict) else None + if not isinstance(items, list): + continue + cleaned = [line for line in chunk_lines if line in items] + cleaned = _dedupe_lines(cleaned, limit=15) + if cleaned: + candidates.append(cleaned) + chosen = await _select_best_list(call_llm, question, candidates, plan, "chunk_scan_select") + return chosen[:15] + + +async def _prune_metric_candidates(call_llm: Callable[..., Any], question: str, candidates: list[str], plan: ModePlan, attempts: int) -> list[str]: + if not candidates: + return [] + prompt = prompts.FACT_PRUNE_PROMPT.format(question=question, candidates="\n".join(candidates), max_lines=6) + picks: list[list[str]] = [] + for _ in range(max(attempts, 1)): + raw = await call_llm(prompts.FACT_PRUNE_SYSTEM, prompt, model=plan.fast_model, tag="fact_prune") + data = _parse_json_block(raw, fallback={}) + items = data.get("lines") if isinstance(data, dict) else None + if not isinstance(items, list): + continue + cleaned = [line for line in candidates if line in items] + cleaned = _dedupe_lines(cleaned, limit=6) + if cleaned: + picks.append(cleaned) + chosen = await _select_best_list(call_llm, question, picks, plan, "fact_prune_select") + return chosen[:6] + + +async def _select_fact_lines(call_llm: Callable[..., Any], question: str, candidates: list[str], plan: ModePlan, max_lines: int) -> list[str]: + if not candidates: + return [] + prompt = prompts.FACT_PRUNE_PROMPT.format(question=question, candidates="\n".join(candidates), max_lines=max_lines) + picks: list[list[str]] = [] + attempts = max(plan.metric_retries, 1) + for _ in range(attempts): + raw = await call_llm(prompts.FACT_PRUNE_SYSTEM, prompt, model=plan.fast_model, tag="fact_select") + data = _parse_json_block(raw, fallback={}) + items = data.get("lines") if isinstance(data, dict) else None + if not isinstance(items, list): + continue + cleaned = [line for line in candidates if line in items] + cleaned = _dedupe_lines(cleaned, limit=max_lines) + if cleaned: + picks.append(cleaned) + chosen = await _select_best_list(call_llm, question, picks, plan, "fact_select_best") + return chosen[:max_lines] + + +__all__ = [name for name in globals() if name.startswith("_") and not name.startswith("__")] diff --git a/atlasbot/engine/answerer/spine.py b/atlasbot/engine/answerer/spine.py new file mode 100644 index 0000000..3f9c80c --- /dev/null +++ b/atlasbot/engine/answerer/spine.py @@ -0,0 +1,404 @@ +from __future__ import annotations + +import re +from typing import Any + +from atlasbot.engine.intent_router import IntentMatch +from atlasbot.snapshot.builder import summary_text + +from ._base import * + + +def _join_context(parts: list[str]) -> str: + text = "\n".join([part for part in parts if part]) + return text.strip() + + +def _format_metric_value(value: Any) -> str: + if isinstance(value, bool): + return str(value).lower() + if isinstance(value, int): + return str(value) + if isinstance(value, float): + return f"{value:.1f}".rstrip("0").rstrip(".") + return str(value) + + +def _format_history(history: list[dict[str, str]] | None) -> str: + if not history: + return "" + lines = ["Recent conversation 
(non-authoritative):"] + for entry in history[-4:]: + if not isinstance(entry, dict): + continue + question = entry.get("q") + answer = entry.get("a") + role = entry.get("role") + content = entry.get("content") + if question: + lines.append(f"Q: {question}") + if answer: + lines.append(f"A: {answer}") + if role and content: + prefix = "Q" if role == "user" else "A" + lines.append(f"{prefix}: {content}") + return "\n".join(lines) + + +def _summary_lines(snapshot: dict[str, Any] | None) -> list[str]: + text = summary_text(snapshot) + if not text: + return [] + return [line for line in text.splitlines() if line.strip()] + + +def _line_starting_with(lines: list[str], prefix: str) -> str | None: + if not lines: + return None + for line in lines: + if line.lower().startswith(prefix.lower()): + return line + return None + + +def _spine_lines(lines: list[str]) -> dict[str, str]: + spine: dict[str, str] = {} + _spine_nodes(lines, spine) + _spine_hardware(lines, spine) + _spine_hottest(lines, spine) + _spine_postgres(lines, spine) + _spine_namespaces(lines, spine) + _spine_pressure(lines, spine) + return spine + + +def _spine_nodes(lines: list[str], spine: dict[str, str]) -> None: + nodes_line = _line_starting_with(lines, "nodes:") + if nodes_line: + spine["nodes_count"] = nodes_line + spine["nodes_ready"] = nodes_line + return + nodes_total = _line_starting_with(lines, "nodes_total:") + nodes_ready = _line_starting_with(lines, "nodes_ready:") + if nodes_total: + spine["nodes_count"] = nodes_total + if nodes_ready: + spine["nodes_ready"] = nodes_ready + + +def _spine_hardware(lines: list[str], spine: dict[str, str]) -> None: + hardware_line = _line_starting_with(lines, "hardware_nodes:") + if not hardware_line: + hardware_line = _line_starting_with(lines, "hardware:") + if hardware_line: + spine["nodes_non_rpi"] = hardware_line + + +def _spine_hottest(lines: list[str], spine: dict[str, str]) -> None: + hottest_line = _line_starting_with(lines, "hottest:") + if not hottest_line: + return + for key in ("hottest_cpu", "hottest_ram", "hottest_net", "hottest_io", "hottest_disk"): + spine[key] = hottest_line + + +def _spine_postgres(lines: list[str], spine: dict[str, str]) -> None: + postgres_total = _line_starting_with(lines, "postgres_connections_total:") + if postgres_total: + spine["postgres_connections"] = postgres_total + postgres_line = _line_starting_with(lines, "postgres:") + if postgres_line: + spine["postgres_hottest"] = postgres_line + + +def _spine_namespaces(lines: list[str], spine: dict[str, str]) -> None: + namespaces_top = _line_starting_with(lines, "namespaces_top:") + if namespaces_top: + spine["namespace_most_pods"] = namespaces_top + + +def _spine_pressure(lines: list[str], spine: dict[str, str]) -> None: + pressure_line = _line_starting_with(lines, "pressure_nodes:") + if pressure_line: + spine["pressure_summary"] = pressure_line + return + load_line = _line_starting_with(lines, "node_load_top:") + if load_line: + spine["pressure_summary"] = load_line + + +def _parse_group_line(line: str) -> dict[str, list[str]]: + groups: dict[str, list[str]] = {} + if not line: + return groups + payload = line.split(":", 1)[1] if ":" in line else line + for part in payload.split(";"): + part = part.strip() + if not part or "=" not in part: + continue + key, value = part.split("=", 1) + value = value.strip() + nodes: list[str] = [] + if "(" in value and ")" in value: + inner = value[value.find("(") + 1 : value.rfind(")")] + nodes = [item.strip() for item in inner.split(",") if item.strip()] + if not 
nodes: + cleaned = re.sub(r"^[0-9]+", "", value).strip() + nodes = [item.strip() for item in cleaned.split(",") if item.strip()] + groups[key.strip()] = nodes + return groups + + +def _parse_hottest(line: str, metric: str) -> str | None: + if not line: + return None + payload = line.split(":", 1)[1] if ":" in line else line + for part in payload.split(";"): + part = part.strip() + if part.startswith(f"{metric}="): + return part + return None + + +def _spine_answer(intent: IntentMatch, spine_line: str | None) -> str | None: + if not spine_line: + return None + handlers = { + "nodes_count": _spine_nodes_answer, + "nodes_ready": _spine_nodes_answer, + "nodes_non_rpi": _spine_non_rpi_answer, + "hardware_mix": _spine_hardware_answer, + "postgres_connections": _spine_postgres_answer, + "postgres_hottest": _spine_postgres_answer, + "namespace_most_pods": _spine_namespace_answer, + "pressure_summary": _spine_pressure_answer, + } + kind = intent.kind + if kind.startswith("hottest_"): + return _spine_hottest_answer(kind, spine_line) + handler = handlers.get(kind) + if handler: + return handler(spine_line) + return spine_line + + +def _spine_nodes_answer(line: str) -> str: + return line + + +def _spine_non_rpi_answer(line: str) -> str: + groups = _parse_group_line(line) + non_rpi: list[str] = [] + for key, nodes in groups.items(): + if key.lower().startswith("rpi"): + continue + non_rpi.extend(nodes) + if non_rpi: + return "Non-Raspberry Pi nodes: " + ", ".join(non_rpi) + "." + return line + + +def _spine_hardware_answer(line: str) -> str: + return line + + +def _spine_hottest_answer(kind: str, line: str) -> str: + metric = kind.split("_", 1)[1] + hottest = _parse_hottest(line, metric) + if hottest: + return hottest + return line + + +def _spine_postgres_answer(line: str) -> str: + return line + + +def _spine_namespace_answer(line: str) -> str: + payload = line.split(":", 1)[1] if ":" in line else line + top = payload.split(";")[0].strip() + if top: + return f"Namespace with most pods: {top}." 
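+    # No namespace segment could be parsed from the summary line; fall back to the
+    # raw line so the caller still gets usable evidence.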
+    return line
+
+
+def _spine_pressure_answer(line: str) -> str:
+    return line
+
+
+def _spine_from_summary(summary: dict[str, Any]) -> dict[str, str]:
+    if not isinstance(summary, dict) or not summary:
+        return {}
+    spine: dict[str, str] = {}
+    spine.update(_spine_from_counts(summary))
+    spine.update(_spine_from_hardware(summary))
+    spine.update(_spine_from_hottest(summary))
+    spine.update(_spine_from_postgres(summary))
+    spine.update(_spine_from_namespace_pods(summary))
+    spine.update(_spine_from_pressure(summary))
+    return spine
+
+
+def _spine_from_counts(summary: dict[str, Any]) -> dict[str, str]:
+    counts = summary.get("counts") if isinstance(summary.get("counts"), dict) else {}
+    inventory = summary.get("inventory") if isinstance(summary.get("inventory"), dict) else {}
+    nodes = summary.get("nodes") if isinstance(summary.get("nodes"), dict) else {}
+    workers = inventory.get("workers") if isinstance(inventory.get("workers"), dict) else {}
+    total = nodes.get("total")
+    ready = nodes.get("ready")
+    not_ready = nodes.get("not_ready")
+    if total is None:
+        total = counts.get("nodes_total")
+    if ready is None:
+        ready = counts.get("nodes_ready")
+    if not_ready is None and isinstance(inventory.get("not_ready_names"), list):
+        not_ready = len(inventory.get("not_ready_names") or [])
+    workers_ready = workers.get("ready")
+    workers_total = workers.get("total")
+    if total is None and ready is None and not_ready is None:
+        return {}
+    parts = []
+    if total is not None:
+        parts.append(f"total={int(total)}")
+    if ready is not None:
+        parts.append(f"ready={int(ready)}")
+    if not_ready is not None:
+        parts.append(f"not_ready={int(not_ready)}")
+    if workers_total is not None and workers_ready is not None:
+        parts.append(f"workers_ready={int(workers_ready)}/{int(workers_total)}")
+    line = "nodes: " + ", ".join(parts)
+    return {"nodes_count": line, "nodes_ready": line}
+
+
+def _spine_from_hardware(summary: dict[str, Any]) -> dict[str, str]:
+    hardware = summary.get("hardware") if isinstance(summary.get("hardware"), dict) else {}
+    if not hardware:
+        return {}
+    parts = []
+    for key, nodes in hardware.items():
+        if not isinstance(nodes, list):
+            continue
+        node_list = ", ".join(str(n) for n in nodes if n)
+        if node_list:
+            parts.append(f"{key}=({node_list})")
+    if not parts:
+        return {}
+    return {"nodes_non_rpi": "hardware: " + "; ".join(parts)}
+
+
+def _spine_from_hottest(summary: dict[str, Any]) -> dict[str, str]:
+    hottest = summary.get("hottest") if isinstance(summary.get("hottest"), dict) else {}
+    top = summary.get("top") if isinstance(summary.get("top"), dict) else {}
+    top_hottest = top.get("node_hottest") if isinstance(top.get("node_hottest"), dict) else {}
+    if not hottest and top_hottest:
+        hottest = top_hottest
+    elif top_hottest:
+        for key, value in top_hottest.items():
+            if key not in hottest and value is not None:
+                hottest[key] = value
+    if not hottest:
+        return {}
+    mapping = {}
+    for key in ("cpu", "ram", "net", "io", "disk"):
+        entry = hottest.get(key)
+        if not isinstance(entry, dict):
+            continue
+        node = entry.get("node") or entry.get("label") or ""
+        value = entry.get("value")
+        if node:
+            mapping[f"hottest_{key}"] = f"{key}={node} ({_format_metric_value(value)})"
+    if not mapping:
+        return {}
+    return mapping
+
+
+def _spine_from_postgres(summary: dict[str, Any]) -> dict[str, str]:
+    postgres = summary.get("postgres") if isinstance(summary.get("postgres"), dict) else {}
+    if not postgres:
+        top = summary.get("top") if isinstance(summary.get("top"), dict) else {}
+        postgres = 
top.get("postgres") if isinstance(top.get("postgres"), dict) else {} + if not postgres: + return {} + used = postgres.get("used") + max_conn = postgres.get("max") + hottest = postgres.get("hottest_db") if isinstance(postgres.get("hottest_db"), dict) else {} + hottest_label = hottest.get("label") or "" + facts: dict[str, str] = {} + if used is not None and max_conn is not None: + facts["postgres_connections"] = f"postgres_connections_total: used={int(used)}, max={int(max_conn)}" + if hottest_label: + facts["postgres_hottest"] = f"postgres_hottest_db: {hottest_label}" + return facts + + +def _spine_from_namespace_pods(summary: dict[str, Any]) -> dict[str, str]: + pods = summary.get("namespace_pods") if isinstance(summary.get("namespace_pods"), list) else [] + if not pods: + top = summary.get("top") if isinstance(summary.get("top"), dict) else {} + pods = top.get("namespace_pods") if isinstance(top.get("namespace_pods"), list) else [] + if not pods: + return {} + best_name = "" + best_value = None + for entry in pods: + if not isinstance(entry, dict): + continue + name = entry.get("namespace") or entry.get("name") or entry.get("label") or "" + value = entry.get("pods") + if value is None: + value = entry.get("pods_total") + if value is None: + value = entry.get("value") + try: + numeric = float(value) + except (TypeError, ValueError): + numeric = None + if name and numeric is not None and (best_value is None or numeric > best_value): + best_name = name + best_value = numeric + if best_name: + return {"namespace_most_pods": f"namespace_most_pods: {best_name} ({int(best_value or 0)} pods)"} + return {} + + +def _spine_from_pressure(summary: dict[str, Any]) -> dict[str, str]: + pressure = summary.get("pressure_summary") if isinstance(summary.get("pressure_summary"), dict) else {} + if not pressure: + pressure = summary.get("pressure_nodes") if isinstance(summary.get("pressure_nodes"), dict) else {} + if not pressure: + return {} + total = pressure.get("total") + unsched = pressure.get("unschedulable") + names = pressure.get("names") if isinstance(pressure.get("names"), list) else [] + parts = [] + if total is None and names: + total = len([name for name in names if name]) + if total is not None: + parts.append(f"total={int(total)}") + if unsched is not None: + parts.append(f"unschedulable={int(unsched)}") + if parts: + return {"pressure_summary": "pressure_nodes: " + ", ".join(parts)} + return {} + + +def _spine_fallback(intent: IntentMatch, lines: list[str]) -> str | None: + if not lines: + return None + keywords = { + "nodes_count": ("nodes:", "nodes_total:"), + "nodes_ready": ("nodes:", "nodes_ready:"), + "postgres_hottest": ("postgres_hottest", "hottest_db", "postgres"), + "namespace_most_pods": ("namespace", "pods", "namespaces_top"), + "pressure_summary": ("pressure", "node_load_top"), + } + for token in keywords.get(intent.kind, ("",)): + if not token: + continue + for line in lines: + if token in line: + return line + return None + + +__all__ = [name for name in globals() if name.startswith("_") and not name.startswith("__")] diff --git a/atlasbot/engine/answerer/workflow.py b/atlasbot/engine/answerer/workflow.py new file mode 100644 index 0000000..9e43aa0 --- /dev/null +++ b/atlasbot/engine/answerer/workflow.py @@ -0,0 +1,484 @@ +from __future__ import annotations + +import asyncio +import json +import math +import re +import time +from collections.abc import Callable +from typing import Any + +from atlasbot.engine.intent_router import route_intent +from atlasbot.llm import prompts 
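+# build_messages assembles the system/prompt/context message list that the
+# call_llm closure defined in run_answer sends to the model.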
+from atlasbot.llm.client import build_messages +from atlasbot.snapshot.builder import build_summary + +from ._base import * +from .common import * +from .factsheet import * +from .post import * +from .post_ext import * +from .retrieval import * +from .retrieval_ext import * +from .spine import * +from .workflow_post import finalize_answer + +async def run_answer(engine: Any, question: str, *, mode: str, history: list[dict[str, str]] | None = None, observer: Callable[[str, str], None] | None = None, conversation_id: str | None = None, snapshot_pin: bool | None = None) -> AnswerResult: # noqa: C901 + """Answer a question using the staged reasoning pipeline.""" + + settings = engine._settings + question = (question or "").strip() + if not question: + return AnswerResult("I need a question to answer.", _default_scores(), {"mode": mode}) + if mode == "stock": + return await engine._answer_stock(question) + + limitless = "run limitless" in question.lower() + if limitless: + question = re.sub(r"(?i)run limitless", "", question).strip() + + plan = _mode_plan(settings, mode) + call_limit = _llm_call_limit(settings, mode) + call_cap = math.ceil(call_limit * settings.llm_limit_multiplier) + call_count = 0 + limit_hit = False + time_budget_hit = False + started = time.monotonic() + time_budget_sec = _mode_time_budget(settings, mode) if not limitless else 0.0 + + debug_tags = { + "route", + "decompose", + "chunk_score", + "chunk_select", + "fact_select", + "synth", + "subanswer", + "tool", + "followup", + "select_claims", + "evidence_fix", + } + + async def call_llm(system: str, prompt: str, *, context: str | None = None, model: str | None = None, tag: str = "") -> str: + nonlocal call_count, limit_hit, time_budget_hit + if not limitless and call_count >= call_cap: + limit_hit = True + raise LLMLimitReached("llm_limit") + timeout_sec = None + if not limitless and time_budget_sec > 0: + time_left = time_budget_sec - (time.monotonic() - started) + if time_left <= 0: + time_budget_hit = True + raise LLMTimeBudgetExceeded("time_budget") + timeout_sec = min(settings.ollama_timeout_sec, time_left) + call_count += 1 + messages = build_messages(system, prompt, context=context) + try: + llm_call = engine._llm.chat(messages, model=model or plan.model, timeout_sec=timeout_sec) + if timeout_sec is not None: + response = await asyncio.wait_for(llm_call, timeout=max(0.001, timeout_sec)) + else: + response = await llm_call + except TimeoutError as exc: + time_budget_hit = True + raise LLMTimeBudgetExceeded("time_budget") from exc + log.info( + "atlasbot_llm_call", + extra={"extra": {"mode": mode, "tag": tag, "call": call_count, "limit": call_cap}}, + ) + if settings.debug_pipeline and tag in debug_tags: + _debug_pipeline_log(settings, f"llm_raw_{tag}", str(response)[:1200]) + return response + + state = engine._get_state(conversation_id) + pin_snapshot = bool(snapshot_pin) or settings.snapshot_pin_enabled + snapshot = engine._snapshot.get() + snapshot_used = state.snapshot if pin_snapshot and state and state.snapshot else snapshot + summary = build_summary(snapshot_used) + summary_lines = _summary_lines(snapshot_used) + allowed_nodes = _allowed_nodes(summary) + allowed_namespaces = _allowed_namespaces(summary) + spine = _spine_from_summary(summary) or _spine_lines(summary_lines) + metric_tokens = _metric_key_tokens(summary_lines) + global_facts = _global_facts(summary_lines) + kb_summary = engine._kb.summary() + runbooks = engine._kb.runbook_titles(limit=6) + runbook_paths = engine._kb.runbook_paths(limit=10) + 
history_ctx = _format_history(history) + lexicon_ctx = _lexicon_context(summary) + + key_facts: list[str] = [] + metric_facts: list[str] = [] + facts_used: list[str] = [] + reply = "" + scores = _default_scores() + claims: list[ClaimItem] = [] + classify: dict[str, Any] = {} + tool_hint: dict[str, Any] | None = None + + try: + if mode in {"quick", "fast", "smart", "genius"} and not limitless: + if observer: + observer("factsheet", "building fact sheet") + if _is_plain_math_question(question): + reply = ( + "I focus on Titan cluster operations. Ask me about cluster health, nodes, workloads, " + "namespaces, storage, or alerts." + ) + return AnswerResult(reply, _default_scores(), _build_meta(mode, call_count, call_cap, limit_hit, time_budget_hit, time_budget_sec, classify, tool_hint, started)) + kb_lines = ( + engine._kb.chunk_lines(max_files=plan.kb_max_files, max_chars=_factsheet_kb_chars(mode, plan.kb_max_chars)) + if engine._kb + else [] + ) + fact_lines = _quick_fact_sheet_lines(question, summary_lines, kb_lines, limit=_factsheet_line_limit(mode)) + classify = { + "needs_snapshot": True, + "needs_kb": bool(kb_lines), + "question_type": f"{mode}_factsheet", + "answer_style": "direct" if mode in {"quick", "fast"} else "concise", + "follow_up": False, + } + heuristic_reply = _quick_fact_sheet_heuristic_answer(question, fact_lines) + if heuristic_reply: + return AnswerResult(heuristic_reply, _default_scores(), _build_meta(mode, call_count, call_cap, limit_hit, time_budget_hit, time_budget_sec, classify, tool_hint, started)) + if observer: + observer("quick", "answering from fact sheet") + quick_context = _quick_fact_sheet_text(fact_lines) + quick_prompt = "Question: " + question + "\nAnswer using only the Fact Sheet. " + _factsheet_instruction(mode) + reply = await call_llm(prompts.ANSWER_SYSTEM, quick_prompt, context=quick_context, model=_factsheet_model(mode, plan), tag=f"{mode}_factsheet") + reply = _strip_followup_meta(reply) + return AnswerResult(reply, _default_scores(), _build_meta(mode, call_count, call_cap, limit_hit, time_budget_hit, time_budget_sec, classify, tool_hint, started)) + + if observer: + observer("normalize", "normalizing") + normalize_prompt = prompts.NORMALIZE_PROMPT + "\nQuestion: " + question + normalize_raw = await call_llm(prompts.NORMALIZE_SYSTEM, normalize_prompt, context=lexicon_ctx, model=plan.fast_model, tag="normalize") + normalize = _parse_json_block(normalize_raw, fallback={"normalized": question, "keywords": []}) + normalized = str(normalize.get("normalized") or question).strip() or question + keywords = normalize.get("keywords") or [] + _debug_pipeline_log(settings, "normalize_parsed", {"normalized": normalized, "keywords": keywords}) + keyword_tokens = _extract_keywords(question, normalized, sub_questions=[], keywords=keywords) + question_tokens = _extract_question_tokens(normalized) + + if observer: + observer("route", "routing") + route_prompt = prompts.ROUTE_PROMPT + "\nQuestion: " + normalized + "\nKeywords: " + json.dumps(keywords) + route_raw = await call_llm(prompts.ROUTE_SYSTEM, route_prompt, context=_join_context([kb_summary, lexicon_ctx]), model=plan.fast_model, tag="route") + classify = _parse_json_block(route_raw, fallback={}) + classify.setdefault("needs_snapshot", True) + classify.setdefault("answer_style", "direct") + classify.setdefault("follow_up", False) + classify.setdefault("focus_entity", "unknown") + classify.setdefault("focus_metric", "unknown") + if metric_tokens and keyword_tokens and any(token in metric_tokens for token in 
keyword_tokens): + classify["needs_snapshot"] = True + intent = route_intent(normalized) + if intent: + classify["needs_snapshot"] = True + classify["question_type"] = "metric" + _debug_pipeline_log(settings, "route_parsed", {"classify": classify, "normalized": normalized}) + lowered_question = f"{question} {normalized}".lower() + force_metric = bool(re.search(r"\bhow many\b|\bcount\b|\btotal\b", lowered_question)) + if any(term in lowered_question for term in ("postgres", "connections", "pvc", "ready")): + force_metric = True + + if intent: + spine_line = spine.get(intent.kind) if isinstance(spine, dict) else None + if not spine_line: + spine_line = _spine_fallback(intent, summary_lines) + spine_answer = _spine_answer(intent, spine_line) + if spine_line: + key_facts = _merge_fact_lines([spine_line], key_facts) + metric_facts = _merge_fact_lines([spine_line], metric_facts) + if spine_answer and mode in {"fast", "quick"}: + return AnswerResult(spine_answer, _default_scores(), _build_meta(mode, call_count, call_cap, limit_hit, time_budget_hit, time_budget_sec, classify, tool_hint, started)) + + cluster_terms = ( + "atlas", + "cluster", + "node", + "nodes", + "namespace", + "pod", + "workload", + "k8s", + "kubernetes", + "postgres", + "database", + "db", + "connections", + "cpu", + "ram", + "memory", + "network", + "io", + "disk", + "pvc", + "storage", + ) + has_cluster_terms = any(term in lowered_question for term in cluster_terms) + if has_cluster_terms: + classify["needs_snapshot"] = True + lowered_norm = normalized.lower() + if ("namespace" in lowered_norm and ("pod" in lowered_norm or "pods" in lowered_norm)) or re.search(r"\bmost\s+pods\b", lowered_norm) or re.search(r"\bpods\s+running\b", lowered_norm): + classify["question_type"] = "metric" + classify["needs_snapshot"] = True + if re.search(r"\b(how many|count|number of|list)\b", lowered_question): + classify["question_type"] = "metric" + if any(term in lowered_question for term in ("postgres", "connections", "db")): + classify["question_type"] = "metric" + classify["needs_snapshot"] = True + if any(term in lowered_question for term in ("pvc", "persistentvolume", "persistent volume", "storage")): + if classify.get("question_type") not in {"metric", "diagnostic"}: + classify["question_type"] = "metric" + classify["needs_snapshot"] = True + if "ready" in lowered_question and classify.get("question_type") not in {"metric", "diagnostic"}: + classify["question_type"] = "diagnostic" + hottest_terms = ("hottest", "highest", "lowest", "most") + metric_terms = ("cpu", "ram", "memory", "net", "network", "io", "disk", "load", "usage", "pod", "pods", "namespace") + if any(term in lowered_question for term in hottest_terms) and any(term in lowered_question for term in metric_terms): + classify["question_type"] = "metric" + baseline_terms = ("baseline", "delta", "trend", "increase", "decrease", "drop", "spike", "regression", "change") + if any(term in lowered_question for term in baseline_terms) and any(term in lowered_question for term in metric_terms): + classify["question_type"] = "metric" + classify["needs_snapshot"] = True + + if not classify.get("follow_up") and state and state.claims: + follow_terms = ("there", "that", "those", "these", "it", "them", "that one", "this", "former", "latter") + is_metric_query = force_metric or classify.get("question_type") in {"metric", "diagnostic"} + if not is_metric_query and ( + any(term in lowered_question for term in follow_terms) + or (len(normalized.split()) <= FOLLOWUP_SHORT_WORDS and not 
has_cluster_terms) + ): + classify["follow_up"] = True + + if classify.get("follow_up") and state and state.claims: + if observer: + observer("followup", "answering follow-up") + reply = await engine._answer_followup(question, state, summary, classify, plan, call_llm) + scores = await engine._score_answer(question, reply, plan, call_llm) + return AnswerResult(reply, scores, _build_meta(mode, call_count, call_cap, limit_hit, time_budget_hit, time_budget_sec, classify, tool_hint, started)) + + if observer: + observer("decompose", "decomposing") + decompose_prompt = prompts.DECOMPOSE_PROMPT.format(max_parts=plan.max_subquestions * 2) + decompose_raw = await call_llm(prompts.DECOMPOSE_SYSTEM, decompose_prompt + "\nQuestion: " + normalized, context=lexicon_ctx, model=plan.fast_model if mode == "quick" else plan.model, tag="decompose") + parts = _parse_json_list(decompose_raw) + sub_questions = _select_subquestions(parts, normalized, plan.max_subquestions) + _debug_pipeline_log(settings, "decompose_parsed", {"sub_questions": sub_questions}) + keyword_tokens = _extract_keywords(question, normalized, sub_questions=sub_questions, keywords=keywords) + + snapshot_context = "" + signal_tokens: list[str] = [] + if classify.get("needs_snapshot"): + if observer: + observer("retrieve", "scoring chunks") + chunks = _chunk_lines(summary_lines, plan.chunk_lines) + if plan.use_raw_snapshot: + raw_chunks = _raw_snapshot_chunks(snapshot_used) + if raw_chunks: + chunks.extend(raw_chunks) + kb_lines = engine._kb.chunk_lines(max_files=plan.kb_max_files, max_chars=plan.kb_max_chars) if engine._kb else [] + if kb_lines: + kb_chunks = _chunk_lines(kb_lines, plan.chunk_lines) + for idx, chunk in enumerate(kb_chunks): + chunk["id"] = f"k{idx}" + chunks.extend(kb_chunks) + metric_keys: list[str] = [] + must_chunk_ids: list[str] = [] + metric_task = None + if (classify.get("question_type") in {"metric", "diagnostic"} or force_metric) and summary_lines: + metric_ctx = {"question": normalized, "sub_questions": sub_questions, "keywords": keywords, "keyword_tokens": keyword_tokens, "summary_lines": summary_lines} + metric_task = asyncio.create_task(_select_metric_chunks(call_llm, metric_ctx, chunks, plan)) + scored_task = asyncio.create_task(_score_chunks(call_llm, chunks, normalized, sub_questions, plan)) + if metric_task: + metric_keys, must_chunk_ids = await metric_task + scored = await scored_task + selected = _select_chunks(chunks, scored, plan, keyword_tokens, must_chunk_ids) + fact_candidates = _collect_fact_candidates(selected, limit=plan.max_subquestions * 12) + key_facts = await _select_fact_lines(call_llm, normalized, fact_candidates, plan, max_lines=max(4, plan.max_subquestions * 2)) + metric_facts = [] + if classify.get("question_type") in {"metric", "diagnostic"} or force_metric: + global_metric_facts: list[str] = [] + if global_facts: + global_metric_facts = await _select_fact_lines(call_llm, normalized, global_facts, plan, max_lines=min(2, max(1, plan.max_subquestions))) + if not global_metric_facts and (keyword_tokens or question_tokens): + tokens = {tok for tok in (keyword_tokens or question_tokens) if tok and tok not in GENERIC_METRIC_TOKENS} + global_metric_facts = _rank_metric_lines(global_facts, tokens, max_lines=2) + if global_metric_facts: + key_facts = _merge_fact_lines(global_metric_facts, key_facts) + all_tokens = _merge_tokens(signal_tokens, keyword_tokens, question_tokens) + if plan.use_deep_retrieval: + if observer: + observer("retrieve", "extracting fact types") + fact_types = await 
_extract_fact_types(call_llm, normalized, keyword_tokens, plan) + if observer: + observer("retrieve", "deriving signals") + signals = await _derive_signals(call_llm, normalized, fact_types, plan) + if isinstance(signals, list): + signal_tokens = [str(item) for item in signals if item] + all_tokens = _merge_tokens(signal_tokens, keyword_tokens, question_tokens) + if observer: + observer("retrieve", "scanning chunks") + candidate_lines: list[str] = [] + if signals: + for chunk in selected: + chunk_lines = chunk["text"].splitlines() + if not chunk_lines: + continue + hits = await _scan_chunk_for_signals(call_llm, normalized, signals, chunk_lines, plan) + if hits: + candidate_lines.extend(hits) + candidate_lines = list(dict.fromkeys(candidate_lines)) + if candidate_lines: + if observer: + observer("retrieve", "pruning candidates") + metric_facts = await _prune_metric_candidates(call_llm, normalized, candidate_lines, plan, plan.metric_retries) + if metric_facts: + key_facts = _merge_fact_lines(metric_facts, key_facts) + if settings.debug_pipeline: + _debug_pipeline_log(settings, "metric_facts_selected", {"facts": metric_facts}) + if not metric_facts: + if observer: + observer("retrieve", "fallback metric selection") + token_set = {tok for tok in all_tokens if tok and tok not in GENERIC_METRIC_TOKENS} + fallback_candidates = _rank_metric_lines(summary_lines, token_set, max_lines=200) + if fallback_candidates: + metric_facts = await _select_fact_lines(call_llm, normalized, fallback_candidates, plan, max_lines=max(2, plan.max_subquestions)) + if not metric_facts and fallback_candidates: + metric_facts = fallback_candidates[: max(2, plan.max_subquestions)] + if metric_keys: + key_lines = _lines_for_metric_keys(summary_lines, metric_keys, max_lines=plan.max_subquestions * 3) + if key_lines: + metric_facts = _merge_fact_lines(key_lines, metric_facts) + if metric_facts: + metric_cover_tokens = [tok for tok in keyword_tokens if tok and tok not in GENERIC_METRIC_TOKENS] + if not metric_cover_tokens: + metric_cover_tokens = [tok for tok in question_tokens if tok and tok not in GENERIC_METRIC_TOKENS] + metric_facts = _ensure_token_coverage(metric_facts, metric_cover_tokens or all_tokens, summary_lines, max_add=plan.max_subquestions) + if metric_cover_tokens: + ranked_metric_lines = _rank_metric_lines(summary_lines, set(metric_cover_tokens), max_lines=max(1, plan.max_subquestions)) + if ranked_metric_lines: + metric_facts = _merge_fact_lines(ranked_metric_lines, metric_facts) + if metric_facts and not _has_keyword_overlap(metric_facts, keyword_tokens): + best_line = _best_keyword_line(summary_lines, keyword_tokens) + if best_line: + metric_facts = _merge_fact_lines([best_line], metric_facts) + if metric_facts: + key_facts = _merge_fact_lines(metric_facts, key_facts) + if global_metric_facts: + metric_facts = _merge_fact_lines(global_metric_facts, metric_facts) + if (classify.get("question_type") in {"metric", "diagnostic"} or force_metric) and not metric_facts and key_facts: + metric_facts = key_facts + if key_facts: + key_facts = _ensure_token_coverage(key_facts, _merge_tokens(keyword_tokens, question_tokens), summary_lines, max_add=plan.max_subquestions) + facts_used = list(dict.fromkeys(key_facts)) if key_facts else list(dict.fromkeys(metric_facts)) + snapshot_context = "ClusterSnapshot:\n" + "\n".join([chunk["text"] for chunk in selected]) + combined_facts = _merge_fact_lines(global_facts, key_facts) if global_facts else key_facts + if combined_facts: + snapshot_context = "KeyFacts:\n" + 
"\n".join(combined_facts) + "\n\n" + snapshot_context + + context = _join_context([kb_summary, _format_runbooks(runbooks), snapshot_context, history_ctx if classify.get("follow_up") else ""]) + + if plan.use_tool and classify.get("needs_tool"): + if observer: + observer("tool", "suggesting tools") + tool_prompt = prompts.TOOL_PROMPT + "\nQuestion: " + normalized + tool_raw = await call_llm(prompts.TOOL_SYSTEM, tool_prompt, context=context, model=plan.fast_model, tag="tool") + tool_hint = _parse_json_block(tool_raw, fallback={}) + + if observer: + observer("subanswers", "drafting subanswers") + async def _subanswer_for(subq: str) -> str: + sub_prompt = prompts.SUBANSWER_PROMPT + "\nQuestion: " + subq + if plan.subanswer_retries > 1: + candidates = await _gather_limited( + [call_llm(prompts.ANSWER_SYSTEM, sub_prompt, context=context, model=plan.model, tag="subanswer") for _ in range(plan.subanswer_retries)], + plan.parallelism, + ) + best_idx = await _select_best_candidate(call_llm, subq, candidates, plan, "subanswer_select") + return candidates[best_idx] + return await call_llm(prompts.ANSWER_SYSTEM, sub_prompt, context=context, model=plan.model, tag="subanswer") + + subanswers: list[str] = [] + if plan.parallelism > 1 and len(sub_questions) > 1: + subanswers = await _gather_limited([_subanswer_for(subq) for subq in sub_questions], plan.parallelism) + else: + for subq in sub_questions: + subanswers.append(await _subanswer_for(subq)) + + if observer: + observer("synthesize", "synthesizing") + reply, scores, claims = await finalize_answer( + engine=engine, + call_llm=call_llm, + normalized=normalized, + subanswers=subanswers, + context=context, + classify=classify, + plan=plan, + summary=summary, + summary_lines=summary_lines, + metric_facts=metric_facts, + key_facts=key_facts, + facts_used=facts_used, + allowed_nodes=allowed_nodes, + allowed_namespaces=allowed_namespaces, + runbook_paths=runbook_paths, + lowered_question=lowered_question, + force_metric=force_metric, + keyword_tokens=keyword_tokens, + question_tokens=question_tokens, + snapshot_context=snapshot_context, + observer=observer, + mode=mode, + metric_keys=metric_keys if 'metric_keys' in locals() else None, + ) + + + except LLMTimeBudgetExceeded: + time_budget_hit = True + if not reply: + budget = max(1, round(time_budget_sec)) if time_budget_sec > 0 else 0 + budget_text = f"{budget}s" if budget else "its configured" + if mode in {"quick", "fast"}: + reply = f"Quick mode hit {budget_text} time budget before finishing. Try atlas-smart for a deeper answer." + elif mode == "smart": + reply = f"Smart mode hit {budget_text} time budget before finishing. Try atlas-genius or ask a narrower follow-up." + else: + reply = "I ran out of time before I could finish this answer." + scores = _default_scores() + except LLMLimitReached: + if not reply: + reply = "I started working on this but hit my reasoning limit. Ask again with 'Run limitless' for a deeper pass." + scores = _default_scores() + finally: + elapsed = round(time.monotonic() - started, 2) + log.info( + "atlasbot_answer", + extra={ + "extra": { + "mode": mode, + "seconds": elapsed, + "llm_calls": call_count, + "limit": call_cap, + "limit_hit": limit_hit, + "time_budget_sec": time_budget_sec, + "time_budget_hit": time_budget_hit, + } + }, + ) + + if limit_hit and "run limitless" not in reply.lower(): + reply = reply.rstrip() + "\n\nNote: I hit my reasoning limit. Ask again with 'Run limitless' for a deeper pass." 
+ + if conversation_id and claims: + engine._store_state(conversation_id, claims, summary, snapshot_used, pin_snapshot) + + return AnswerResult( + reply, + scores, + _build_meta(mode, call_count, call_cap, limit_hit, time_budget_hit, time_budget_sec, classify, tool_hint, started), + ) diff --git a/atlasbot/engine/answerer/workflow_post.py b/atlasbot/engine/answerer/workflow_post.py new file mode 100644 index 0000000..81190bb --- /dev/null +++ b/atlasbot/engine/answerer/workflow_post.py @@ -0,0 +1,170 @@ +from __future__ import annotations + +import json +import re +from collections.abc import Callable +from typing import Any + +from atlasbot.llm import prompts + +from ._base import * +from .common import * +from .post import * +from .post_ext import * +from .retrieval import * +from .spine import * + + +async def finalize_answer(*, engine: Any, call_llm: Callable[..., Any], normalized: str, subanswers: list[str], context: str, classify: dict[str, Any], plan: ModePlan, summary: dict[str, Any], summary_lines: list[str], metric_facts: list[str], key_facts: list[str], facts_used: list[str], allowed_nodes: list[str], allowed_namespaces: list[str], runbook_paths: list[str], lowered_question: str, force_metric: bool, keyword_tokens: list[str], question_tokens: list[str], snapshot_context: str, observer: Callable[[str, str], None] | None, mode: str, metric_keys: list[str] | None = None) -> tuple[str, AnswerScores, list[ClaimItem]]: # noqa: C901 + """Synthesize and post-process the final answer.""" + + reply = await engine._synthesize_answer(normalized, subanswers, context, classify, plan, call_llm) + + unknown_nodes = _find_unknown_nodes(reply, allowed_nodes) + unknown_namespaces = _find_unknown_namespaces(reply, allowed_namespaces) + runbook_fix = _needs_runbook_fix(reply, runbook_paths) + runbook_needed = _needs_runbook_reference(normalized, runbook_paths, reply) + needs_evidence = _needs_evidence_fix(reply, classify) + hardware_terms = ("rpi", "raspberry", "jetson", "amd64", "arm64", "hardware") + hardware_line = _line_starting_with(summary_lines, "hardware_nodes:") + if any(term in lowered_question for term in hardware_terms) and hardware_line: + needs_evidence = True + if metric_facts and (classify.get("question_type") in {"metric", "diagnostic"} or force_metric) and not _reply_matches_metric_facts(reply, metric_facts, _merge_tokens(keyword_tokens, question_tokens)): + needs_evidence = True + if classify.get("question_type") in {"open_ended", "planning"} and metric_facts: + needs_evidence = True + resolved_runbook = None + if runbook_paths and (runbook_fix or runbook_needed): + resolver_prompt = prompts.RUNBOOK_SELECT_PROMPT + "\nQuestion: " + normalized + resolver_raw = await call_llm(prompts.RUNBOOK_SELECT_SYSTEM, resolver_prompt, context="AllowedRunbooks:\n" + "\n".join(runbook_paths), model=plan.fast_model, tag="runbook_select") + resolver = _parse_json_block(resolver_raw, fallback={}) + candidate = resolver.get("path") if isinstance(resolver.get("path"), str) else None + if candidate and candidate in runbook_paths: + resolved_runbook = candidate + + if (snapshot_context and needs_evidence) or unknown_nodes or unknown_namespaces or runbook_fix or runbook_needed: + if observer: + observer("evidence_fix", "repairing missing evidence") + extra_bits = [] + if unknown_nodes: + extra_bits.append("UnknownNodes: " + ", ".join(sorted(unknown_nodes))) + if unknown_namespaces: + extra_bits.append("UnknownNamespaces: " + ", ".join(sorted(unknown_namespaces))) + if runbook_paths: + 
extra_bits.append("AllowedRunbooks: " + ", ".join(runbook_paths))
+        if resolved_runbook:
+            extra_bits.append("ResolvedRunbook: " + resolved_runbook)
+        if metric_facts:
+            extra_bits.append("MustUseFacts: " + "; ".join(metric_facts[:4]))
+        if hardware_line:
+            extra_bits.append("HardwareNodes: " + hardware_line)
+        if allowed_nodes:
+            extra_bits.append("AllowedNodes: " + ", ".join(allowed_nodes))
+        if allowed_namespaces:
+            extra_bits.append("AllowedNamespaces: " + ", ".join(allowed_namespaces))
+        fix_prompt = prompts.EVIDENCE_FIX_PROMPT + "\nQuestion: " + normalized + "\nDraft: " + reply + ("\n" + "\n".join(extra_bits) if extra_bits else "")
+        reply = await call_llm(prompts.EVIDENCE_FIX_SYSTEM, fix_prompt, context=context, model=plan.model, tag="evidence_fix")
+        if metric_facts and not _reply_matches_metric_facts(reply, metric_facts, _merge_tokens(keyword_tokens, question_tokens)):
+            enforce_prompt = prompts.EVIDENCE_FIX_PROMPT + "\nQuestion: " + normalized + "\nDraft: " + reply + "\nMustIncludeFacts: " + "; ".join(metric_facts[:6]) + "\nInstruction: The answer must include all MustIncludeFacts items."
+            reply = await call_llm(prompts.EVIDENCE_FIX_SYSTEM, enforce_prompt, context=context, model=plan.model, tag="evidence_fix_enforce")
+
+    if metric_facts and not _reply_matches_metric_facts(reply, metric_facts, _merge_tokens(keyword_tokens, question_tokens)):
+        direct_candidates = _lines_for_metric_keys(summary_lines, metric_keys, max_lines=plan.max_subquestions * 3) if metric_keys else summary_lines
+        direct_line = _select_metric_line(direct_candidates, normalized, _merge_tokens(keyword_tokens, question_tokens))
+        if direct_line:
+            direct_prompt = f"Question: {normalized}\nFact: {direct_line}\nAnswer using the fact."
+            reply = await call_llm(prompts.ANSWER_SYSTEM, direct_prompt, context="", model=plan.fast_model, tag="metric_direct")
+            if (mode == "quick" and any(term in normalized.lower() for term in ("how many", "count", "total"))) or not _reply_matches_metric_facts(reply, [direct_line], _merge_tokens(keyword_tokens, question_tokens)):
+                reply = _format_direct_metric_line(direct_line)
+
+    if "raspberry" in lowered_question and "not" in lowered_question:
+        non_rpi = _non_rpi_nodes(summary)
+        if non_rpi:
+            reply = _format_hardware_groups(non_rpi, "Non-Raspberry Pi nodes")
+    if unknown_nodes or unknown_namespaces:
+        refreshed_nodes = _find_unknown_nodes(reply, allowed_nodes)
+        refreshed_namespaces = _find_unknown_namespaces(reply, allowed_namespaces)
+        if refreshed_nodes or refreshed_namespaces:
+            reply = _strip_unknown_entities(reply, refreshed_nodes, refreshed_namespaces)
+    if runbook_paths and resolved_runbook and _needs_runbook_reference(normalized, runbook_paths, reply):
+        if observer:
+            observer("runbook_enforce", "enforcing runbook path")
+        enforce_prompt = prompts.RUNBOOK_ENFORCE_PROMPT.format(path=resolved_runbook)
+        reply = await call_llm(prompts.RUNBOOK_ENFORCE_SYSTEM, enforce_prompt + "\nAnswer: " + reply, context=context, model=plan.model, tag="runbook_enforce")
+    if runbook_paths:
+        invalid = [token for token in re.findall(r"runbooks/[A-Za-z0-9._-]+", reply) if token.lower() not in {p.lower() for p in runbook_paths}]
+        if invalid:
+            if observer:
+                observer("runbook_enforce", "replacing invalid runbook path")
+            resolver_prompt = prompts.RUNBOOK_SELECT_PROMPT + "\nQuestion: " + normalized
+            resolver_raw = await call_llm(prompts.RUNBOOK_SELECT_SYSTEM, resolver_prompt, context="AllowedRunbooks:\n" + "\n".join(runbook_paths), model=plan.fast_model, 
tag="runbook_select") + resolver = _parse_json_block(resolver_raw, fallback={}) + candidate = resolver.get("path") if isinstance(resolver.get("path"), str) else None + if not (candidate and candidate in runbook_paths): + candidate = _best_runbook_match(invalid[0], runbook_paths) + if candidate and candidate in runbook_paths: + enforce_prompt = prompts.RUNBOOK_ENFORCE_PROMPT.format(path=candidate) + reply = await call_llm(prompts.RUNBOOK_ENFORCE_SYSTEM, enforce_prompt + "\nAnswer: " + reply, context=context, model=plan.model, tag="runbook_enforce") + reply = _strip_unknown_entities(reply, unknown_nodes, unknown_namespaces) + + if facts_used and _needs_evidence_guard(reply, facts_used): + if observer: + observer("evidence_guard", "tightening unsupported claims") + use_guard = True + if mode in {"smart", "genius"}: + decision = await _contradiction_decision(ContradictionContext(call_llm, normalized, reply, facts_used, plan), attempts=3 if mode == "genius" else 1) + use_guard = decision.get("use_facts", True) + if use_guard: + guard_prompt = prompts.EVIDENCE_GUARD_PROMPT + "\nQuestion: " + normalized + "\nDraft: " + reply + "\nFactsUsed:\n" + "\n".join(facts_used) + reply = await call_llm(prompts.EVIDENCE_GUARD_SYSTEM, guard_prompt, context=context, model=plan.model, tag="evidence_guard") + + if _needs_focus_fix(normalized, reply, classify): + if observer: + observer("focus_fix", "tightening answer") + reply = await call_llm(prompts.EVIDENCE_FIX_SYSTEM, prompts.FOCUS_FIX_PROMPT + "\nQuestion: " + normalized + "\nDraft: " + reply, context=context, model=plan.model, tag="focus_fix") + if not metric_facts or not _has_keyword_overlap(metric_facts, keyword_tokens): + best_line = _best_keyword_line(summary_lines, keyword_tokens) + if best_line: + reply = f"Latest metrics: {best_line}." + if (classify.get("question_type") in {"metric", "diagnostic"} or force_metric) and metric_facts: + best_line = None + lowered_keywords = [kw.lower() for kw in keyword_tokens if kw] + for line in metric_facts: + if any(kw in line.lower() for kw in lowered_keywords): + best_line = line + break + best_line = best_line or metric_facts[0] + reply_numbers = set(re.findall(r"\d+(?:\.\d+)?", reply)) + fact_numbers = set(re.findall(r"\d+(?:\.\d+)?", " ".join(metric_facts))) + if not reply_numbers or (fact_numbers and not (reply_numbers & fact_numbers)): + reply = f"Latest metrics: {best_line}." 
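+        # Numeric cross-check: the draft above is replaced outright with the best
+        # matching fact line whenever it cites numbers found in none of the facts.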
+
+    if _should_use_insight_guard(classify):
+        if observer:
+            observer("insight_guard", "checking for concrete signals")
+        reply = await _apply_insight_guard(InsightGuardInput(question=normalized, reply=reply, classify=classify, context=context, plan=plan, call_llm=call_llm, facts=metric_facts or key_facts))
+
+    if plan.use_critic:
+        if observer:
+            observer("critic", "reviewing")
+        critic_prompt = prompts.CRITIC_PROMPT + "\nQuestion: " + normalized + "\nAnswer: " + reply
+        critic_raw = await call_llm(prompts.CRITIC_SYSTEM, critic_prompt, context=context, model=plan.model, tag="critic")
+        critic = _parse_json_block(critic_raw, fallback={})
+        if critic.get("issues"):
+            revise_prompt = prompts.REVISION_PROMPT + "\nQuestion: " + normalized + "\nDraft: " + reply + "\nCritique: " + json.dumps(critic)
+            reply = await call_llm(prompts.REVISION_SYSTEM, revise_prompt, context=context, model=plan.model, tag="revise")
+
+    if plan.use_gap:
+        if observer:
+            observer("gap", "checking gaps")
+        gap_prompt = prompts.EVIDENCE_GAP_PROMPT + "\nQuestion: " + normalized + "\nAnswer: " + reply
+        gap_raw = await call_llm(prompts.GAP_SYSTEM, gap_prompt, context=context, model=plan.fast_model, tag="gap")
+        gap = _parse_json_block(gap_raw, fallback={})
+        note = str(gap.get("note") or "").strip()
+        if note:
+            reply = f"{reply}\n\n{note}"
+
+    reply = await engine._dedup_reply(reply, plan, call_llm, tag="dedup")
+    scores = await engine._score_answer(normalized, reply, plan, call_llm)
+    claims = await engine._extract_claims(normalized, reply, summary, facts_used, call_llm)
+    return reply, scores, claims
diff --git a/atlasbot/engine/intent_router.py b/atlasbot/engine/intent_router.py
index 869fefa..801fd5d 100644
--- a/atlasbot/engine/intent_router.py
+++ b/atlasbot/engine/intent_router.py
@@ -1,35 +1,46 @@
 from __future__ import annotations
 
-from dataclasses import dataclass
 import re
+from dataclasses import dataclass
 
 
 @dataclass(frozen=True)
 class IntentMatch:
+    """Describe the best cluster intent match for a user question."""
+
     kind: str
     score: int
 
-_COUNT_TERMS = r"(how\\s+many|count|number\\s+of|total|totals|tally|amount\\s+of|quantity|sum\\s+of|overall|in\\s+total|all\\s+up)"
-_NODE_TERMS = r"(nodes?|workers?|worker\\s+nodes?|cluster\\s+nodes?|machines?|hosts?|members?|instances?|servers?|agents?|control[-\\s]?plane|control\\s+plane)"
-_READY_TERMS = r"(ready|unready|not\\s+ready|down|offline|not\\s+responding|missing|lost|gone|drain(?:ed|ing)?|cordon(?:ed|ing)?)"
+_COUNT_TERMS = r"(how\s+many|count|number\s+of|total|totals|tally|amount\s+of|quantity|sum\s+of|overall|in\s+total|all\s+up)"
+_NODE_TERMS = r"(nodes?|workers?|worker\s+nodes?|cluster\s+nodes?|machines?|hosts?|members?|instances?|servers?|agents?|control[-\s]?plane|control\s+plane)"
+_READY_TERMS = r"(ready|unready|not\s+ready|down|offline|not\s+responding|missing|lost|gone|drain(?:ed|ing)?|cordon(?:ed|ing)?)"
 _HOTTEST_TERMS = r"(hottest|hot|highest|max(?:imum)?|peak|top|most|worst|spikiest|heaviest|largest|biggest|noisiest|loudest)"
-_CPU_TERMS = r"(cpu|processor|processors|compute|core|cores|load|load\\s+avg|load\\s+average|util(?:ization)?|usage)"
+_CPU_TERMS = r"(cpu|processor|processors|compute|core|cores|load|load\s+avg|load\s+average|util(?:ization)?|usage)"
 _RAM_TERMS = r"(ram|memory|mem|heap|rss|resident|swap)"
 _NET_TERMS = r"(net|network|bandwidth|throughput|traffic|rx|tx|ingress|egress|bits|bytes|packets|pps|bps)"
-_IO_TERMS = r"(\\bio\\b|i/o|disk\\s+io|disk\\s+activity|read/?write|storage\\s+io|iops|latency)"
-_DISK_TERMS = r"(disk|storage|volume|pvc|filesystem|fs|capacity|\\bspace\\b|full|usage)"
-_PG_TERMS = r"(postgres|postgresql|pg\\b|database|db|sql|psql)"
-_CONN_TERMS = r"(connections?|conn|pool|sessions?|clients?|active\\s+connections?|open\\s+connections?)"
-_DB_HOT_TERMS = r"(hottest|busiest|most|largest|top|heaviest|noisiest|highest\\s+load)"
-_NAMESPACE_TERMS = r"(namespace|namespaces|ns\\b|tenant|workload\\s+namespace)"
+_IO_TERMS = r"(\bio\b|i/o|disk\s+io|disk\s+activity|read/?write|storage\s+io|iops|latency)"
+_DISK_TERMS = r"(disk|storage|volume|pvc|filesystem|fs|capacity|\bspace\b|full|usage)"
+_PG_TERMS = r"(postgres|postgresql|pg\b|database|db|sql|psql)"
+_CONN_TERMS = r"(connections?|conn|pool|sessions?|clients?|active\s+connections?|open\s+connections?)"
+_DB_HOT_TERMS = r"(hottest|busiest|most|largest|top|heaviest|noisiest|highest\s+load)"
+_NAMESPACE_TERMS = r"(namespace|namespaces|ns\b|tenant|workload\s+namespace)"
 _PODS_TERMS = r"(pods?|workloads?|tasks?|containers?|deployments?|jobs?|cronjobs?|daemonsets?|statefulsets?)"
-_NON_RPI_TERMS = r"(non[-\\s]?raspberry|not\\s+raspberry|non[-\\s]?rpi|not\\s+rpi|amd64|x86|x86_64|intel|ryzen|jetson|arm64\\b(?!.*rpi))"
-_PRESSURE_TERMS = r"(pressure|overload|hotspot|bottleneck|saturation|headroom|strain|stress|critical|warning|at\\s+capacity|near\\s+limit)"
-_HARDWARE_TERMS = r"(hardware|arch(?:itecture)?|platform|mix|profile|node\\s+types?)"
+_NON_RPI_TERMS = r"(non[-\s]?raspberry|not\s+raspberry|non[-\s]?rpi|not\s+rpi|amd64|x86|x86_64|intel|ryzen|jetson|arm64\b(?!.*rpi))"
+_PRESSURE_TERMS = r"(pressure|overload|hotspot|bottleneck|saturation|headroom|strain|stress|critical|warning|at\s+capacity|near\s+limit)"
+_HARDWARE_TERMS = r"(hardware|arch(?:itecture)?|platform|mix|profile|node\s+types?)"
 
 
+ """ + text = (question or "").lower() if not text: return None @@ -44,13 +55,13 @@ def route_intent(question: str) -> IntentMatch | None: return any(_has(pat) for pat in patterns) intents = [ - (lambda: _all(_COUNT_TERMS) and (_has(_NODE_TERMS) or "cluster" in text), IntentMatch("nodes_count", 90)), ( lambda: _all(_READY_TERMS) and (_any(_NODE_TERMS) or "cluster" in text or "workers" in text), IntentMatch("nodes_ready", 85), ), + (lambda: _all(_COUNT_TERMS) and (_has(_NODE_TERMS) or "cluster" in text), IntentMatch("nodes_count", 90)), (lambda: _all(_NON_RPI_TERMS) and (_any(_NODE_TERMS) or "cluster" in text), IntentMatch("nodes_non_rpi", 80)), - (lambda: _all(_HARDWARE_TERMS) and (_has(_NODE_TERMS) or "cluster" in text), IntentMatch("hardware_mix", 75)), + (lambda: _all(_HARDWARE_TERMS) and (_has(_NODE_TERMS) or "cluster" in text or "mix" in text), IntentMatch("hardware_mix", 75)), (lambda: _all(_HOTTEST_TERMS, _CPU_TERMS), IntentMatch("hottest_cpu", 80)), (lambda: _all(_HOTTEST_TERMS, _RAM_TERMS), IntentMatch("hottest_ram", 80)), (lambda: _all(_HOTTEST_TERMS, _NET_TERMS), IntentMatch("hottest_net", 80)), diff --git a/atlasbot/knowledge/loader.py b/atlasbot/knowledge/loader.py index 8310ed2..52990f8 100644 --- a/atlasbot/knowledge/loader.py +++ b/atlasbot/knowledge/loader.py @@ -7,6 +7,8 @@ log = logging.getLogger(__name__) class KnowledgeBase: + """Load Atlas knowledge-base files and expose summary snippets.""" + def __init__(self, base_dir: str) -> None: self._base = Path(base_dir) if base_dir else None self._atlas: dict[str, Any] = {} @@ -14,6 +16,8 @@ class KnowledgeBase: self._loaded = False def load(self) -> None: + """Load catalog files once so subsequent reads stay cheap.""" + if self._loaded or not self._base: return self._atlas = self._read_json(self._base / "catalog" / "atlas.json") @@ -30,6 +34,8 @@ class KnowledgeBase: return {} def summary(self) -> str: + """Return a short human-readable KB summary for prompt context.""" + self.load() if not self._atlas: return "" @@ -42,12 +48,14 @@ class KnowledgeBase: if services: parts.append(f"Services indexed: {len(services)}.") if isinstance(self._atlas, dict): - keys = [key for key in self._atlas.keys() if key not in {"sources"}] + keys = [key for key in self._atlas if key not in {"sources"}] if keys: parts.append(f"Atlas keys: {', '.join(sorted(keys)[:8])}.") return " ".join(parts) def runbook_titles(self, *, limit: int = 5) -> str: + """Render the top runbook titles for prompt context.""" + self.load() if not self._runbooks: return "" @@ -64,6 +72,8 @@ class KnowledgeBase: return "Relevant runbooks:\n" + "\n".join(titles[:limit]) def runbook_paths(self, *, limit: int = 10) -> list[str]: + """Return the runbook paths used for exact-path enforcement.""" + self.load() if not self._runbooks: return [] @@ -77,6 +87,8 @@ class KnowledgeBase: return paths[:limit] def chunk_lines(self, *, max_files: int = 20, max_chars: int = 6000) -> list[str]: + """Collect KB excerpts into prompt-sized chunks.""" + self.load() if not self._base: return [] diff --git a/atlasbot/llm/client.py b/atlasbot/llm/client.py index 9e4253b..1d33484 100644 --- a/atlasbot/llm/client.py +++ b/atlasbot/llm/client.py @@ -17,6 +17,8 @@ class LLMError(RuntimeError): class LLMClient: + """Wrap the Ollama chat endpoint with retries and fallback-model support.""" + def __init__(self, settings: Settings) -> None: self._settings = settings self._timeout = settings.ollama_timeout_sec @@ -37,6 +39,8 @@ class LLMClient: model: str | None = None, timeout_sec: float | None = None, 
) -> str: + """Send a chat request and return the model content text.""" + payload = { "model": model or self._settings.ollama_model, "messages": messages, @@ -77,6 +81,8 @@ class LLMClient: def build_messages(system: str, prompt: str, *, context: str | None = None) -> list[dict[str, str]]: + """Assemble the minimal chat message list used by the answer pipeline.""" + messages: list[dict[str, str]] = [{"role": "system", "content": system}] if context: messages.append({"role": "user", "content": "Context (grounded facts):\n" + context}) @@ -85,6 +91,8 @@ def build_messages(system: str, prompt: str, *, context: str | None = None) -> l def parse_json(text: str, *, fallback: dict[str, Any] | None = None) -> dict[str, Any]: + """Parse a JSON blob from model output and fall back to a safe default.""" + try: raw = text.strip() if raw.startswith("`"): diff --git a/atlasbot/llm/prompts.py b/atlasbot/llm/prompts.py index 1b987a4..da045d2 100644 --- a/atlasbot/llm/prompts.py +++ b/atlasbot/llm/prompts.py @@ -253,7 +253,7 @@ CONTRADICTION_PROMPT = ( "Question: {question}\n" "Draft: {draft}\n" "FactsUsed:\n{facts}\n\n" - "Return JSON: {\"use_facts\": true|false, \"confidence\": 0-100, \"reason\": \"...\"}" + "Return JSON: {{\"use_facts\": true|false, \"confidence\": 0-100, \"reason\": \"...\"}}" ) CANDIDATE_SELECT_SYSTEM = ( diff --git a/atlasbot/logging.py b/atlasbot/logging.py index aff3867..a42c70a 100644 --- a/atlasbot/logging.py +++ b/atlasbot/logging.py @@ -1,13 +1,17 @@ import json import logging import sys -from datetime import datetime, timezone +from datetime import UTC, datetime class JsonFormatter(logging.Formatter): + """Emit structured log records for the atlasbot services.""" + def format(self, record: logging.LogRecord) -> str: + """Render a log record as JSON for downstream ingestion.""" + payload = { - "timestamp": datetime.now(timezone.utc).isoformat(), + "timestamp": datetime.now(UTC).isoformat(), "level": record.levelname.lower(), "logger": record.name, "message": record.getMessage(), @@ -21,6 +25,8 @@ class JsonFormatter(logging.Formatter): def configure_logging(level: str = "INFO") -> None: + """Install JSON logging on the process root logger.""" + root = logging.getLogger() root.setLevel(level.upper()) handler = logging.StreamHandler(sys.stdout) diff --git a/atlasbot/main.py b/atlasbot/main.py index 3fe0c35..990155f 100644 --- a/atlasbot/main.py +++ b/atlasbot/main.py @@ -17,6 +17,8 @@ log = logging.getLogger(__name__) def _build_engine(settings) -> AnswerEngine: + """Construct the answer engine from the configured backends.""" + kb = KnowledgeBase(settings.kb_dir) snapshot = SnapshotProvider(settings) llm = LLMClient(settings) @@ -24,6 +26,8 @@ def _build_engine(settings) -> AnswerEngine: async def main() -> None: + """Start the HTTP API, Matrix bots, and queue worker.""" + settings = load_settings() configure_logging("INFO") @@ -45,14 +49,7 @@ async def main() -> None: queue = QueueManager(settings, handler) await queue.start() - async def answer_handler( # noqa: PLR0913 - question: str, - mode: str, - history=None, - conversation_id=None, - snapshot_pin: bool | None = None, - observer=None, - ) -> AnswerResult: + async def answer_handler(question: str, mode: str, history=None, conversation_id=None, snapshot_pin: bool | None = None, observer=None) -> AnswerResult: if settings.queue_enabled: payload = await queue.submit( { @@ -86,6 +83,8 @@ async def main() -> None: def result_scores(payload: dict[str, object]) -> AnswerScores: + """Coerce a queue payload into the public 
`AnswerScores` shape.""" + scores = payload.get("scores") if isinstance(payload, dict) else None if isinstance(scores, dict): try: diff --git a/atlasbot/matrix/bot.py b/atlasbot/matrix/bot.py index b79aa16..5a0778a 100644 --- a/atlasbot/matrix/bot.py +++ b/atlasbot/matrix/bot.py @@ -15,11 +15,15 @@ log = logging.getLogger(__name__) class MatrixClient: + """Wrap the Matrix client endpoints used by the bot runtime.""" + def __init__(self, settings: Settings, bot: MatrixBotConfig) -> None: self._settings = settings self._bot = bot async def login(self) -> str: + """Exchange bot credentials for a Matrix access token.""" + payload = { "type": "m.login.password", "identifier": {"type": "m.id.user", "user": self._bot.username}, @@ -33,6 +37,8 @@ class MatrixClient: return data.get("access_token", "") async def resolve_room(self, token: str) -> str: + """Resolve the configured room alias into a room id.""" + alias = quote(self._settings.room_alias, safe="") url = f"{self._settings.matrix_base}/_matrix/client/v3/directory/room/{alias}" headers = {"Authorization": f"Bearer {token}"} @@ -50,12 +56,16 @@ class MatrixClient: return data.get("room_id", "") async def join_room(self, token: str, room_id: str) -> None: + """Join the target room if the bot is not already present.""" + url = f"{self._settings.matrix_base}/_matrix/client/v3/rooms/{room_id}/join" headers = {"Authorization": f"Bearer {token}"} async with httpx.AsyncClient(timeout=15.0) as client: await client.post(url, headers=headers) async def send_message(self, token: str, room_id: str, text: str) -> None: + """Send a plain text message to the Matrix room.""" + url = f"{self._settings.matrix_base}/_matrix/client/v3/rooms/{room_id}/send/m.room.message" headers = {"Authorization": f"Bearer {token}"} payload = {"msgtype": "m.text", "body": text} @@ -63,6 +73,8 @@ class MatrixClient: await client.post(url, json=payload, headers=headers) async def sync(self, token: str, since: str | None) -> dict[str, Any]: + """Fetch the incremental Matrix sync payload.""" + base = f"{self._settings.matrix_base}/_matrix/client/v3/sync" params = {"timeout": 30000} if since: @@ -75,17 +87,9 @@ class MatrixClient: class MatrixBot: - def __init__( - self, - settings: Settings, - bot: MatrixBotConfig, - engine: AnswerEngine, - answer_handler: Callable[ - [str, str, list[dict[str, str]] | None, str | None, Callable[[str, str], None] | None], - Awaitable[AnswerResult], - ] - | None = None, - ) -> None: + """Drive Matrix conversation handling and heartbeat replies.""" + + def __init__(self, settings: Settings, bot: MatrixBotConfig, engine: AnswerEngine, answer_handler: Callable[[str, str, list[dict[str, str]] | None, str | None, Callable[[str, str], None] | None], Awaitable[AnswerResult]] | None = None) -> None: self._settings = settings self._bot = bot self._engine = engine @@ -94,6 +98,8 @@ class MatrixBot: self._history: dict[str, list[dict[str, str]]] = {} async def run(self) -> None: + """Continuously bootstrap, sync, and answer Matrix events.""" + while True: try: token = await self._client.login() diff --git a/atlasbot/queue/nats.py b/atlasbot/queue/nats.py index 418a8b2..6d83cd5 100644 --- a/atlasbot/queue/nats.py +++ b/atlasbot/queue/nats.py @@ -1,7 +1,8 @@ import asyncio import json import logging -from typing import Any, Awaitable, Callable +from collections.abc import Awaitable, Callable +from typing import Any from nats.aio.client import Client as NATS from nats.js.errors import NotFoundError @@ -12,6 +13,8 @@ log = logging.getLogger(__name__) class 
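Editor's note: for orientation between these two diffs, the MatrixClient methods above chain together in a login, resolve, join, send sequence during bot bootstrap (as run() does with its sync loop). A minimal usage sketch follows; the `matrix_bots` field name on Settings is a guess, and error handling is omitted.

import asyncio

from atlasbot.config import load_settings
from atlasbot.matrix.bot import MatrixClient


async def demo() -> None:
    settings = load_settings()
    bot = settings.matrix_bots[0]  # hypothetical field holding the configured MatrixBotConfig entries
    client = MatrixClient(settings, bot)
    token = await client.login()
    room_id = await client.resolve_room(token)
    await client.join_room(token, room_id)
    await client.send_message(token, room_id, "atlasbot online")


asyncio.run(demo())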
diff --git a/atlasbot/queue/nats.py b/atlasbot/queue/nats.py
index 418a8b2..6d83cd5 100644
--- a/atlasbot/queue/nats.py
+++ b/atlasbot/queue/nats.py
@@ -1,7 +1,8 @@
 import asyncio
 import json
 import logging
-from typing import Any, Awaitable, Callable
+from collections.abc import Awaitable, Callable
+from typing import Any
 
 from nats.aio.client import Client as NATS
 from nats.js.errors import NotFoundError
@@ -12,6 +13,8 @@ log = logging.getLogger(__name__)
 
 
 class QueueManager:
+    """Manage optional NATS-backed work queue processing."""
+
     def __init__(self, settings: Settings, handler: Callable[[dict[str, Any]], Awaitable[dict[str, Any]]]) -> None:
         self._settings = settings
         self._handler = handler
@@ -20,6 +23,8 @@ class QueueManager:
         self._worker_task: asyncio.Task | None = None
 
     async def start(self) -> None:
+        """Connect to NATS and start the worker loop when queueing is enabled."""
+
         if not self._settings.queue_enabled:
             return
         self._nc = NATS()
@@ -29,12 +34,16 @@ class QueueManager:
         self._worker_task = asyncio.create_task(self._worker_loop())
 
     async def stop(self) -> None:
+        """Drain the NATS connection and cancel background work."""
+
         if self._worker_task:
             self._worker_task.cancel()
         if self._nc:
             await self._nc.drain()
 
     async def submit(self, payload: dict[str, Any]) -> dict[str, Any]:
+        """Submit work to NATS or fall back to direct handling."""
+
         if not self._settings.queue_enabled:
             return await self._handler(payload)
         if not self._nc or not self._js:
diff --git a/atlasbot/snapshot/builder.py b/atlasbot/snapshot/builder.py
deleted file mode 100644
index 7543197..0000000
--- a/atlasbot/snapshot/builder.py
+++ /dev/null
@@ -1,1992 +0,0 @@
-import logging
-import time
-from typing import Any
-
-import httpx
-
-from atlasbot.config import Settings
-
-log = logging.getLogger(__name__)
-
-PVC_USAGE_CRITICAL = 90
-
-_BYTES_KB = 1024
-_BYTES_MB = 1024 * 1024
-_BYTES_GB = 1024 * 1024 * 1024
-_VALUE_PAIR_LEN = 2
-
-
-class SnapshotProvider:
-    def __init__(self, settings: Settings) -> None:
-        self._settings = settings
-        self._cache: dict[str, Any] = {}
-        self._cache_ts = 0.0
-
-    def _cache_valid(self) -> bool:
-        return time.monotonic() - self._cache_ts < max(5, self._settings.snapshot_ttl_sec)
-
-    def get(self) -> dict[str, Any] | None:
-        if self._cache and self._cache_valid():
-            return self._cache
-        if not self._settings.ariadne_state_url:
-            return self._cache or None
-        headers = {}
-        if self._settings.ariadne_state_token:
-            headers["x-internal-token"] = self._settings.ariadne_state_token
-        try:
-            resp = httpx.get(self._settings.ariadne_state_url, headers=headers, timeout=10.0)
-            resp.raise_for_status()
-            payload = resp.json()
-            if isinstance(payload, dict):
-                self._cache = payload
-                self._cache_ts = time.monotonic()
-                return payload
-        except Exception as exc:
-            log.warning("snapshot fetch failed", extra={"extra": {"error": str(exc)}})
-        return self._cache or None
-
-
-def _node_usage_top(series: list[dict[str, Any]]) -> dict[str, Any] | None:
-    best = None
-    for entry in series or []:
-        if not isinstance(entry, dict):
-            continue
-        node = entry.get("node")
-        value = entry.get("value")
-        try:
-            numeric = float(value)
-        except (TypeError, ValueError):
-            continue
-        if best is None or numeric > best["value"]:
-            best = {"node": node, "value": numeric}
-    return best
-
-
-def build_summary(snapshot: dict[str, Any] | None) -> dict[str, Any]:
-    if not snapshot:
-        return {}
-    nodes_detail = _nodes_detail(snapshot)
-    metrics = _metrics(snapshot)
-    summary: dict[str, Any] = {}
-
-    if isinstance(snapshot.get("nodes_summary"), dict):
-        summary["nodes_summary"] = snapshot.get("nodes_summary")
-    if metrics:
-        summary["metrics"] = metrics
-    if isinstance(snapshot.get("jobs"), dict):
-        summary["jobs"] = snapshot.get("jobs")
-    summary.update(_build_nodes(snapshot))
-    summary.update(_build_pressure(snapshot))
-    summary.update(_build_hardware(nodes_detail))
-    summary.update(_build_hardware_by_node(nodes_detail))
-    summary.update(_build_hardware_usage(metrics, 
summary.get("hardware_by_node"))) - summary.update(_build_node_facts(nodes_detail)) - summary.update(_build_node_ages(nodes_detail)) - summary.update(_build_node_taints(nodes_detail)) - summary.update(_build_capacity(metrics)) - summary.update(_build_pods(metrics)) - summary.update(_build_namespace_pods(snapshot)) - summary.update(_build_namespace_nodes(snapshot)) - summary.update(_build_node_pods(snapshot)) - summary.update(_build_node_pods_top(metrics)) - summary.update(_build_pod_issues(snapshot)) - summary.update(_build_workload_health(snapshot)) - summary.update(_build_events(snapshot)) - summary.update(_build_event_summary(snapshot)) - summary.update(_build_postgres(metrics)) - summary.update(_build_hottest(metrics)) - summary.update(_build_pvc(metrics)) - summary.update(_build_namespace_capacity(metrics)) - summary.update(_build_namespace_capacity_summary(metrics)) - summary.update(_build_longhorn(snapshot)) - summary.update(_build_root_disk_headroom(metrics)) - summary.update(_build_node_load(metrics)) - summary.update(_build_node_load_summary(metrics)) - summary.update(_build_cluster_watchlist(summary)) - summary.update(_build_workloads(snapshot)) - summary.update(_build_flux(snapshot)) - _merge_cluster_summary(snapshot, summary) - _augment_lexicon(summary) - return summary - - -def _merge_cluster_summary(snapshot: dict[str, Any], summary: dict[str, Any]) -> None: - cluster_summary = snapshot.get("summary") if isinstance(snapshot.get("summary"), dict) else {} - if not cluster_summary: - return - _merge_cluster_fields( - summary, - cluster_summary, - { - "signals": list, - "profiles": dict, - "inventory": dict, - "topology": dict, - "lexicon": dict, - "cross_stats": dict, - "baseline_deltas": dict, - "pod_issue_summary": dict, - "trend_requests": dict, - "pod_waiting_trends": dict, - "pod_terminated_trends": dict, - }, - ) - - -def _merge_cluster_fields( - summary: dict[str, Any], - cluster_summary: dict[str, Any], - field_types: dict[str, type], -) -> None: - for key, expected in field_types.items(): - value = cluster_summary.get(key) - if isinstance(value, expected): - summary[key] = value - - -def _augment_lexicon(summary: dict[str, Any]) -> None: - lexicon = summary.get("lexicon") - if not isinstance(lexicon, dict): - lexicon = {"terms": [], "aliases": {}} - terms = list(lexicon.get("terms") or []) - aliases = dict(lexicon.get("aliases") or {}) - hardware = summary.get("hardware") if isinstance(summary.get("hardware"), dict) else {} - hardware_map = { - "rpi5": "Raspberry Pi 5 nodes", - "rpi4": "Raspberry Pi 4 nodes", - "rpi": "Raspberry Pi nodes", - "jetson": "NVIDIA Jetson nodes", - "amd64": "AMD64 nodes", - } - existing_terms = {entry.get("term") for entry in terms if isinstance(entry, dict)} - for key, meaning in hardware_map.items(): - if key not in hardware: - continue - if key not in existing_terms: - terms.append({"term": key, "meaning": meaning}) - if key not in aliases: - aliases[key] = meaning - if "raspberry pi 5" not in aliases and "rpi5" in hardware: - aliases["raspberry pi 5"] = "rpi5" - if "raspberry pi 4" not in aliases and "rpi4" in hardware: - aliases["raspberry pi 4"] = "rpi4" - lexicon["terms"] = terms - lexicon["aliases"] = aliases - summary["lexicon"] = lexicon - - -def _nodes_detail(snapshot: dict[str, Any]) -> list[dict[str, Any]]: - items = snapshot.get("nodes_detail") - return items if isinstance(items, list) else [] - - -def _metrics(snapshot: dict[str, Any]) -> dict[str, Any]: - metrics = snapshot.get("metrics") - return metrics if 
isinstance(metrics, dict) else {} - - -def _build_nodes(snapshot: dict[str, Any]) -> dict[str, Any]: - nodes_summary = snapshot.get("nodes_summary") if isinstance(snapshot.get("nodes_summary"), dict) else {} - if not nodes_summary: - return {} - return { - "nodes": { - "total": nodes_summary.get("total"), - "ready": nodes_summary.get("ready"), - "not_ready": nodes_summary.get("not_ready"), - } - } - - -def _build_pressure(snapshot: dict[str, Any]) -> dict[str, Any]: - nodes_summary = snapshot.get("nodes_summary") if isinstance(snapshot.get("nodes_summary"), dict) else {} - pressure = nodes_summary.get("pressure_nodes") if isinstance(nodes_summary.get("pressure_nodes"), dict) else {} - if not pressure: - return {} - return {"pressure_nodes": pressure} - - -def _build_hardware(nodes_detail: list[dict[str, Any]]) -> dict[str, Any]: - hardware: dict[str, list[str]] = {} - for node in nodes_detail or []: - if not isinstance(node, dict): - continue - name = node.get("name") - hardware_class = node.get("hardware") or "unknown" - if name: - hardware.setdefault(hardware_class, []).append(name) - if not hardware: - return {} - return {"hardware": {key: sorted(value) for key, value in hardware.items()}} - - -def _build_hardware_by_node(nodes_detail: list[dict[str, Any]]) -> dict[str, Any]: - mapping: dict[str, str] = {} - for node in nodes_detail or []: - if not isinstance(node, dict): - continue - name = node.get("name") - if isinstance(name, str) and name: - hardware = node.get("hardware") or "unknown" - mapping[name] = str(hardware) - return {"hardware_by_node": mapping} if mapping else {} - - -def _build_hardware_usage( # noqa: C901 - metrics: dict[str, Any], - hardware_by_node: dict[str, Any] | None, -) -> dict[str, Any]: - if not isinstance(hardware_by_node, dict) or not hardware_by_node: - return {} - node_load = metrics.get("node_load") if isinstance(metrics.get("node_load"), list) else [] - if not node_load: - return {} - buckets: dict[str, dict[str, list[float]]] = {} - for entry in node_load: - if not isinstance(entry, dict): - continue - node = entry.get("node") - if not isinstance(node, str) or not node: - continue - hardware = hardware_by_node.get(node, "unknown") - bucket = buckets.setdefault(str(hardware), {"load_index": [], "cpu": [], "ram": [], "net": [], "io": []}) - for key in ("load_index", "cpu", "ram", "net", "io"): - value = entry.get(key) - if isinstance(value, (int, float)): - bucket[key].append(float(value)) - output: list[dict[str, Any]] = [] - for hardware, metrics_bucket in buckets.items(): - row: dict[str, Any] = {"hardware": hardware} - for key, values in metrics_bucket.items(): - if values: - row[key] = sum(values) / len(values) - output.append(row) - output.sort(key=lambda item: (-(item.get("load_index") or 0), item.get("hardware") or "")) - return {"hardware_usage_avg": output} - - -def _build_node_ages(nodes_detail: list[dict[str, Any]]) -> dict[str, Any]: - ages: list[dict[str, Any]] = [] - for node in nodes_detail or []: - if not isinstance(node, dict): - continue - name = node.get("name") - age = node.get("age_hours") - if name and isinstance(age, (int, float)): - ages.append({"name": name, "age_hours": age}) - ages.sort(key=lambda item: -(item.get("age_hours") or 0)) - return {"node_ages": ages[:5]} if ages else {} - - -def _count_values(nodes_detail: list[dict[str, Any]], key: str) -> dict[str, int]: - counts: dict[str, int] = {} - for node in nodes_detail or []: - if not isinstance(node, dict): - continue - value = node.get(key) - if isinstance(value, str) 
and value: - counts[value] = counts.get(value, 0) + 1 - return counts - - -def _build_node_facts(nodes_detail: list[dict[str, Any]]) -> dict[str, Any]: - if not nodes_detail: - return {} - role_counts: dict[str, int] = {} - for node in nodes_detail: - if not isinstance(node, dict): - continue - if node.get("is_worker"): - role_counts["worker"] = role_counts.get("worker", 0) + 1 - roles = node.get("roles") - if isinstance(roles, list): - for role in roles: - if isinstance(role, str) and role: - role_counts[role] = role_counts.get(role, 0) + 1 - return { - "node_arch_counts": _count_values(nodes_detail, "arch"), - "node_os_counts": _count_values(nodes_detail, "os"), - "node_kubelet_versions": _count_values(nodes_detail, "kubelet"), - "node_kernel_versions": _count_values(nodes_detail, "kernel"), - "node_runtime_versions": _count_values(nodes_detail, "container_runtime"), - "node_role_counts": role_counts, - } - - -def _build_node_taints(nodes_detail: list[dict[str, Any]]) -> dict[str, Any]: - taints: dict[str, list[str]] = {} - for node in nodes_detail or []: - if not isinstance(node, dict): - continue - name = node.get("name") - if not name: - continue - entries = node.get("taints") if isinstance(node.get("taints"), list) else [] - for entry in entries: - if not isinstance(entry, dict): - continue - key = entry.get("key") - effect = entry.get("effect") - if isinstance(key, str) and isinstance(effect, str): - label = f"{key}:{effect}" - taints.setdefault(label, []).append(name) - if not taints: - return {} - return {"node_taints": {key: sorted(names) for key, names in taints.items()}} - - -def _build_root_disk_headroom(metrics: dict[str, Any]) -> dict[str, Any]: - node_usage = metrics.get("node_usage") if isinstance(metrics.get("node_usage"), dict) else {} - disk = node_usage.get("disk") if isinstance(node_usage.get("disk"), list) else [] - if not disk: - return {} - entries = [] - for entry in disk: - if not isinstance(entry, dict): - continue - node = entry.get("node") - try: - used_pct = float(entry.get("value")) - except (TypeError, ValueError): - continue - headroom = max(0.0, 100.0 - used_pct) - if node: - entries.append({"node": node, "headroom_pct": headroom, "used_pct": used_pct}) - entries.sort(key=lambda item: (item.get("headroom_pct") or 0.0, item.get("node") or "")) - return {"root_disk_low_headroom": entries[:5]} if entries else {} - - -def _build_longhorn(snapshot: dict[str, Any]) -> dict[str, Any]: - longhorn = snapshot.get("longhorn") - return {"longhorn": longhorn} if isinstance(longhorn, dict) and longhorn else {} - - -def _build_node_load(metrics: dict[str, Any]) -> dict[str, Any]: - node_load = metrics.get("node_load") - if not isinstance(node_load, list) or not node_load: - return {} - return {"node_load": node_load} - - -def _build_pods(metrics: dict[str, Any]) -> dict[str, Any]: - pods = { - "running": metrics.get("pods_running"), - "pending": metrics.get("pods_pending"), - "failed": metrics.get("pods_failed"), - "succeeded": metrics.get("pods_succeeded"), - } - if not any(value is not None for value in pods.values()): - return {} - return {"pods": pods} - - -def _build_capacity(metrics: dict[str, Any]) -> dict[str, Any]: - if not metrics: - return {} - capacity = { - "cpu": metrics.get("capacity_cpu"), - "allocatable_cpu": metrics.get("allocatable_cpu"), - "mem_bytes": metrics.get("capacity_mem_bytes"), - "allocatable_mem_bytes": metrics.get("allocatable_mem_bytes"), - "pods": metrics.get("capacity_pods"), - "allocatable_pods": metrics.get("allocatable_pods"), - } - 
if not any(value is not None for value in capacity.values()): - return {} - return {"capacity": capacity} - - -def _build_namespace_pods(snapshot: dict[str, Any]) -> dict[str, Any]: - namespaces = snapshot.get("namespace_pods") - if not isinstance(namespaces, list) or not namespaces: - return {} - return {"namespace_pods": namespaces} - - -def _build_namespace_nodes(snapshot: dict[str, Any]) -> dict[str, Any]: - namespace_nodes = snapshot.get("namespace_nodes") - if not isinstance(namespace_nodes, list) or not namespace_nodes: - return {} - return {"namespace_nodes": namespace_nodes} - - -def _build_node_pods(snapshot: dict[str, Any]) -> dict[str, Any]: - node_pods = snapshot.get("node_pods") - if not isinstance(node_pods, list) or not node_pods: - return {} - return {"node_pods": node_pods} - - -def _build_node_pods_top(metrics: dict[str, Any]) -> dict[str, Any]: - top = metrics.get("node_pods_top") - if not isinstance(top, list) or not top: - return {} - return {"node_pods_top": top} - - -def _build_pod_issues(snapshot: dict[str, Any]) -> dict[str, Any]: - pod_issues = snapshot.get("pod_issues") - if not isinstance(pod_issues, dict) or not pod_issues: - return {} - return {"pod_issues": pod_issues} - - -def _build_workload_health(snapshot: dict[str, Any]) -> dict[str, Any]: - health = snapshot.get("workloads_health") - if not isinstance(health, dict) or not health: - return {} - deployments = health.get("deployments") - statefulsets = health.get("statefulsets") - daemonsets = health.get("daemonsets") - if not isinstance(deployments, dict) or not isinstance(statefulsets, dict) or not isinstance(daemonsets, dict): - return {} - return { - "workloads_health": { - "deployments": deployments, - "statefulsets": statefulsets, - "daemonsets": daemonsets, - } - } - - -def _build_events(snapshot: dict[str, Any]) -> dict[str, Any]: - events = snapshot.get("events") - if not isinstance(events, dict) or not events: - return {} - return {"events": events} - - -def _build_event_summary(snapshot: dict[str, Any]) -> dict[str, Any]: - events = snapshot.get("events") - if not isinstance(events, dict) or not events: - return {} - summary = {} - if isinstance(events.get("warnings_top_reason"), dict): - summary["warnings_top_reason"] = events.get("warnings_top_reason") - if events.get("warnings_latest"): - summary["warnings_latest"] = events.get("warnings_latest") - return {"event_summary": summary} if summary else {} - - -def _build_postgres(metrics: dict[str, Any]) -> dict[str, Any]: - postgres = metrics.get("postgres_connections") if isinstance(metrics.get("postgres_connections"), dict) else {} - if not postgres: - return {} - return { - "postgres": { - "used": postgres.get("used"), - "max": postgres.get("max"), - "hottest_db": postgres.get("hottest_db"), - "by_db": postgres.get("by_db"), - } - } - - -def _build_hottest(metrics: dict[str, Any]) -> dict[str, Any]: - node_usage = metrics.get("node_usage") if isinstance(metrics.get("node_usage"), dict) else {} - hottest: dict[str, Any] = {} - for key in ("cpu", "ram", "net", "io", "disk"): - top = _node_usage_top(node_usage.get(key, [])) - if top: - hottest[key] = top - if not hottest: - return {} - return {"hottest": hottest} - - -def _build_pvc(metrics: dict[str, Any]) -> dict[str, Any]: - pvc_usage = metrics.get("pvc_usage_top") if isinstance(metrics.get("pvc_usage_top"), list) else [] - if not pvc_usage: - return {} - return {"pvc_usage_top": pvc_usage} - - -def _build_namespace_capacity(metrics: dict[str, Any]) -> dict[str, Any]: - capacity = 
metrics.get("namespace_capacity") - if not isinstance(capacity, list) or not capacity: - return {} - return {"namespace_capacity": capacity} - - -def _build_namespace_capacity_summary(metrics: dict[str, Any]) -> dict[str, Any]: - summary = metrics.get("namespace_capacity_summary") - if not isinstance(summary, dict) or not summary: - return {} - return {"namespace_capacity_summary": summary} - - -def _build_node_load_summary(metrics: dict[str, Any]) -> dict[str, Any]: - summary = metrics.get("node_load_summary") - if not isinstance(summary, dict) or not summary: - return {} - return {"node_load_summary": summary} - - -def _build_workloads(snapshot: dict[str, Any]) -> dict[str, Any]: - workloads = snapshot.get("workloads") if isinstance(snapshot.get("workloads"), list) else [] - return {"workloads": workloads} - - -def _build_flux(snapshot: dict[str, Any]) -> dict[str, Any]: - flux = snapshot.get("flux") if isinstance(snapshot.get("flux"), dict) else {} - return {"flux": flux} - - -def _format_float(value: Any) -> str: - try: - numeric = float(value) - except (TypeError, ValueError): - return str(value) - return f"{numeric:.2f}".rstrip("0").rstrip(".") - - -def _format_rate_bytes(value: Any) -> str: - try: - numeric = float(value) - except (TypeError, ValueError): - return str(value) - if numeric >= _BYTES_MB: - return f"{numeric / _BYTES_MB:.2f} MB/s" - if numeric >= _BYTES_KB: - return f"{numeric / _BYTES_KB:.2f} KB/s" - return f"{numeric:.2f} B/s" - - -def _format_bytes(value: Any) -> str: - try: - numeric = float(value) - except (TypeError, ValueError): - return str(value) - if numeric >= _BYTES_GB: - return f"{numeric / _BYTES_GB:.2f} GB" - if numeric >= _BYTES_MB: - return f"{numeric / _BYTES_MB:.2f} MB" - if numeric >= _BYTES_KB: - return f"{numeric / _BYTES_KB:.2f} KB" - return f"{numeric:.2f} B" - - -def _format_kv_map(values: dict[str, Any]) -> str: - parts = [] - for key, value in values.items(): - parts.append(f"{key}={value}") - return ", ".join(parts) - - -def _format_names(names: list[str]) -> str: - if not names: - return "" - return ", ".join(sorted(names)) - - -def _append_nodes(lines: list[str], summary: dict[str, Any]) -> None: # noqa: C901 - nodes = summary.get("nodes") if isinstance(summary.get("nodes"), dict) else {} - if not nodes: - return - workers = {} - if isinstance(summary.get("nodes_summary"), dict): - workers = summary["nodes_summary"].get("workers") or {} - workers_total = workers.get("total") - workers_ready = workers.get("ready") - workers_str = "" - if workers_total is not None and workers_ready is not None: - workers_str = f", workers_ready={workers_ready}/{workers_total}" - total = nodes.get("total") - ready = nodes.get("ready") - not_ready = nodes.get("not_ready") - if not_ready is None: - not_ready = 0 - lines.append( - "nodes: total={total}, ready={ready}, not_ready={not_ready}{workers}".format( - total=total, - ready=ready, - not_ready=not_ready, - workers=workers_str, - ) - ) - if total is not None: - lines.append(f"nodes_total: {total}") - if ready is not None: - lines.append(f"nodes_ready: {ready}") - if not_ready is not None: - lines.append(f"nodes_not_ready_count: {not_ready}") - if not isinstance(summary.get("nodes_summary"), dict): - return - not_ready_names = summary["nodes_summary"].get("not_ready_names") or [] - if not_ready_names: - lines.append("nodes_not_ready: " + _format_names(not_ready_names)) - by_arch = summary["nodes_summary"].get("by_arch") or {} - if isinstance(by_arch, dict) and by_arch: - lines.append("archs: " + 
_format_kv_map(by_arch)) - by_role = summary["nodes_summary"].get("by_role") or {} - if isinstance(by_role, dict) and by_role: - lines.append("roles: " + _format_kv_map(by_role)) - - -def _append_hardware(lines: list[str], summary: dict[str, Any]) -> None: - hardware = summary.get("hardware") if isinstance(summary.get("hardware"), dict) else {} - if not hardware: - return - parts = [] - for key, names in hardware.items(): - if not isinstance(names, list): - continue - label = f"{key}={len(names)}" - name_list = _format_names([str(name) for name in names if name]) - if name_list: - label = f"{label} ({name_list})" - parts.append(label) - if parts: - lines.append("hardware: " + "; ".join(sorted(parts))) - - -def _append_hardware_groups(lines: list[str], summary: dict[str, Any]) -> None: - hardware = summary.get("hardware") if isinstance(summary.get("hardware"), dict) else {} - if not hardware: - return - parts = [] - for key, names in hardware.items(): - if not isinstance(names, list): - continue - name_list = _format_names([str(name) for name in names if name]) - if name_list: - parts.append(f"{key}={name_list}") - if parts: - lines.append("hardware_nodes: " + "; ".join(sorted(parts))) - - -def _append_node_ages(lines: list[str], summary: dict[str, Any]) -> None: - ages = summary.get("node_ages") if isinstance(summary.get("node_ages"), list) else [] - if not ages: - return - parts = [] - for entry in ages[:3]: - if not isinstance(entry, dict): - continue - name = entry.get("name") - age = entry.get("age_hours") - if name and isinstance(age, (int, float)): - parts.append(f"{name}={_format_float(age)}h") - if parts: - lines.append("node_age_top: " + "; ".join(parts)) - - -def _append_node_taints(lines: list[str], summary: dict[str, Any]) -> None: - taints = summary.get("node_taints") if isinstance(summary.get("node_taints"), dict) else {} - if not taints: - return - parts = [] - for key, names in taints.items(): - if not isinstance(names, list): - continue - name_list = _format_names([str(name) for name in names if name]) - parts.append(f"{key}={len(names)} ({name_list})" if name_list else f"{key}={len(names)}") - if parts: - lines.append("node_taints: " + "; ".join(sorted(parts))) - - -def _append_node_facts(lines: list[str], summary: dict[str, Any]) -> None: - def top_counts(label: str, counts: dict[str, int], limit: int = 4) -> None: - if not counts: - return - top = sorted(counts.items(), key=lambda item: (-item[1], item[0]))[:limit] - rendered = "; ".join([f"{name}={count}" for name, count in top]) - if rendered: - lines.append(f"{label}: {rendered}") - - top_counts("node_arch", summary.get("node_arch_counts") or {}) - top_counts("node_os", summary.get("node_os_counts") or {}) - top_counts("node_kubelet_versions", summary.get("node_kubelet_versions") or {}) - top_counts("node_kernel_versions", summary.get("node_kernel_versions") or {}) - top_counts("node_runtime_versions", summary.get("node_runtime_versions") or {}) - top_counts("node_roles", summary.get("node_role_counts") or {}) - - -def _append_pressure(lines: list[str], summary: dict[str, Any]) -> None: - pressure = summary.get("pressure_nodes") - if not isinstance(pressure, dict) or not pressure: - return - parts = [] - for cond, nodes in sorted(pressure.items()): - if not nodes: - continue - name_list = _format_names([str(name) for name in nodes if name]) - parts.append(f"{cond}={len(nodes)} ({name_list})" if name_list else f"{cond}={len(nodes)}") - if parts: - lines.append("node_pressure: " + "; ".join(parts)) - - -def 
_append_pods(lines: list[str], summary: dict[str, Any]) -> None: - pods = summary.get("pods") if isinstance(summary.get("pods"), dict) else {} - if not pods: - return - lines.append( - "pods: running={running}, pending={pending}, failed={failed}, succeeded={succeeded}".format( - running=pods.get("running"), - pending=pods.get("pending"), - failed=pods.get("failed"), - succeeded=pods.get("succeeded"), - ) - ) - - -def _append_capacity(lines: list[str], summary: dict[str, Any]) -> None: - capacity = summary.get("capacity") if isinstance(summary.get("capacity"), dict) else {} - if not capacity: - return - parts = [] - if capacity.get("cpu") is not None: - parts.append(f"cpu={_format_float(capacity.get('cpu'))}") - if capacity.get("allocatable_cpu") is not None: - parts.append(f"alloc_cpu={_format_float(capacity.get('allocatable_cpu'))}") - if capacity.get("mem_bytes") is not None: - parts.append(f"mem={_format_bytes(capacity.get('mem_bytes'))}") - if capacity.get("allocatable_mem_bytes") is not None: - parts.append(f"alloc_mem={_format_bytes(capacity.get('allocatable_mem_bytes'))}") - if capacity.get("pods") is not None: - parts.append(f"pods={_format_float(capacity.get('pods'))}") - if capacity.get("allocatable_pods") is not None: - parts.append(f"alloc_pods={_format_float(capacity.get('allocatable_pods'))}") - if parts: - lines.append("capacity: " + "; ".join(parts)) - - -def _append_namespace_pods(lines: list[str], summary: dict[str, Any]) -> None: - namespaces = summary.get("namespace_pods") - if not isinstance(namespaces, list) or not namespaces: - return - top = sorted( - (item for item in namespaces if isinstance(item, dict)), - key=lambda item: (-int(item.get("pods_total") or 0), item.get("namespace") or ""), - )[:8] - parts = [] - for item in top: - name = item.get("namespace") - total = item.get("pods_total") - running = item.get("pods_running") - if not name: - continue - label = f"{name}={total}" - if running is not None: - label = f"{label} (running={running})" - parts.append(label) - if parts: - lines.append("namespaces_top: " + "; ".join(parts)) - - -def _append_namespace_nodes(lines: list[str], summary: dict[str, Any]) -> None: - namespace_nodes = summary.get("namespace_nodes") - if not isinstance(namespace_nodes, list) or not namespace_nodes: - return - top = sorted( - (item for item in namespace_nodes if isinstance(item, dict)), - key=lambda item: (-int(item.get("pods_total") or 0), item.get("namespace") or ""), - )[:8] - parts = [] - for item in top: - namespace = item.get("namespace") - pods_total = item.get("pods_total") - primary = item.get("primary_node") - if namespace: - label = f"{namespace}={pods_total}" - if primary: - label = f"{label} (primary={primary})" - parts.append(label) - if parts: - lines.append("namespace_nodes_top: " + "; ".join(parts)) - - -def _append_node_pods(lines: list[str], summary: dict[str, Any]) -> None: # noqa: C901, PLR0912 - node_pods = summary.get("node_pods") - if not isinstance(node_pods, list) or not node_pods: - return - top = sorted( - (item for item in node_pods if isinstance(item, dict)), - key=lambda item: (-int(item.get("pods_total") or 0), item.get("node") or ""), - )[:8] - max_entry = None - for entry in node_pods: - if not isinstance(entry, dict): - continue - pods_total = entry.get("pods_total") - try: - pods_value = int(pods_total) - except (TypeError, ValueError): - continue - if max_entry is None or pods_value > max_entry["pods_total"]: - max_entry = { - "node": entry.get("node"), - "pods_total": pods_value, - 
"namespaces_top": entry.get("namespaces_top") or [], - } - parts = [] - for item in top: - node = item.get("node") - pods_total = item.get("pods_total") - namespaces = item.get("namespaces_top") or [] - ns_label = "" - if namespaces: - ns_label = ", ".join([f"{name}={count}" for name, count in namespaces]) - if node: - label = f"{node}={pods_total}" - if ns_label: - label = f"{label} ({ns_label})" - parts.append(label) - if parts: - lines.append("node_pods_top: " + "; ".join(parts)) - if max_entry and isinstance(max_entry.get("node"), str): - ns_label = "" - namespaces = max_entry.get("namespaces_top") or [] - if namespaces: - ns_label = ", ".join([f"{name}={count}" for name, count in namespaces]) - label = f"{max_entry.get('node')}={max_entry.get('pods_total')}" - if ns_label: - label = f"{label} ({ns_label})" - lines.append("node_pods_max: " + label) - for item in top: - node = item.get("node") - namespaces = item.get("namespaces_top") or [] - if not node or not namespaces: - continue - ns_label = ", ".join([f"{name}={count}" for name, count in namespaces]) - lines.append(f"node_namespaces_top: {node} ({ns_label})") - - -def _append_pod_issues(lines: list[str], summary: dict[str, Any]) -> None: - pod_issues = summary.get("pod_issues") if isinstance(summary.get("pod_issues"), dict) else {} - if not pod_issues: - return - counts_line = _format_pod_issue_counts(pod_issues) - if counts_line: - lines.append(counts_line) - top_line = _format_pod_issue_top(pod_issues) - if top_line: - lines.append(top_line) - pending_line = _format_pod_pending_oldest(pod_issues) - if pending_line: - lines.append(pending_line) - pending_over_line = _format_pod_pending_over_15m(pod_issues) - if pending_over_line: - lines.append(pending_over_line) - reasons_line = _format_pod_waiting_reasons(pod_issues) - if reasons_line: - lines.append(reasons_line) - - -def _format_pod_issue_counts(pod_issues: dict[str, Any]) -> str: - counts = pod_issues.get("counts") if isinstance(pod_issues.get("counts"), dict) else {} - if not counts: - return "" - parts = [] - for key in ("Failed", "Pending", "Unknown"): - if key in counts: - parts.append(f"{key}={counts.get(key)}") - return "pod_issues: " + "; ".join(parts) if parts else "" - - -def _format_pod_issue_top(pod_issues: dict[str, Any]) -> str: - items = pod_issues.get("items") if isinstance(pod_issues.get("items"), list) else [] - if not items: - return "" - top = [] - for item in items[:5]: - if not isinstance(item, dict): - continue - namespace = item.get("namespace") - pod = item.get("pod") - if not namespace or not pod: - continue - phase = item.get("phase") or "" - restarts = item.get("restarts") or 0 - top.append(f"{namespace}/{pod}({phase},r={restarts})") - return "pod_issues_top: " + "; ".join(top) if top else "" - - -def _format_pod_pending_oldest(pod_issues: dict[str, Any]) -> str: - pending = pod_issues.get("pending_oldest") if isinstance(pod_issues.get("pending_oldest"), list) else [] - if not pending: - return "" - parts = [] - for item in pending[:5]: - if not isinstance(item, dict): - continue - namespace = item.get("namespace") - pod = item.get("pod") - age = item.get("age_hours") - reason = item.get("reason") or "" - if namespace and pod and age is not None: - label = f"{namespace}/{pod}={_format_float(age)}h" - if reason: - label = f"{label} ({reason})" - parts.append(label) - return "pods_pending_oldest: " + "; ".join(parts) if parts else "" - - -def _format_pod_waiting_reasons(pod_issues: dict[str, Any]) -> str: - reasons = pod_issues.get("waiting_reasons") 
if isinstance(pod_issues.get("waiting_reasons"), dict) else {} - if not reasons: - return "" - pairs = sorted(reasons.items(), key=lambda item: (-item[1], item[0]))[:5] - return "pod_waiting_reasons: " + "; ".join([f"{key}={val}" for key, val in pairs]) - - -def _format_pod_pending_over_15m(pod_issues: dict[str, Any]) -> str: - count = pod_issues.get("pending_over_15m") - if count is None: - return "" - try: - count_val = int(count) - except (TypeError, ValueError): - return "" - return f"pods_pending_over_15m: {count_val}" - - -def _append_workload_health(lines: list[str], summary: dict[str, Any]) -> None: - health = summary.get("workloads_health") if isinstance(summary.get("workloads_health"), dict) else {} - if not health: - return - deployments = health.get("deployments") if isinstance(health.get("deployments"), dict) else {} - statefulsets = health.get("statefulsets") if isinstance(health.get("statefulsets"), dict) else {} - daemonsets = health.get("daemonsets") if isinstance(health.get("daemonsets"), dict) else {} - total_not_ready = 0 - for entry in (deployments, statefulsets, daemonsets): - total_not_ready += int(entry.get("not_ready") or 0) - lines.append( - "workloads_not_ready: " - f"deployments={deployments.get('not_ready', 0)}, " - f"statefulsets={statefulsets.get('not_ready', 0)}, " - f"daemonsets={daemonsets.get('not_ready', 0)} " - f"(total={total_not_ready})" - ) - - -def _append_node_usage_stats(lines: list[str], summary: dict[str, Any]) -> None: - metrics = summary.get("metrics") if isinstance(summary.get("metrics"), dict) else {} - stats = metrics.get("node_usage_stats") if isinstance(metrics.get("node_usage_stats"), dict) else {} - if not stats: - return - parts = [] - for key in ("cpu", "ram", "net", "io", "disk"): - entry = stats.get(key) if isinstance(stats.get(key), dict) else {} - avg = entry.get("avg") - if avg is None: - continue - if key in {"net", "io"}: - value = _format_rate_bytes(avg) - else: - value = _format_float(avg) - parts.append(f"{key}={value}") - if parts: - lines.append("node_usage_avg: " + "; ".join(parts)) - - -def _append_events(lines: list[str], summary: dict[str, Any]) -> None: - events = summary.get("events") if isinstance(summary.get("events"), dict) else {} - if not events: - return - total = events.get("warnings_total") - by_reason = events.get("warnings_by_reason") if isinstance(events.get("warnings_by_reason"), dict) else {} - if total is None: - return - if by_reason: - top = sorted(by_reason.items(), key=lambda item: (-item[1], item[0]))[:3] - reasons = "; ".join([f"{reason}={count}" for reason, count in top]) - lines.append(f"warnings: total={total}; top={reasons}") - else: - lines.append(f"warnings: total={total}") - - -def _append_pvc_usage(lines: list[str], summary: dict[str, Any]) -> None: - pvc_usage = summary.get("pvc_usage_top") - if not isinstance(pvc_usage, list) or not pvc_usage: - return - parts = [] - for entry in pvc_usage: - metric = entry.get("metric") if isinstance(entry, dict) else {} - namespace = metric.get("namespace") - pvc = metric.get("persistentvolumeclaim") - value = entry.get("value") - if namespace and pvc: - parts.append(f"{namespace}/{pvc}={_format_float(value)}%") - if parts: - lines.append("pvc_usage_top: " + "; ".join(parts)) - - -def _append_root_disk_headroom(lines: list[str], summary: dict[str, Any]) -> None: - headroom = summary.get("root_disk_low_headroom") - if not isinstance(headroom, list) or not headroom: - return - parts = [] - for entry in headroom: - if not isinstance(entry, dict): - 
continue - node = entry.get("node") - headroom_pct = entry.get("headroom_pct") - if node and headroom_pct is not None: - parts.append(f"{node}={_format_float(headroom_pct)}%") - if parts: - lines.append("root_disk_low_headroom: " + "; ".join(parts)) - - -def _append_longhorn(lines: list[str], summary: dict[str, Any]) -> None: # noqa: C901 - longhorn = summary.get("longhorn") if isinstance(summary.get("longhorn"), dict) else {} - if not longhorn: - return - total = longhorn.get("total") - attached = longhorn.get("attached_count") - detached = longhorn.get("detached_count") - degraded = longhorn.get("degraded_count") - by_state = longhorn.get("by_state") if isinstance(longhorn.get("by_state"), dict) else {} - by_robust = longhorn.get("by_robustness") if isinstance(longhorn.get("by_robustness"), dict) else {} - if total is not None: - if attached is None and detached is None and degraded is None: - unhealthy = longhorn.get("unhealthy_count") - lines.append( - "longhorn: total={total}, unhealthy={unhealthy}".format( - total=total, - unhealthy=unhealthy if unhealthy is not None else 0, - ) - ) - else: - lines.append( - "longhorn: total={total}, attached={attached}, detached={detached}, degraded={degraded}".format( - total=total, - attached=attached if attached is not None else 0, - detached=detached if detached is not None else 0, - degraded=degraded if degraded is not None else 0, - ) - ) - if by_state: - lines.append("longhorn_state: " + _format_kv_map(by_state)) - if by_robust: - lines.append("longhorn_robustness: " + _format_kv_map(by_robust)) - unhealthy_items = longhorn.get("unhealthy") - if isinstance(unhealthy_items, list) and unhealthy_items: - parts = [] - for entry in unhealthy_items[:5]: - if not isinstance(entry, dict): - continue - name = entry.get("name") - state = entry.get("state") - robustness = entry.get("robustness") - if name: - label = name - if state or robustness: - label = f"{label}({state},{robustness})" - parts.append(label) - if parts: - lines.append("longhorn_unhealthy_top: " + "; ".join(parts)) - - -def _append_namespace_usage(lines: list[str], summary: dict[str, Any]) -> None: - metrics = summary.get("metrics") if isinstance(summary.get("metrics"), dict) else {} - cpu_top = metrics.get("namespace_cpu_top") if isinstance(metrics.get("namespace_cpu_top"), list) else [] - mem_top = metrics.get("namespace_mem_top") if isinstance(metrics.get("namespace_mem_top"), list) else [] - if cpu_top: - parts = [] - for entry in cpu_top: - metric = entry.get("metric") if isinstance(entry, dict) else {} - namespace = metric.get("namespace") - value = entry.get("value") - if namespace: - parts.append(f"{namespace}={_format_float(value)}") - if parts: - lines.append("namespace_cpu_top: " + "; ".join(parts)) - if mem_top: - parts = [] - for entry in mem_top: - metric = entry.get("metric") if isinstance(entry, dict) else {} - namespace = metric.get("namespace") - value = entry.get("value") - if namespace: - parts.append(f"{namespace}={_format_bytes(value)}") - if parts: - lines.append("namespace_mem_top: " + "; ".join(parts)) - - -def _append_namespace_requests(lines: list[str], summary: dict[str, Any]) -> None: - metrics = summary.get("metrics") if isinstance(summary.get("metrics"), dict) else {} - cpu_req = metrics.get("namespace_cpu_requests_top") if isinstance(metrics.get("namespace_cpu_requests_top"), list) else [] - mem_req = metrics.get("namespace_mem_requests_top") if isinstance(metrics.get("namespace_mem_requests_top"), list) else [] - if cpu_req: - parts = [] - for entry in 
cpu_req: - metric = entry.get("metric") if isinstance(entry, dict) else {} - namespace = metric.get("namespace") - value = entry.get("value") - if namespace: - parts.append(f"{namespace}={_format_float(value)}") - if parts: - lines.append("namespace_cpu_requests_top: " + "; ".join(parts)) - if mem_req: - parts = [] - for entry in mem_req: - metric = entry.get("metric") if isinstance(entry, dict) else {} - namespace = metric.get("namespace") - value = entry.get("value") - if namespace: - parts.append(f"{namespace}={_format_bytes(value)}") - if parts: - lines.append("namespace_mem_requests_top: " + "; ".join(parts)) - - -def _append_namespace_io_net(lines: list[str], summary: dict[str, Any]) -> None: - metrics = summary.get("metrics") if isinstance(summary.get("metrics"), dict) else {} - net_top = metrics.get("namespace_net_top") if isinstance(metrics.get("namespace_net_top"), list) else [] - io_top = metrics.get("namespace_io_top") if isinstance(metrics.get("namespace_io_top"), list) else [] - if net_top: - parts = [] - for entry in net_top: - metric = entry.get("metric") if isinstance(entry, dict) else {} - namespace = metric.get("namespace") - value = entry.get("value") - if namespace: - parts.append(f"{namespace}={_format_rate_bytes(value)}") - if parts: - lines.append("namespace_net_top: " + "; ".join(parts)) - if io_top: - parts = [] - for entry in io_top: - metric = entry.get("metric") if isinstance(entry, dict) else {} - namespace = metric.get("namespace") - value = entry.get("value") - if namespace: - parts.append(f"{namespace}={_format_rate_bytes(value)}") - if parts: - lines.append("namespace_io_top: " + "; ".join(parts)) - - -def _append_pod_usage(lines: list[str], summary: dict[str, Any]) -> None: # noqa: C901, PLR0912 - metrics = summary.get("metrics") if isinstance(summary.get("metrics"), dict) else {} - cpu_top = metrics.get("pod_cpu_top") if isinstance(metrics.get("pod_cpu_top"), list) else [] - cpu_top_node = ( - metrics.get("pod_cpu_top_node") - if isinstance(metrics.get("pod_cpu_top_node"), list) - else [] - ) - mem_top = metrics.get("pod_mem_top") if isinstance(metrics.get("pod_mem_top"), list) else [] - mem_top_node = ( - metrics.get("pod_mem_top_node") - if isinstance(metrics.get("pod_mem_top_node"), list) - else [] - ) - if cpu_top: - parts = [] - for entry in cpu_top: - metric = entry.get("metric") if isinstance(entry, dict) else {} - namespace = metric.get("namespace") - pod = metric.get("pod") - value = entry.get("value") - if namespace and pod and value is not None: - parts.append(f"{namespace}/{pod}={_format_float(value)}") - if parts: - lines.append("pod_cpu_top: " + "; ".join(parts)) - if cpu_top_node: - parts = [] - for entry in cpu_top_node: - metric = entry.get("metric") if isinstance(entry, dict) else {} - namespace = metric.get("namespace") - pod = metric.get("pod") - node = metric.get("node") - value = entry.get("value") - if namespace and pod and node and value is not None: - parts.append(f"{node}:{namespace}/{pod}={_format_float(value)}") - if parts: - lines.append("pod_cpu_top_node: " + "; ".join(parts)) - if mem_top: - parts = [] - for entry in mem_top: - metric = entry.get("metric") if isinstance(entry, dict) else {} - namespace = metric.get("namespace") - pod = metric.get("pod") - value = entry.get("value") - if namespace and pod and value is not None: - parts.append(f"{namespace}/{pod}={_format_bytes(value)}") - if parts: - lines.append("pod_mem_top: " + "; ".join(parts)) - if mem_top_node: - parts = [] - for entry in mem_top_node: - metric = 
entry.get("metric") if isinstance(entry, dict) else {} - namespace = metric.get("namespace") - pod = metric.get("pod") - node = metric.get("node") - value = entry.get("value") - if namespace and pod and node and value is not None: - parts.append(f"{node}:{namespace}/{pod}={_format_bytes(value)}") - if parts: - lines.append("pod_mem_top_node: " + "; ".join(parts)) - - -def _append_restarts(lines: list[str], summary: dict[str, Any]) -> None: - metrics = summary.get("metrics") if isinstance(summary.get("metrics"), dict) else {} - top_restarts = metrics.get("top_restarts_1h") or [] - if not isinstance(top_restarts, list) or not top_restarts: - top_restarts = [] - parts = [] - for entry in top_restarts: - metric = entry.get("metric") if isinstance(entry, dict) else {} - value = entry.get("value") if isinstance(entry, dict) else [] - if not isinstance(metric, dict) or not isinstance(value, list) or len(value) < _VALUE_PAIR_LEN: - continue - namespace = metric.get("namespace") - pod = metric.get("pod") - count = _format_float(value[1]) - if namespace and pod: - parts.append(f"{namespace}/{pod}={count}") - if parts: - lines.append("restarts_1h_top: " + "; ".join(parts)) - else: - lines.append("restarts_1h_top: none") - ns_top = metrics.get("restart_namespace_top") or [] - if isinstance(ns_top, list) and ns_top: - ns_parts = [] - for entry in ns_top: - metric = entry.get("metric") if isinstance(entry, dict) else {} - value = entry.get("value") - namespace = metric.get("namespace") if isinstance(metric, dict) else None - if namespace and value is not None: - ns_parts.append(f"{namespace}={_format_float(value)}") - if ns_parts: - lines.append("restarts_1h_namespace_top: " + "; ".join(ns_parts)) - else: - lines.append("restarts_1h_namespace_top: none") - - -def _append_job_failures(lines: list[str], summary: dict[str, Any]) -> None: - metrics = summary.get("metrics") if isinstance(summary.get("metrics"), dict) else {} - failures = metrics.get("job_failures_24h") if isinstance(metrics.get("job_failures_24h"), list) else [] - if not failures: - return - parts = [] - for entry in failures: - metric = entry.get("metric") if isinstance(entry, dict) else {} - namespace = metric.get("namespace") - job_name = metric.get("job_name") or metric.get("job") - value = entry.get("value") - if namespace and job_name and value is not None: - parts.append(f"{namespace}/{job_name}={_format_float(value)}") - if parts: - lines.append("job_failures_24h: " + "; ".join(parts)) - - -def _append_jobs(lines: list[str], summary: dict[str, Any]) -> None: - jobs = summary.get("jobs") if isinstance(summary.get("jobs"), dict) else {} - if not jobs: - return - totals_line = _format_jobs_totals(jobs) - if totals_line: - lines.append(totals_line) - failing_line = _format_jobs_failing(jobs) - if failing_line: - lines.append(failing_line) - active_line = _format_jobs_active_oldest(jobs) - if active_line: - lines.append(active_line) - - -def _format_jobs_totals(jobs: dict[str, Any]) -> str: - totals = jobs.get("totals") if isinstance(jobs.get("totals"), dict) else {} - if not totals: - return "" - return "jobs: total={total}, active={active}, failed={failed}, succeeded={succeeded}".format( - total=totals.get("total"), - active=totals.get("active"), - failed=totals.get("failed"), - succeeded=totals.get("succeeded"), - ) - - -def _format_jobs_failing(jobs: dict[str, Any]) -> str: - failing = jobs.get("failing") if isinstance(jobs.get("failing"), list) else [] - if not failing: - return "" - parts = [] - for item in failing[:5]: - if not 
isinstance(item, dict): - continue - namespace = item.get("namespace") - name = item.get("job") - failed = item.get("failed") - age = item.get("age_hours") - if namespace and name and failed is not None: - label = f"{namespace}/{name}={failed}" - if age is not None: - label = f"{label} ({_format_float(age)}h)" - parts.append(label) - return "jobs_failing_top: " + "; ".join(parts) if parts else "" - - -def _format_jobs_active_oldest(jobs: dict[str, Any]) -> str: - active_oldest = jobs.get("active_oldest") if isinstance(jobs.get("active_oldest"), list) else [] - if not active_oldest: - return "" - parts = [] - for item in active_oldest[:5]: - if not isinstance(item, dict): - continue - namespace = item.get("namespace") - name = item.get("job") - age = item.get("age_hours") - if namespace and name and age is not None: - parts.append(f"{namespace}/{name}={_format_float(age)}h") - return "jobs_active_oldest: " + "; ".join(parts) if parts else "" - - -def _append_postgres(lines: list[str], summary: dict[str, Any]) -> None: - postgres = summary.get("postgres") if isinstance(summary.get("postgres"), dict) else {} - if not postgres: - return - hottest = postgres.get("hottest_db") or "" - lines.append( - "postgres: used={used}, max={max}, hottest_db={hottest}".format( - used=postgres.get("used"), - max=postgres.get("max"), - hottest=hottest, - ) - ) - used = postgres.get("used") - max_conn = postgres.get("max") - if used is not None or max_conn is not None: - lines.append( - "postgres_connections_total: used={used}, max={max}".format( - used=_format_float(used), - max=_format_float(max_conn), - ) - ) - by_db = postgres.get("by_db") - if isinstance(by_db, list) and by_db: - parts = [] - for entry in by_db: - metric = entry.get("metric") if isinstance(entry, dict) else {} - value = entry.get("value") - if isinstance(value, list) and len(value) >= _VALUE_PAIR_LEN: - value = value[1] - name = metric.get("datname") if isinstance(metric, dict) else None - if name and value is not None: - parts.append(f"{name}={_format_float(value)}") - if parts: - lines.append("postgres_connections_by_db: " + "; ".join(parts)) - - -def _append_hottest(lines: list[str], summary: dict[str, Any]) -> None: - hottest = summary.get("hottest") if isinstance(summary.get("hottest"), dict) else {} - if not hottest: - return - hardware_map = summary.get("hardware_by_node") - if not isinstance(hardware_map, dict): - hardware_map = {} - parts = [] - for key, entry in hottest.items(): - if not isinstance(entry, dict): - continue - node = entry.get("node") - hardware = hardware_map.get(node) if node else None - if key in {"net", "io"}: - value = _format_rate_bytes(entry.get("value")) - else: - value = _format_float(entry.get("value")) - if value and key in {"cpu", "ram", "disk"}: - value = f"{value}%" - if node: - label = node - if hardware: - label = f"{label} [{hardware}]" - parts.append(f"{key}={label} ({value})") - if parts: - lines.append("hottest: " + "; ".join(parts)) - - -def _append_workloads(lines: list[str], summary: dict[str, Any]) -> None: - workloads = summary.get("workloads") - if not isinstance(workloads, list) or not workloads: - return - lines.append(f"workloads: total={len(workloads)}") - top_workloads = sorted( - (item for item in workloads if isinstance(item, dict)), - key=lambda item: (-int(item.get("pods_total") or 0), item.get("workload") or ""), - )[:5] - if not top_workloads: - return - parts = [] - for item in top_workloads: - namespace = item.get("namespace") - name = item.get("workload") - pods_total = 
item.get("pods_total") - primary = item.get("primary_node") - if namespace and name: - label = f"{namespace}/{name}={pods_total}" - if primary: - label = f"{label} (primary={primary})" - parts.append(label) - if parts: - lines.append("workloads_top: " + "; ".join(parts)) - - -def _append_topology(lines: list[str], summary: dict[str, Any]) -> None: # noqa: C901, PLR0912 - topology = summary.get("topology") if isinstance(summary.get("topology"), dict) else {} - if not topology: - return - nodes = topology.get("nodes") if isinstance(topology.get("nodes"), list) else [] - workloads = topology.get("workloads") if isinstance(topology.get("workloads"), list) else [] - if nodes: - parts = [] - for entry in nodes[:5]: - if not isinstance(entry, dict): - continue - node = entry.get("node") - top = entry.get("workloads_top") if isinstance(entry.get("workloads_top"), list) else [] - if not node or not top: - continue - items = ", ".join([f"{name}({count})" for name, count in top if name and count is not None]) - if items: - parts.append(f"{node}={items}") - if parts: - lines.append("node_workloads_top: " + "; ".join(parts)) - if workloads: - parts = [] - for entry in workloads[:5]: - if not isinstance(entry, dict): - continue - namespace = entry.get("namespace") - name = entry.get("workload") - nodes_top = entry.get("nodes_top") if isinstance(entry.get("nodes_top"), list) else [] - if not namespace or not name: - continue - nodes_label = ", ".join([f"{node}:{count}" for node, count in nodes_top if node]) - label = f"{namespace}/{name}" - if nodes_label: - label = f"{label} [{nodes_label}]" - parts.append(label) - if parts: - lines.append("workload_nodes_top: " + "; ".join(parts)) - - -def _append_flux(lines: list[str], summary: dict[str, Any]) -> None: - flux = summary.get("flux") if isinstance(summary.get("flux"), dict) else {} - if not flux: - return - not_ready = flux.get("not_ready") - if not_ready is not None: - lines.append(f"flux_not_ready: {not_ready}") - items = flux.get("items") - if isinstance(items, list) and items: - parts = [] - for item in items[:10]: - if not isinstance(item, dict): - continue - name = item.get("name") or "" - namespace = item.get("namespace") or "" - reason = item.get("reason") or "" - suspended = item.get("suspended") - label = f"{namespace}/{name}".strip("/") - if reason: - label = f"{label} ({reason})" - if suspended: - label = f"{label} [suspended]" - if label: - parts.append(label) - if parts: - lines.append("flux_not_ready_items: " + "; ".join(parts)) - - -def _append_signals(lines: list[str], summary: dict[str, Any]) -> None: - signals = summary.get("signals") if isinstance(summary.get("signals"), list) else [] - if not signals: - return - lines.append("signals:") - for entry in signals[:8]: - if not isinstance(entry, dict): - continue - scope = entry.get("scope") or "" - target = entry.get("target") or "" - metric = entry.get("metric") or "" - current = entry.get("current") - delta = entry.get("delta_pct") - severity = entry.get("severity") or "" - detail = f"{scope}:{target} {metric}={current}" - if delta is not None: - detail += f" delta={delta}%" - if severity: - detail += f" severity={severity}" - lines.append(f"- {detail}") - - -def _append_profiles(lines: list[str], summary: dict[str, Any]) -> None: # noqa: C901 - profiles = summary.get("profiles") if isinstance(summary.get("profiles"), dict) else {} - if not profiles: - return - nodes = profiles.get("nodes") if isinstance(profiles.get("nodes"), list) else [] - namespaces = profiles.get("namespaces") if 
isinstance(profiles.get("namespaces"), list) else [] - workloads = profiles.get("workloads") if isinstance(profiles.get("workloads"), list) else [] - if nodes: - lines.append("node_profiles:") - for entry in nodes[:3]: - if not isinstance(entry, dict): - continue - lines.append( - f"- {entry.get('node')}: load={entry.get('load_index')} cpu={entry.get('cpu')} ram={entry.get('ram')} " - f"pods={entry.get('pods_total')} hw={entry.get('hardware')}" - ) - if namespaces: - lines.append("namespace_profiles:") - for entry in namespaces[:3]: - if not isinstance(entry, dict): - continue - lines.append( - f"- {entry.get('namespace')}: pods={entry.get('pods_total')} cpu={entry.get('cpu_usage')} " - f"mem={entry.get('mem_usage')} primary={entry.get('primary_node')}" - ) - if workloads: - lines.append("workload_profiles:") - for entry in workloads[:3]: - if not isinstance(entry, dict): - continue - lines.append( - f"- {entry.get('namespace')}/{entry.get('workload')}: pods={entry.get('pods_total')} " - f"running={entry.get('pods_running')} node={entry.get('primary_node')}" - ) - - -def _append_units_windows(lines: list[str], summary: dict[str, Any]) -> None: - metrics = summary.get("metrics") if isinstance(summary.get("metrics"), dict) else {} - units = metrics.get("units") if isinstance(metrics.get("units"), dict) else {} - windows = metrics.get("windows") if isinstance(metrics.get("windows"), dict) else {} - if units: - lines.append("units: " + _format_kv_map(units)) - else: - lines.append("units: cpu_pct, ram_pct, net=bytes_per_sec, io=bytes_per_sec") - if windows: - lines.append("windows: " + _format_kv_map(windows)) - else: - lines.append("windows: rates=5m, restarts=1h") - - -def _append_node_load_summary(lines: list[str], summary: dict[str, Any]) -> None: - node_load = summary.get("node_load_summary") - if not isinstance(node_load, dict) or not node_load: - return - hardware_by_node = summary.get("hardware_by_node") - hardware_by_node = hardware_by_node if isinstance(hardware_by_node, dict) else {} - top = node_load.get("top") - if isinstance(top, list) and top: - parts = [] - for entry in top[:5]: - if not isinstance(entry, dict): - continue - node = entry.get("node") or "" - load = entry.get("load_index") - cpu = entry.get("cpu") - ram = entry.get("ram") - io = entry.get("io") - net = entry.get("net") - pods_total = entry.get("pods_total") - label = f"{node} idx={_format_float(load)}" - if node and node in hardware_by_node: - label += f" hw={hardware_by_node.get(node)}" - if isinstance(pods_total, (int, float)): - label += f" pods={int(pods_total)}" - label += f" cpu={_format_float(cpu)} ram={_format_float(ram)}" - label += f" io={_format_rate_bytes(io)} net={_format_rate_bytes(net)}" - parts.append(label) - if parts: - lines.append("node_load_top: " + "; ".join(parts)) - outliers = node_load.get("outliers") - if isinstance(outliers, list) and outliers: - names = [entry.get("node") for entry in outliers if isinstance(entry, dict)] - names = [name for name in names if isinstance(name, str) and name] - if names: - lines.append("node_load_outliers: " + _format_names(names)) - - -def _append_hardware_usage(lines: list[str], summary: dict[str, Any]) -> None: # noqa: C901, PLR0912 - usage = summary.get("hardware_usage_avg") - if not isinstance(usage, list) or not usage: - return - parts = [] - tops: dict[str, tuple[str, float]] = {} - for entry in usage[:5]: - if not isinstance(entry, dict): - continue - hardware = entry.get("hardware") - load = entry.get("load_index") - cpu = entry.get("cpu") - ram = 
entry.get("ram") - io = entry.get("io") - net = entry.get("net") - if not hardware: - continue - label = f"{hardware} idx={_format_float(load)}" - label += f" cpu={_format_float(cpu)} ram={_format_float(ram)}" - label += f" io={_format_rate_bytes(io)} net={_format_rate_bytes(net)}" - parts.append(label) - for metric, value in (("cpu", cpu), ("ram", ram), ("io", io), ("net", net), ("load", load)): - if isinstance(value, (int, float)): - current = tops.get(metric) - if current is None or float(value) > current[1]: - tops[metric] = (hardware, float(value)) - if parts: - lines.append("hardware_usage_avg: " + "; ".join(parts)) - if tops: - top_parts = [] - for metric in ("cpu", "ram", "io", "net", "load"): - entry = tops.get(metric) - if not entry: - continue - hardware, value = entry - if metric in {"io", "net"}: - rendered = _format_rate_bytes(value) - else: - rendered = _format_float(value) - top_parts.append(f"{metric}={hardware} ({rendered})") - if top_parts: - lines.append("hardware_usage_top: " + "; ".join(top_parts)) - - -def _append_cluster_watchlist(lines: list[str], summary: dict[str, Any]) -> None: - watchlist = summary.get("cluster_watchlist") - if not isinstance(watchlist, list) or not watchlist: - return - lines.append("cluster_watchlist: " + "; ".join(watchlist)) - - -def _append_baseline_deltas(lines: list[str], summary: dict[str, Any]) -> None: - deltas = summary.get("baseline_deltas") if isinstance(summary.get("baseline_deltas"), dict) else {} - nodes = deltas.get("nodes") if isinstance(deltas.get("nodes"), dict) else {} - namespaces = deltas.get("namespaces") if isinstance(deltas.get("namespaces"), dict) else {} - for scope, block in (("nodes", nodes), ("namespaces", namespaces)): - if not isinstance(block, dict): - continue - for metric, entries in block.items(): - if not isinstance(entries, list) or not entries: - continue - parts: list[str] = [] - for entry in entries[:5]: - if not isinstance(entry, dict): - continue - name = entry.get("node") if scope == "nodes" else entry.get("namespace") - delta = entry.get("delta") - severity = entry.get("severity") - if not isinstance(name, str) or not name or not isinstance(delta, (int, float)): - continue - suffix = f" ({severity})" if isinstance(severity, str) and severity else "" - parts.append(f"{name}={_format_float(delta)}%{suffix}") - if parts: - lines.append(f"{scope}_baseline_delta_{metric}: " + "; ".join(parts)) - - -def _append_pod_issue_summary(lines: list[str], summary: dict[str, Any]) -> None: - issues = summary.get("pod_issue_summary") if isinstance(summary.get("pod_issue_summary"), dict) else {} - waiting = issues.get("waiting_reasons_top") if isinstance(issues.get("waiting_reasons_top"), list) else [] - phases = issues.get("phase_reasons_top") if isinstance(issues.get("phase_reasons_top"), list) else [] - namespace_issue = issues.get("namespace_issue_top") if isinstance(issues.get("namespace_issue_top"), dict) else {} - waiting_line = _reason_line(waiting, "pod_waiting_reasons_top") - if waiting_line: - lines.append(waiting_line) - phase_line = _reason_line(phases, "pod_phase_reasons_top") - if phase_line: - lines.append(phase_line) - if namespace_issue: - _append_namespace_issue_lines(lines, namespace_issue) - - -def _reason_line(entries: list[dict[str, Any]], label: str) -> str: - parts = [] - for entry in entries[:5]: - if not isinstance(entry, dict): - continue - reason = entry.get("reason") - count = entry.get("count") - if reason: - parts.append(f"{reason}={count}") - if parts: - return f"{label}: " + "; 
".join(parts) - return "" - - -def _append_namespace_issue_lines(lines: list[str], namespace_issue: dict[str, Any]) -> None: - for key, entries in namespace_issue.items(): - if not isinstance(entries, list) or not entries: - continue - parts: list[str] = [] - for entry in entries[:5]: - if not isinstance(entry, dict): - continue - ns = entry.get("namespace") - value = entry.get("value") - if ns: - parts.append(f"{ns}={value}") - if parts: - lines.append(f"namespace_issue_top_{key}: " + "; ".join(parts)) - - -def _build_cluster_watchlist(summary: dict[str, Any]) -> dict[str, Any]: - items: list[str] = [] - nodes_summary = summary.get("nodes_summary") if isinstance(summary.get("nodes_summary"), dict) else {} - not_ready = int(nodes_summary.get("not_ready") or 0) - if not_ready > 0: - items.append(f"not_ready_nodes={not_ready}") - pressure = summary.get("pressure_nodes") if isinstance(summary.get("pressure_nodes"), dict) else {} - pressure_nodes = pressure.get("names") if isinstance(pressure.get("names"), list) else [] - if pressure_nodes: - items.append(f"pressure_nodes={len(pressure_nodes)}") - pod_issues = summary.get("pod_issues") if isinstance(summary.get("pod_issues"), dict) else {} - pending_over = int(pod_issues.get("pending_over_15m") or 0) - if pending_over > 0: - items.append(f"pods_pending_over_15m={pending_over}") - workloads = summary.get("workloads_health") if isinstance(summary.get("workloads_health"), dict) else {} - deployments = workloads.get("deployments") if isinstance(workloads.get("deployments"), dict) else {} - statefulsets = workloads.get("statefulsets") if isinstance(workloads.get("statefulsets"), dict) else {} - daemonsets = workloads.get("daemonsets") if isinstance(workloads.get("daemonsets"), dict) else {} - total_not_ready = int(deployments.get("not_ready") or 0) + int(statefulsets.get("not_ready") or 0) + int(daemonsets.get("not_ready") or 0) - if total_not_ready > 0: - items.append(f"workloads_not_ready={total_not_ready}") - flux = summary.get("flux") if isinstance(summary.get("flux"), dict) else {} - flux_not_ready = int(flux.get("not_ready") or 0) - if flux_not_ready > 0: - items.append(f"flux_not_ready={flux_not_ready}") - pvc_usage = summary.get("pvc_usage_top") if isinstance(summary.get("pvc_usage_top"), list) else [] - high_pvc = [ - entry for entry in pvc_usage if isinstance(entry, dict) and (entry.get("value") or 0) >= PVC_USAGE_CRITICAL - ] - if high_pvc: - items.append(f"pvc_usage>={PVC_USAGE_CRITICAL}%") - return {"cluster_watchlist": items} if items else {} - - -def _capacity_ratio_parts(entries: list[dict[str, Any]], ratio_key: str, usage_key: str, req_key: str) -> list[str]: - parts: list[str] = [] - for entry in entries[:5]: - if not isinstance(entry, dict): - continue - ns = entry.get("namespace") or "" - ratio = entry.get(ratio_key) - usage = entry.get(usage_key) - req = entry.get(req_key) - if ns: - parts.append( - f"{ns}={_format_float(ratio)} (usage={_format_float(usage)} req={_format_float(req)})" - ) - return parts - - -def _capacity_headroom_parts(entries: list[dict[str, Any]]) -> list[str]: - parts: list[str] = [] - for entry in entries[:5]: - if not isinstance(entry, dict): - continue - ns = entry.get("namespace") or "" - headroom = entry.get("headroom") - if ns: - parts.append(f"{ns}={_format_float(headroom)}") - return parts - - -def _append_namespace_capacity_summary( # noqa: C901, PLR0912 - lines: list[str], - summary: dict[str, Any], -) -> None: - cap = summary.get("namespace_capacity_summary") - if not isinstance(cap, dict) or not 
cap: - return - cpu_ratio = cap.get("cpu_ratio_top") - if isinstance(cpu_ratio, list): - parts = _capacity_ratio_parts(cpu_ratio, "cpu_usage_ratio", "cpu_usage", "cpu_requests") - if parts: - lines.append("namespace_cpu_ratio_top: " + "; ".join(parts)) - mem_ratio = cap.get("mem_ratio_top") - if isinstance(mem_ratio, list): - parts = _capacity_ratio_parts(mem_ratio, "mem_usage_ratio", "mem_usage", "mem_requests") - if parts: - lines.append("namespace_mem_ratio_top: " + "; ".join(parts)) - cpu_headroom = cap.get("cpu_headroom_low") - if isinstance(cpu_headroom, list): - parts = _capacity_headroom_parts(cpu_headroom) - if parts: - lines.append("namespace_cpu_headroom_low: " + "; ".join(parts)) - mem_headroom = cap.get("mem_headroom_low") - if isinstance(mem_headroom, list): - parts = _capacity_headroom_parts(mem_headroom) - if parts: - lines.append("namespace_mem_headroom_low: " + "; ".join(parts)) - cpu_over = cap.get("cpu_overcommitted") - mem_over = cap.get("mem_overcommitted") - if cpu_over is not None or mem_over is not None: - lines.append(f"namespace_overcommitted: cpu={cpu_over} mem={mem_over}") - cpu_over_names = cap.get("cpu_overcommitted_names") - if isinstance(cpu_over_names, list) and cpu_over_names: - names = [name for name in cpu_over_names if isinstance(name, str) and name] - if names: - lines.append("namespace_cpu_overcommitted_names: " + _format_names(names)) - mem_over_names = cap.get("mem_overcommitted_names") - if isinstance(mem_over_names, list) and mem_over_names: - names = [name for name in mem_over_names if isinstance(name, str) and name] - if names: - lines.append("namespace_mem_overcommitted_names: " + _format_names(names)) - - -def _append_workloads_by_namespace(lines: list[str], summary: dict[str, Any]) -> None: - workloads = summary.get("workloads") - if not isinstance(workloads, list) or not workloads: - return - by_ns: dict[str, list[dict[str, Any]]] = {} - for item in workloads: - if not isinstance(item, dict): - continue - ns = item.get("namespace") or "" - name = item.get("workload") or "" - if not ns or not name: - continue - by_ns.setdefault(ns, []).append(item) - for ns, items in sorted(by_ns.items()): - items.sort( - key=lambda item: (-int(item.get("pods_total") or 0), item.get("workload") or "") - ) - parts = [] - for entry in items[:2]: - name = entry.get("workload") or "" - pods = entry.get("pods_total") - primary = entry.get("primary_node") - label = f"{name}({pods})" if pods is not None else name - if primary: - label = f"{label}@{primary}" - if label: - parts.append(label) - if parts: - lines.append(f"workloads_top_{ns}: " + "; ".join(parts)) - - -def _append_lexicon(lines: list[str], summary: dict[str, Any]) -> None: - lexicon = summary.get("lexicon") - if not isinstance(lexicon, dict): - return - terms = lexicon.get("terms") if isinstance(lexicon.get("terms"), list) else [] - aliases = lexicon.get("aliases") if isinstance(lexicon.get("aliases"), dict) else {} - for entry in terms[:8]: - if not isinstance(entry, dict): - continue - term = entry.get("term") - meaning = entry.get("meaning") - if term and meaning: - lines.append(f"lexicon_term: {term} => {meaning}") - for key, value in list(aliases.items())[:6]: - if key and value: - lines.append(f"lexicon_alias: {key} => {value}") - - -def _append_cross_stats(lines: list[str], summary: dict[str, Any]) -> None: # noqa: C901 - cross_stats = summary.get("cross_stats") - if not isinstance(cross_stats, dict): - return - node_entries = cross_stats.get("node_metric_top") if 
isinstance(cross_stats.get("node_metric_top"), list) else [] - for entry in node_entries[:10]: - if not isinstance(entry, dict): - continue - metric = entry.get("metric") - node = entry.get("node") - value = entry.get("value") - cpu = entry.get("cpu") - ram = entry.get("ram") - net = entry.get("net") - io = entry.get("io") - pods = entry.get("pods_total") - if metric and node: - parts = [ - f"value={_format_float(value)}", - f"cpu={_format_float(cpu)}", - f"ram={_format_float(ram)}", - f"net={_format_float(net)}", - f"io={_format_float(io)}", - ] - if pods is not None: - parts.append(f"pods={pods}") - lines.append(f"cross_node_{metric}: {node} " + " ".join(parts)) - ns_entries = cross_stats.get("namespace_metric_top") if isinstance(cross_stats.get("namespace_metric_top"), list) else [] - for entry in ns_entries[:10]: - if not isinstance(entry, dict): - continue - metric = entry.get("metric") - namespace = entry.get("namespace") - value = entry.get("value") - pods = entry.get("pods_total") - cpu_ratio = entry.get("cpu_ratio") - mem_ratio = entry.get("mem_ratio") - if metric and namespace: - parts = [ - f"value={_format_float(value)}", - f"cpu_ratio={_format_float(cpu_ratio)}", - f"mem_ratio={_format_float(mem_ratio)}", - ] - if pods is not None: - parts.append(f"pods={pods}") - lines.append(f"cross_namespace_{metric}: {namespace} " + " ".join(parts)) - pvc_entries = cross_stats.get("pvc_top") if isinstance(cross_stats.get("pvc_top"), list) else [] - for entry in pvc_entries[:5]: - if not isinstance(entry, dict): - continue - namespace = entry.get("namespace") - pvc = entry.get("pvc") - used = entry.get("used_percent") - if namespace and pvc: - lines.append(f"cross_pvc_usage: {namespace}/{pvc} used={_format_float(used)}") - - -def summary_text(snapshot: dict[str, Any] | None) -> str: # noqa: PLR0915 - summary = build_summary(snapshot) - if not summary: - return "" - lines: list[str] = [] - lines.append("atlas_cluster: Titan Lab Atlas Kubernetes cluster (internal).") - collected_at = snapshot.get("collected_at") if isinstance(snapshot, dict) else None - snapshot_version = snapshot.get("snapshot_version") if isinstance(snapshot, dict) else None - if collected_at or snapshot_version: - bits = [] - if collected_at: - bits.append(f"collected_at={collected_at}") - if snapshot_version: - bits.append(f"version={snapshot_version}") - lines.append("snapshot: " + ", ".join(bits)) - _append_nodes(lines, summary) - _append_hardware(lines, summary) - _append_hardware_groups(lines, summary) - _append_lexicon(lines, summary) - _append_pressure(lines, summary) - _append_node_facts(lines, summary) - _append_node_ages(lines, summary) - _append_node_taints(lines, summary) - _append_capacity(lines, summary) - _append_pods(lines, summary) - _append_namespace_pods(lines, summary) - _append_namespace_nodes(lines, summary) - _append_node_pods(lines, summary) - _append_pod_issues(lines, summary) - _append_pod_issue_summary(lines, summary) - _append_workload_health(lines, summary) - _append_events(lines, summary) - _append_node_usage_stats(lines, summary) - _append_namespace_usage(lines, summary) - _append_namespace_requests(lines, summary) - _append_namespace_io_net(lines, summary) - _append_pod_usage(lines, summary) - _append_restarts(lines, summary) - _append_job_failures(lines, summary) - _append_jobs(lines, summary) - _append_postgres(lines, summary) - _append_hottest(lines, summary) - _append_pvc_usage(lines, summary) - _append_root_disk_headroom(lines, summary) - _append_namespace_capacity_summary(lines, 
summary)
-    _append_baseline_deltas(lines, summary)
-    _append_longhorn(lines, summary)
-    _append_workloads(lines, summary)
-    _append_topology(lines, summary)
-    _append_workloads_by_namespace(lines, summary)
-    _append_node_load_summary(lines, summary)
-    _append_cluster_watchlist(lines, summary)
-    _append_hardware_usage(lines, summary)
-    _append_cross_stats(lines, summary)
-    _append_flux(lines, summary)
-    _append_signals(lines, summary)
-    _append_profiles(lines, summary)
-    _append_units_windows(lines, summary)
-    return "\n".join(lines)
diff --git a/atlasbot/snapshot/builder/__init__.py b/atlasbot/snapshot/builder/__init__.py
new file mode 100644
index 0000000..199e12f
--- /dev/null
+++ b/atlasbot/snapshot/builder/__init__.py
@@ -0,0 +1,8 @@
+"""Snapshot summary builder and text render helpers."""
+
+from .core_a import *
+from .core_b import *
+from .format_a import *
+from .format_b import *
+from .format_c import *
+from .summary_text import *
diff --git a/atlasbot/snapshot/builder/core_a.py b/atlasbot/snapshot/builder/core_a.py
new file mode 100644
index 0000000..9a48a9c
--- /dev/null
+++ b/atlasbot/snapshot/builder/core_a.py
@@ -0,0 +1,492 @@
+from __future__ import annotations
+
+import logging
+import time
+from typing import Any
+
+import httpx
+
+from atlasbot.config import Settings
+
+log = logging.getLogger(__name__)
+
+PVC_USAGE_CRITICAL = 90
+
+_BYTES_KB = 1024
+_BYTES_MB = 1024 * 1024
+_BYTES_GB = 1024 * 1024 * 1024
+_VALUE_PAIR_LEN = 2
+class SnapshotProvider:
+    """Fetch and cache the Ariadne snapshot used by the answer engine."""
+
+    def __init__(self, settings: Settings) -> None:
+        self._settings = settings
+        self._cache: dict[str, Any] = {}
+        self._cache_ts = 0.0
+
+    def _cache_valid(self) -> bool:
+        return time.monotonic() - self._cache_ts < max(5, self._settings.snapshot_ttl_sec)
+
+    def get(self) -> dict[str, Any] | None:
+        """Return the cached snapshot or refresh it from Ariadne."""
+
+        if self._cache and self._cache_valid():
+            return self._cache
+        if not self._settings.ariadne_state_url:
+            return self._cache or None
+        headers = {}
+        if self._settings.ariadne_state_token:
+            headers["x-internal-token"] = self._settings.ariadne_state_token
+        try:
+            resp = httpx.get(self._settings.ariadne_state_url, headers=headers, timeout=10.0)
+            resp.raise_for_status()
+            payload = resp.json()
+            if isinstance(payload, dict):
+                self._cache = payload
+                self._cache_ts = time.monotonic()
+                return payload
+        except Exception as exc:
+            log.warning("snapshot fetch failed", extra={"extra": {"error": str(exc)}})
+        return self._cache or None
+
+
+def _node_usage_top(series: list[dict[str, Any]]) -> dict[str, Any] | None:
+    best = None
+    for entry in series or []:
+        if not isinstance(entry, dict):
+            continue
+        node = entry.get("node")
+        value = entry.get("value")
+        try:
+            numeric = float(value)
+        except (TypeError, ValueError):
+            continue
+        if best is None or numeric > best["value"]:
+            best = {"node": node, "value": numeric}
+    return best
+
+
+def build_summary(snapshot: dict[str, Any] | None) -> dict[str, Any]:
+    """Condense a raw snapshot into the summary shape used for prompts."""
+
+    if not snapshot:
+        return {}
+    from .core_b import (
+        _build_flux,
+        _build_hottest,
+        _build_namespace_capacity,
+        _build_namespace_capacity_summary,
+        _build_node_load_summary,
+        _build_pvc,
+        _build_workloads,
+    )
+    from .format_c import _build_cluster_watchlist
+
+    nodes_detail = _nodes_detail(snapshot)
+    metrics = _metrics(snapshot)
+    summary: dict[str, Any] = {}
+
+    if isinstance(snapshot.get("nodes_summary"),
dict): + summary["nodes_summary"] = snapshot.get("nodes_summary") + if metrics: + summary["metrics"] = metrics + if isinstance(snapshot.get("jobs"), dict): + summary["jobs"] = snapshot.get("jobs") + summary.update(_build_nodes(snapshot)) + summary.update(_build_pressure(snapshot)) + summary.update(_build_hardware(nodes_detail)) + summary.update(_build_hardware_by_node(nodes_detail)) + summary.update(_build_hardware_usage(metrics, summary.get("hardware_by_node"))) + summary.update(_build_node_facts(nodes_detail)) + summary.update(_build_node_ages(nodes_detail)) + summary.update(_build_node_taints(nodes_detail)) + summary.update(_build_capacity(metrics)) + summary.update(_build_pods(metrics)) + summary.update(_build_namespace_pods(snapshot)) + summary.update(_build_namespace_nodes(snapshot)) + summary.update(_build_node_pods(snapshot)) + summary.update(_build_node_pods_top(metrics)) + summary.update(_build_pod_issues(snapshot)) + summary.update(_build_workload_health(snapshot)) + summary.update(_build_events(snapshot)) + summary.update(_build_event_summary(snapshot)) + summary.update(_build_postgres(metrics)) + summary.update(_build_hottest(metrics)) + summary.update(_build_pvc(metrics)) + summary.update(_build_namespace_capacity(metrics)) + summary.update(_build_namespace_capacity_summary(metrics)) + summary.update(_build_longhorn(snapshot)) + summary.update(_build_root_disk_headroom(metrics)) + summary.update(_build_node_load(metrics)) + summary.update(_build_node_load_summary(metrics)) + summary.update(_build_cluster_watchlist(summary)) + summary.update(_build_workloads(snapshot)) + summary.update(_build_flux(snapshot)) + _merge_cluster_summary(snapshot, summary) + _augment_lexicon(summary) + return summary + + +def _merge_cluster_summary(snapshot: dict[str, Any], summary: dict[str, Any]) -> None: + cluster_summary = snapshot.get("summary") if isinstance(snapshot.get("summary"), dict) else {} + if not cluster_summary: + return + _merge_cluster_fields( + summary, + cluster_summary, + { + "signals": list, + "profiles": dict, + "inventory": dict, + "topology": dict, + "lexicon": dict, + "cross_stats": dict, + "baseline_deltas": dict, + "pod_issue_summary": dict, + "trend_requests": dict, + "pod_waiting_trends": dict, + "pod_terminated_trends": dict, + }, + ) + + +def _merge_cluster_fields(summary: dict[str, Any], cluster_summary: dict[str, Any], field_types: dict[str, type]) -> None: + for key, expected in field_types.items(): + value = cluster_summary.get(key) + if isinstance(value, expected): + summary[key] = value + + +def _augment_lexicon(summary: dict[str, Any]) -> None: + lexicon = summary.get("lexicon") + if not isinstance(lexicon, dict): + lexicon = {"terms": [], "aliases": {}} + terms = list(lexicon.get("terms") or []) + aliases = dict(lexicon.get("aliases") or {}) + hardware = summary.get("hardware") if isinstance(summary.get("hardware"), dict) else {} + hardware_map = { + "rpi5": "Raspberry Pi 5 nodes", + "rpi4": "Raspberry Pi 4 nodes", + "rpi": "Raspberry Pi nodes", + "jetson": "NVIDIA Jetson nodes", + "amd64": "AMD64 nodes", + } + existing_terms = {entry.get("term") for entry in terms if isinstance(entry, dict)} + for key, meaning in hardware_map.items(): + if key not in hardware: + continue + if key not in existing_terms: + terms.append({"term": key, "meaning": meaning}) + if key not in aliases: + aliases[key] = meaning + if "raspberry pi 5" not in aliases and "rpi5" in hardware: + aliases["raspberry pi 5"] = "rpi5" + if "raspberry pi 4" not in aliases and "rpi4" in hardware: + 
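A note on the caching rule in SnapshotProvider above: the snapshot is reused until snapshot_ttl_sec (floored at five seconds) elapses on time.monotonic(), so wall-clock adjustments cannot invalidate or extend the cache. A standalone restatement of that rule; TtlCache is an illustrative name, not part of this patch:

    import time

    class TtlCache:
        def __init__(self, ttl_sec: int) -> None:
            self._ttl = ttl_sec
            self._value = None
            self._ts = 0.0

        def valid(self) -> bool:
            # Same floor as SnapshotProvider._cache_valid: never shorter than 5s.
            return self._value is not None and time.monotonic() - self._ts < max(5, self._ttl)

        def put(self, value: dict) -> None:
            self._value = value
            self._ts = time.monotonic()

    cache = TtlCache(ttl_sec=30)
    cache.put({"nodes_summary": {"total": 12}})
    assert cache.valid()  # a fresh payload is served without a refetch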
aliases["raspberry pi 4"] = "rpi4" + lexicon["terms"] = terms + lexicon["aliases"] = aliases + summary["lexicon"] = lexicon + + +def _nodes_detail(snapshot: dict[str, Any]) -> list[dict[str, Any]]: + items = snapshot.get("nodes_detail") + return items if isinstance(items, list) else [] + + +def _metrics(snapshot: dict[str, Any]) -> dict[str, Any]: + metrics = snapshot.get("metrics") + return metrics if isinstance(metrics, dict) else {} + + +def _build_nodes(snapshot: dict[str, Any]) -> dict[str, Any]: + nodes_summary = snapshot.get("nodes_summary") if isinstance(snapshot.get("nodes_summary"), dict) else {} + if not nodes_summary: + return {} + return { + "nodes": { + "total": nodes_summary.get("total"), + "ready": nodes_summary.get("ready"), + "not_ready": nodes_summary.get("not_ready"), + } + } + + +def _build_pressure(snapshot: dict[str, Any]) -> dict[str, Any]: + nodes_summary = snapshot.get("nodes_summary") if isinstance(snapshot.get("nodes_summary"), dict) else {} + pressure = nodes_summary.get("pressure_nodes") if isinstance(nodes_summary.get("pressure_nodes"), dict) else {} + if not pressure: + return {} + return {"pressure_nodes": pressure} + + +def _build_hardware(nodes_detail: list[dict[str, Any]]) -> dict[str, Any]: + hardware: dict[str, list[str]] = {} + for node in nodes_detail or []: + if not isinstance(node, dict): + continue + name = node.get("name") + hardware_class = node.get("hardware") or "unknown" + if name: + hardware.setdefault(hardware_class, []).append(name) + if not hardware: + return {} + return {"hardware": {key: sorted(value) for key, value in hardware.items()}} + + +def _build_hardware_by_node(nodes_detail: list[dict[str, Any]]) -> dict[str, Any]: + mapping: dict[str, str] = {} + for node in nodes_detail or []: + if not isinstance(node, dict): + continue + name = node.get("name") + if isinstance(name, str) and name: + hardware = node.get("hardware") or "unknown" + mapping[name] = str(hardware) + return {"hardware_by_node": mapping} if mapping else {} + + +def _build_hardware_usage(metrics: dict[str, Any], hardware_by_node: dict[str, Any] | None) -> dict[str, Any]: # noqa: C901 + if not isinstance(hardware_by_node, dict) or not hardware_by_node: + return {} + node_load = metrics.get("node_load") if isinstance(metrics.get("node_load"), list) else [] + if not node_load: + return {} + buckets: dict[str, dict[str, list[float]]] = {} + for entry in node_load: + if not isinstance(entry, dict): + continue + node = entry.get("node") + if not isinstance(node, str) or not node: + continue + hardware = hardware_by_node.get(node, "unknown") + bucket = buckets.setdefault(str(hardware), {"load_index": [], "cpu": [], "ram": [], "net": [], "io": []}) + for key in ("load_index", "cpu", "ram", "net", "io"): + value = entry.get(key) + if isinstance(value, (int, float)): + bucket[key].append(float(value)) + output: list[dict[str, Any]] = [] + for hardware, metrics_bucket in buckets.items(): + row: dict[str, Any] = {"hardware": hardware} + for key, values in metrics_bucket.items(): + if values: + row[key] = sum(values) / len(values) + output.append(row) + output.sort(key=lambda item: (-(item.get("load_index") or 0), item.get("hardware") or "")) + return {"hardware_usage_avg": output} + + +def _build_node_ages(nodes_detail: list[dict[str, Any]]) -> dict[str, Any]: + ages: list[dict[str, Any]] = [] + for node in nodes_detail or []: + if not isinstance(node, dict): + continue + name = node.get("name") + age = node.get("age_hours") + if name and isinstance(age, (int, float)): + 
ages.append({"name": name, "age_hours": age}) + ages.sort(key=lambda item: -(item.get("age_hours") or 0)) + return {"node_ages": ages[:5]} if ages else {} + + +def _count_values(nodes_detail: list[dict[str, Any]], key: str) -> dict[str, int]: + counts: dict[str, int] = {} + for node in nodes_detail or []: + if not isinstance(node, dict): + continue + value = node.get(key) + if isinstance(value, str) and value: + counts[value] = counts.get(value, 0) + 1 + return counts + + +def _build_node_facts(nodes_detail: list[dict[str, Any]]) -> dict[str, Any]: + if not nodes_detail: + return {} + role_counts: dict[str, int] = {} + for node in nodes_detail: + if not isinstance(node, dict): + continue + if node.get("is_worker"): + role_counts["worker"] = role_counts.get("worker", 0) + 1 + roles = node.get("roles") + if isinstance(roles, list): + for role in roles: + if isinstance(role, str) and role: + role_counts[role] = role_counts.get(role, 0) + 1 + return { + "node_arch_counts": _count_values(nodes_detail, "arch"), + "node_os_counts": _count_values(nodes_detail, "os"), + "node_kubelet_versions": _count_values(nodes_detail, "kubelet"), + "node_kernel_versions": _count_values(nodes_detail, "kernel"), + "node_runtime_versions": _count_values(nodes_detail, "container_runtime"), + "node_role_counts": role_counts, + } + + +def _build_node_taints(nodes_detail: list[dict[str, Any]]) -> dict[str, Any]: + taints: dict[str, list[str]] = {} + for node in nodes_detail or []: + if not isinstance(node, dict): + continue + name = node.get("name") + if not name: + continue + entries = node.get("taints") if isinstance(node.get("taints"), list) else [] + for entry in entries: + if not isinstance(entry, dict): + continue + key = entry.get("key") + effect = entry.get("effect") + if isinstance(key, str) and isinstance(effect, str): + label = f"{key}:{effect}" + taints.setdefault(label, []).append(name) + if not taints: + return {} + return {"node_taints": {key: sorted(names) for key, names in taints.items()}} + + +def _build_root_disk_headroom(metrics: dict[str, Any]) -> dict[str, Any]: + node_usage = metrics.get("node_usage") if isinstance(metrics.get("node_usage"), dict) else {} + disk = node_usage.get("disk") if isinstance(node_usage.get("disk"), list) else [] + if not disk: + return {} + entries = [] + for entry in disk: + if not isinstance(entry, dict): + continue + node = entry.get("node") + try: + used_pct = float(entry.get("value")) + except (TypeError, ValueError): + continue + headroom = max(0.0, 100.0 - used_pct) + if node: + entries.append({"node": node, "headroom_pct": headroom, "used_pct": used_pct}) + entries.sort(key=lambda item: (item.get("headroom_pct") or 0.0, item.get("node") or "")) + return {"root_disk_low_headroom": entries[:5]} if entries else {} + + +def _build_longhorn(snapshot: dict[str, Any]) -> dict[str, Any]: + longhorn = snapshot.get("longhorn") + return {"longhorn": longhorn} if isinstance(longhorn, dict) and longhorn else {} + + +def _build_node_load(metrics: dict[str, Any]) -> dict[str, Any]: + node_load = metrics.get("node_load") + if not isinstance(node_load, list) or not node_load: + return {} + return {"node_load": node_load} + + +def _build_pods(metrics: dict[str, Any]) -> dict[str, Any]: + pods = { + "running": metrics.get("pods_running"), + "pending": metrics.get("pods_pending"), + "failed": metrics.get("pods_failed"), + "succeeded": metrics.get("pods_succeeded"), + } + if not any(value is not None for value in pods.values()): + return {} + return {"pods": pods} + + +def 
_build_capacity(metrics: dict[str, Any]) -> dict[str, Any]: + if not metrics: + return {} + capacity = { + "cpu": metrics.get("capacity_cpu"), + "allocatable_cpu": metrics.get("allocatable_cpu"), + "mem_bytes": metrics.get("capacity_mem_bytes"), + "allocatable_mem_bytes": metrics.get("allocatable_mem_bytes"), + "pods": metrics.get("capacity_pods"), + "allocatable_pods": metrics.get("allocatable_pods"), + } + if not any(value is not None for value in capacity.values()): + return {} + return {"capacity": capacity} + + +def _build_namespace_pods(snapshot: dict[str, Any]) -> dict[str, Any]: + namespaces = snapshot.get("namespace_pods") + if not isinstance(namespaces, list) or not namespaces: + return {} + return {"namespace_pods": namespaces} + + +def _build_namespace_nodes(snapshot: dict[str, Any]) -> dict[str, Any]: + namespace_nodes = snapshot.get("namespace_nodes") + if not isinstance(namespace_nodes, list) or not namespace_nodes: + return {} + return {"namespace_nodes": namespace_nodes} + + +def _build_node_pods(snapshot: dict[str, Any]) -> dict[str, Any]: + node_pods = snapshot.get("node_pods") + if not isinstance(node_pods, list) or not node_pods: + return {} + return {"node_pods": node_pods} + + +def _build_node_pods_top(metrics: dict[str, Any]) -> dict[str, Any]: + top = metrics.get("node_pods_top") + if not isinstance(top, list) or not top: + return {} + return {"node_pods_top": top} + + +def _build_pod_issues(snapshot: dict[str, Any]) -> dict[str, Any]: + pod_issues = snapshot.get("pod_issues") + if not isinstance(pod_issues, dict) or not pod_issues: + return {} + return {"pod_issues": pod_issues} + + +def _build_workload_health(snapshot: dict[str, Any]) -> dict[str, Any]: + health = snapshot.get("workloads_health") + if not isinstance(health, dict) or not health: + return {} + deployments = health.get("deployments") + statefulsets = health.get("statefulsets") + daemonsets = health.get("daemonsets") + if not isinstance(deployments, dict) or not isinstance(statefulsets, dict) or not isinstance(daemonsets, dict): + return {} + return { + "workloads_health": { + "deployments": deployments, + "statefulsets": statefulsets, + "daemonsets": daemonsets, + } + } + + +def _build_events(snapshot: dict[str, Any]) -> dict[str, Any]: + events = snapshot.get("events") + if not isinstance(events, dict) or not events: + return {} + return {"events": events} + + +def _build_event_summary(snapshot: dict[str, Any]) -> dict[str, Any]: + events = snapshot.get("events") + if not isinstance(events, dict) or not events: + return {} + summary = {} + if isinstance(events.get("warnings_top_reason"), dict): + summary["warnings_top_reason"] = events.get("warnings_top_reason") + if events.get("warnings_latest"): + summary["warnings_latest"] = events.get("warnings_latest") + return {"event_summary": summary} if summary else {} + + +def _build_postgres(metrics: dict[str, Any]) -> dict[str, Any]: + postgres = metrics.get("postgres_connections") if isinstance(metrics.get("postgres_connections"), dict) else {} + if not postgres: + return {} + return { + "postgres": { + "used": postgres.get("used"), + "max": postgres.get("max"), + "hottest_db": postgres.get("hottest_db"), + "by_db": postgres.get("by_db"), + } + } diff --git a/atlasbot/snapshot/builder/core_b.py b/atlasbot/snapshot/builder/core_b.py new file mode 100644 index 0000000..8696caa --- /dev/null +++ b/atlasbot/snapshot/builder/core_b.py @@ -0,0 +1,57 @@ +from __future__ import annotations + +from typing import Any + +from .core_a import _node_usage_top + +def 
_build_hottest(metrics: dict[str, Any]) -> dict[str, Any]:
+    node_usage = metrics.get("node_usage") if isinstance(metrics.get("node_usage"), dict) else {}
+    hottest: dict[str, Any] = {}
+    for key in ("cpu", "ram", "net", "io", "disk"):
+        top = _node_usage_top(node_usage.get(key, []))
+        if top:
+            hottest[key] = top
+    if not hottest:
+        return {}
+    return {"hottest": hottest}
+
+
+def _build_pvc(metrics: dict[str, Any]) -> dict[str, Any]:
+    pvc_usage = metrics.get("pvc_usage_top") if isinstance(metrics.get("pvc_usage_top"), list) else []
+    if not pvc_usage:
+        return {}
+    return {"pvc_usage_top": pvc_usage}
+
+
+def _build_namespace_capacity(metrics: dict[str, Any]) -> dict[str, Any]:
+    capacity = metrics.get("namespace_capacity")
+    if not isinstance(capacity, list) or not capacity:
+        return {}
+    return {"namespace_capacity": capacity}
+
+
+def _build_namespace_capacity_summary(metrics: dict[str, Any]) -> dict[str, Any]:
+    summary = metrics.get("namespace_capacity_summary")
+    if not isinstance(summary, dict) or not summary:
+        return {}
+    return {"namespace_capacity_summary": summary}
+
+
+def _build_node_load_summary(metrics: dict[str, Any]) -> dict[str, Any]:
+    summary = metrics.get("node_load_summary")
+    if not isinstance(summary, dict) or not summary:
+        return {}
+    return {"node_load_summary": summary}
+
+
+def _build_workloads(snapshot: dict[str, Any]) -> dict[str, Any]:
+    workloads = snapshot.get("workloads") if isinstance(snapshot.get("workloads"), list) else []
+    return {"workloads": workloads}
+
+
+def _build_flux(snapshot: dict[str, Any]) -> dict[str, Any]:
+    flux = snapshot.get("flux") if isinstance(snapshot.get("flux"), dict) else {}
+    return {"flux": flux}
+
+
+__all__ = [name for name in globals() if not name.startswith("__")]
diff --git a/atlasbot/snapshot/builder/format_a.py b/atlasbot/snapshot/builder/format_a.py
new file mode 100644
index 0000000..36425fa
--- /dev/null
+++ b/atlasbot/snapshot/builder/format_a.py
@@ -0,0 +1,497 @@
+from __future__ import annotations
+
+from typing import Any
+
+from .core_a import _BYTES_GB, _BYTES_KB, _BYTES_MB
+from .core_b import *
+
+
+def _format_float(value: Any) -> str:
+    try:
+        numeric = float(value)
+    except (TypeError, ValueError):
+        return str(value)
+    return f"{numeric:.2f}".rstrip("0").rstrip(".")
+
+
+def _format_rate_bytes(value: Any) -> str:
+    try:
+        numeric = float(value)
+    except (TypeError, ValueError):
+        return str(value)
+    if numeric >= _BYTES_MB:
+        return f"{numeric / _BYTES_MB:.2f} MB/s"
+    if numeric >= _BYTES_KB:
+        return f"{numeric / _BYTES_KB:.2f} KB/s"
+    return f"{numeric:.2f} B/s"
+
+
+def _format_bytes(value: Any) -> str:
+    try:
+        numeric = float(value)
+    except (TypeError, ValueError):
+        return str(value)
+    if numeric >= _BYTES_GB:
+        return f"{numeric / _BYTES_GB:.2f} GB"
+    if numeric >= _BYTES_MB:
+        return f"{numeric / _BYTES_MB:.2f} MB"
+    if numeric >= _BYTES_KB:
+        return f"{numeric / _BYTES_KB:.2f} KB"
+    return f"{numeric:.2f} B"
+
+
+def _format_kv_map(values: dict[str, Any]) -> str:
+    parts = []
+    for key, value in values.items():
+        parts.append(f"{key}={value}")
+    return ", ".join(parts)
+
+
+def _format_names(names: list[str]) -> str:
+    if not names:
+        return ""
+    return ", ".join(sorted(names))
+
+
+def _append_nodes(lines: list[str], summary: dict[str, Any]) -> None:  # noqa: C901
+    nodes = summary.get("nodes") if isinstance(summary.get("nodes"), dict) else {}
+    if not nodes:
+        return
+    workers = {}
+    if isinstance(summary.get("nodes_summary"), dict):
+        workers = summary["nodes_summary"].get("workers") or
{} + workers_total = workers.get("total") + workers_ready = workers.get("ready") + workers_str = "" + if workers_total is not None and workers_ready is not None: + workers_str = f", workers_ready={workers_ready}/{workers_total}" + total = nodes.get("total") + ready = nodes.get("ready") + not_ready = nodes.get("not_ready") + if not_ready is None: + not_ready = 0 + lines.append(f"nodes: total={total}, ready={ready}, not_ready={not_ready}{workers_str}") + if total is not None: + lines.append(f"nodes_total: {total}") + if ready is not None: + lines.append(f"nodes_ready: {ready}") + if not_ready is not None: + lines.append(f"nodes_not_ready_count: {not_ready}") + if not isinstance(summary.get("nodes_summary"), dict): + return + not_ready_names = summary["nodes_summary"].get("not_ready_names") or [] + if not_ready_names: + lines.append("nodes_not_ready: " + _format_names(not_ready_names)) + by_arch = summary["nodes_summary"].get("by_arch") or {} + if isinstance(by_arch, dict) and by_arch: + lines.append("archs: " + _format_kv_map(by_arch)) + by_role = summary["nodes_summary"].get("by_role") or {} + if isinstance(by_role, dict) and by_role: + lines.append("roles: " + _format_kv_map(by_role)) + + +def _append_hardware(lines: list[str], summary: dict[str, Any]) -> None: + hardware = summary.get("hardware") if isinstance(summary.get("hardware"), dict) else {} + if not hardware: + return + parts = [] + for key, names in hardware.items(): + if not isinstance(names, list): + continue + label = f"{key}={len(names)}" + name_list = _format_names([str(name) for name in names if name]) + if name_list: + label = f"{label} ({name_list})" + parts.append(label) + if parts: + lines.append("hardware: " + "; ".join(sorted(parts))) + + +def _append_hardware_groups(lines: list[str], summary: dict[str, Any]) -> None: + hardware = summary.get("hardware") if isinstance(summary.get("hardware"), dict) else {} + if not hardware: + return + parts = [] + for key, names in hardware.items(): + if not isinstance(names, list): + continue + name_list = _format_names([str(name) for name in names if name]) + if name_list: + parts.append(f"{key}={name_list}") + if parts: + lines.append("hardware_nodes: " + "; ".join(sorted(parts))) + + +def _append_node_ages(lines: list[str], summary: dict[str, Any]) -> None: + ages = summary.get("node_ages") if isinstance(summary.get("node_ages"), list) else [] + if not ages: + return + parts = [] + for entry in ages[:3]: + if not isinstance(entry, dict): + continue + name = entry.get("name") + age = entry.get("age_hours") + if name and isinstance(age, (int, float)): + parts.append(f"{name}={_format_float(age)}h") + if parts: + lines.append("node_age_top: " + "; ".join(parts)) + + +def _append_node_taints(lines: list[str], summary: dict[str, Any]) -> None: + taints = summary.get("node_taints") if isinstance(summary.get("node_taints"), dict) else {} + if not taints: + return + parts = [] + for key, names in taints.items(): + if not isinstance(names, list): + continue + name_list = _format_names([str(name) for name in names if name]) + parts.append(f"{key}={len(names)} ({name_list})" if name_list else f"{key}={len(names)}") + if parts: + lines.append("node_taints: " + "; ".join(sorted(parts))) + + +def _append_node_facts(lines: list[str], summary: dict[str, Any]) -> None: + def top_counts(label: str, counts: dict[str, int], limit: int = 4) -> None: + if not counts: + return + top = sorted(counts.items(), key=lambda item: (-item[1], item[0]))[:limit] + rendered = "; ".join([f"{name}={count}" for 
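For reference, the unit boundaries used by the _format_* helpers in format_a above are 1024-based, not 1000-based. A standalone restatement with worked examples; format_rate is a local stand-in for _format_rate_bytes, not the patched function:

    _BYTES_KB = 1024
    _BYTES_MB = 1024 * 1024

    def format_rate(value: float) -> str:
        # Largest matching 1024-based unit wins; below 1 KiB stays in B/s.
        if value >= _BYTES_MB:
            return f"{value / _BYTES_MB:.2f} MB/s"
        if value >= _BYTES_KB:
            return f"{value / _BYTES_KB:.2f} KB/s"
        return f"{value:.2f} B/s"

    assert format_rate(512) == "512.00 B/s"
    assert format_rate(2048) == "2.00 KB/s"
    assert format_rate(3 * 1024 * 1024) == "3.00 MB/s"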
name, count in top]) + if rendered: + lines.append(f"{label}: {rendered}") + + top_counts("node_arch", summary.get("node_arch_counts") or {}) + top_counts("node_os", summary.get("node_os_counts") or {}) + top_counts("node_kubelet_versions", summary.get("node_kubelet_versions") or {}) + top_counts("node_kernel_versions", summary.get("node_kernel_versions") or {}) + top_counts("node_runtime_versions", summary.get("node_runtime_versions") or {}) + top_counts("node_roles", summary.get("node_role_counts") or {}) + + +def _append_pressure(lines: list[str], summary: dict[str, Any]) -> None: + pressure = summary.get("pressure_nodes") + if not isinstance(pressure, dict) or not pressure: + return + parts = [] + for cond, nodes in sorted(pressure.items()): + if not nodes: + continue + name_list = _format_names([str(name) for name in nodes if name]) + parts.append(f"{cond}={len(nodes)} ({name_list})" if name_list else f"{cond}={len(nodes)}") + if parts: + lines.append("node_pressure: " + "; ".join(parts)) + + +def _append_pods(lines: list[str], summary: dict[str, Any]) -> None: + pods = summary.get("pods") if isinstance(summary.get("pods"), dict) else {} + if not pods: + return + lines.append( + "pods: running={running}, pending={pending}, failed={failed}, succeeded={succeeded}".format( + running=pods.get("running"), + pending=pods.get("pending"), + failed=pods.get("failed"), + succeeded=pods.get("succeeded"), + ) + ) + + +def _append_capacity(lines: list[str], summary: dict[str, Any]) -> None: + capacity = summary.get("capacity") if isinstance(summary.get("capacity"), dict) else {} + if not capacity: + return + parts = [] + if capacity.get("cpu") is not None: + parts.append(f"cpu={_format_float(capacity.get('cpu'))}") + if capacity.get("allocatable_cpu") is not None: + parts.append(f"alloc_cpu={_format_float(capacity.get('allocatable_cpu'))}") + if capacity.get("mem_bytes") is not None: + parts.append(f"mem={_format_bytes(capacity.get('mem_bytes'))}") + if capacity.get("allocatable_mem_bytes") is not None: + parts.append(f"alloc_mem={_format_bytes(capacity.get('allocatable_mem_bytes'))}") + if capacity.get("pods") is not None: + parts.append(f"pods={_format_float(capacity.get('pods'))}") + if capacity.get("allocatable_pods") is not None: + parts.append(f"alloc_pods={_format_float(capacity.get('allocatable_pods'))}") + if parts: + lines.append("capacity: " + "; ".join(parts)) + + +def _append_namespace_pods(lines: list[str], summary: dict[str, Any]) -> None: + namespaces = summary.get("namespace_pods") + if not isinstance(namespaces, list) or not namespaces: + return + top = sorted( + (item for item in namespaces if isinstance(item, dict)), + key=lambda item: (-int(item.get("pods_total") or 0), item.get("namespace") or ""), + )[:8] + parts = [] + for item in top: + name = item.get("namespace") + total = item.get("pods_total") + running = item.get("pods_running") + if not name: + continue + label = f"{name}={total}" + if running is not None: + label = f"{label} (running={running})" + parts.append(label) + if parts: + lines.append("namespaces_top: " + "; ".join(parts)) + + +def _append_namespace_nodes(lines: list[str], summary: dict[str, Any]) -> None: + namespace_nodes = summary.get("namespace_nodes") + if not isinstance(namespace_nodes, list) or not namespace_nodes: + return + top = sorted( + (item for item in namespace_nodes if isinstance(item, dict)), + key=lambda item: (-int(item.get("pods_total") or 0), item.get("namespace") or ""), + )[:8] + parts = [] + for item in top: + namespace = 
item.get("namespace") + pods_total = item.get("pods_total") + primary = item.get("primary_node") + if namespace: + label = f"{namespace}={pods_total}" + if primary: + label = f"{label} (primary={primary})" + parts.append(label) + if parts: + lines.append("namespace_nodes_top: " + "; ".join(parts)) + + +def _append_node_pods(lines: list[str], summary: dict[str, Any]) -> None: # noqa: C901 + node_pods = summary.get("node_pods") + if not isinstance(node_pods, list) or not node_pods: + return + sortable: list[dict[str, Any]] = [] + for item in node_pods: + if not isinstance(item, dict): + continue + try: + pods_value = int(item.get("pods_total") or 0) + except (TypeError, ValueError): + continue + sortable.append({**item, "pods_total": pods_value}) + top = sorted(sortable, key=lambda item: (-int(item.get("pods_total") or 0), item.get("node") or ""))[:8] + max_entry = None + for entry in node_pods: + if not isinstance(entry, dict): + continue + pods_total = entry.get("pods_total") + try: + pods_value = int(pods_total) + except (TypeError, ValueError): + continue + if max_entry is None or pods_value > max_entry["pods_total"]: + max_entry = { + "node": entry.get("node"), + "pods_total": pods_value, + "namespaces_top": entry.get("namespaces_top") or [], + } + parts = [] + for item in top: + node = item.get("node") + pods_total = item.get("pods_total") + namespaces = item.get("namespaces_top") or [] + ns_label = "" + if namespaces: + ns_label = ", ".join([f"{name}={count}" for name, count in namespaces]) + if node: + label = f"{node}={pods_total}" + if ns_label: + label = f"{label} ({ns_label})" + parts.append(label) + if parts: + lines.append("node_pods_top: " + "; ".join(parts)) + if max_entry and isinstance(max_entry.get("node"), str): + ns_label = "" + namespaces = max_entry.get("namespaces_top") or [] + if namespaces: + ns_label = ", ".join([f"{name}={count}" for name, count in namespaces]) + label = f"{max_entry.get('node')}={max_entry.get('pods_total')}" + if ns_label: + label = f"{label} ({ns_label})" + lines.append("node_pods_max: " + label) + for item in top: + node = item.get("node") + namespaces = item.get("namespaces_top") or [] + if not node or not namespaces: + continue + ns_label = ", ".join([f"{name}={count}" for name, count in namespaces]) + lines.append(f"node_namespaces_top: {node} ({ns_label})") + + +def _append_pod_issues(lines: list[str], summary: dict[str, Any]) -> None: + pod_issues = summary.get("pod_issues") if isinstance(summary.get("pod_issues"), dict) else {} + if not pod_issues: + return + counts_line = _format_pod_issue_counts(pod_issues) + if counts_line: + lines.append(counts_line) + top_line = _format_pod_issue_top(pod_issues) + if top_line: + lines.append(top_line) + pending_line = _format_pod_pending_oldest(pod_issues) + if pending_line: + lines.append(pending_line) + pending_over_line = _format_pod_pending_over_15m(pod_issues) + if pending_over_line: + lines.append(pending_over_line) + reasons_line = _format_pod_waiting_reasons(pod_issues) + if reasons_line: + lines.append(reasons_line) + + +def _format_pod_issue_counts(pod_issues: dict[str, Any]) -> str: + counts = pod_issues.get("counts") if isinstance(pod_issues.get("counts"), dict) else {} + if not counts: + return "" + parts = [] + for key in ("Failed", "Pending", "Unknown"): + if key in counts: + parts.append(f"{key}={counts.get(key)}") + return "pod_issues: " + "; ".join(parts) if parts else "" + + +def _format_pod_issue_top(pod_issues: dict[str, Any]) -> str: + items = pod_issues.get("items") if 
isinstance(pod_issues.get("items"), list) else [] + if not items: + return "" + top = [] + for item in items[:5]: + if not isinstance(item, dict): + continue + namespace = item.get("namespace") + pod = item.get("pod") + if not namespace or not pod: + continue + phase = item.get("phase") or "" + restarts = item.get("restarts") or 0 + top.append(f"{namespace}/{pod}({phase},r={restarts})") + return "pod_issues_top: " + "; ".join(top) if top else "" + + +def _format_pod_pending_oldest(pod_issues: dict[str, Any]) -> str: + pending = pod_issues.get("pending_oldest") if isinstance(pod_issues.get("pending_oldest"), list) else [] + if not pending: + return "" + parts = [] + for item in pending[:5]: + if not isinstance(item, dict): + continue + namespace = item.get("namespace") + pod = item.get("pod") + age = item.get("age_hours") + reason = item.get("reason") or "" + if namespace and pod and age is not None: + label = f"{namespace}/{pod}={_format_float(age)}h" + if reason: + label = f"{label} ({reason})" + parts.append(label) + return "pods_pending_oldest: " + "; ".join(parts) if parts else "" + + +def _format_pod_waiting_reasons(pod_issues: dict[str, Any]) -> str: + reasons = pod_issues.get("waiting_reasons") if isinstance(pod_issues.get("waiting_reasons"), dict) else {} + if not reasons: + return "" + pairs = sorted(reasons.items(), key=lambda item: (-item[1], item[0]))[:5] + return "pod_waiting_reasons: " + "; ".join([f"{key}={val}" for key, val in pairs]) + + +def _format_pod_pending_over_15m(pod_issues: dict[str, Any]) -> str: + count = pod_issues.get("pending_over_15m") + if count is None: + return "" + try: + count_val = int(count) + except (TypeError, ValueError): + return "" + return f"pods_pending_over_15m: {count_val}" + + +def _append_workload_health(lines: list[str], summary: dict[str, Any]) -> None: + health = summary.get("workloads_health") if isinstance(summary.get("workloads_health"), dict) else {} + if not health: + return + deployments = health.get("deployments") if isinstance(health.get("deployments"), dict) else {} + statefulsets = health.get("statefulsets") if isinstance(health.get("statefulsets"), dict) else {} + daemonsets = health.get("daemonsets") if isinstance(health.get("daemonsets"), dict) else {} + total_not_ready = 0 + for entry in (deployments, statefulsets, daemonsets): + total_not_ready += int(entry.get("not_ready") or 0) + lines.append( + "workloads_not_ready: " + f"deployments={deployments.get('not_ready', 0)}, " + f"statefulsets={statefulsets.get('not_ready', 0)}, " + f"daemonsets={daemonsets.get('not_ready', 0)} " + f"(total={total_not_ready})" + ) + + +def _append_node_usage_stats(lines: list[str], summary: dict[str, Any]) -> None: + metrics = summary.get("metrics") if isinstance(summary.get("metrics"), dict) else {} + stats = metrics.get("node_usage_stats") if isinstance(metrics.get("node_usage_stats"), dict) else {} + if not stats: + return + parts = [] + for key in ("cpu", "ram", "net", "io", "disk"): + entry = stats.get(key) if isinstance(stats.get(key), dict) else {} + avg = entry.get("avg") + if avg is None: + continue + value = _format_rate_bytes(avg) if key in {"net", "io"} else _format_float(avg) + parts.append(f"{key}={value}") + if parts: + lines.append("node_usage_avg: " + "; ".join(parts)) + + +def _append_events(lines: list[str], summary: dict[str, Any]) -> None: + events = summary.get("events") if isinstance(summary.get("events"), dict) else {} + if not events: + return + total = events.get("warnings_total") + by_reason = 
events.get("warnings_by_reason") if isinstance(events.get("warnings_by_reason"), dict) else {}
+    if total is None:
+        return
+    if by_reason:
+        top = sorted(by_reason.items(), key=lambda item: (-item[1], item[0]))[:3]
+        reasons = "; ".join([f"{reason}={count}" for reason, count in top])
+        lines.append(f"warnings: total={total}; top={reasons}")
+    else:
+        lines.append(f"warnings: total={total}")
+
+
+def _append_pvc_usage(lines: list[str], summary: dict[str, Any]) -> None:
+    pvc_usage = summary.get("pvc_usage_top")
+    if not isinstance(pvc_usage, list) or not pvc_usage:
+        return
+    parts = []
+    for entry in pvc_usage:
+        if not isinstance(entry, dict):
+            continue
+        metric = entry.get("metric") if isinstance(entry.get("metric"), dict) else {}
+        namespace = metric.get("namespace")
+        pvc = metric.get("persistentvolumeclaim")
+        value = entry.get("value")
+        if namespace and pvc:
+            parts.append(f"{namespace}/{pvc}={_format_float(value)}%")
+    if parts:
+        lines.append("pvc_usage_top: " + "; ".join(parts))
+
+
+def _append_root_disk_headroom(lines: list[str], summary: dict[str, Any]) -> None:
+    headroom = summary.get("root_disk_low_headroom")
+    if not isinstance(headroom, list) or not headroom:
+        return
+    parts = []
+    for entry in headroom:
+        if not isinstance(entry, dict):
+            continue
+        node = entry.get("node")
+        headroom_pct = entry.get("headroom_pct")
+        if node and headroom_pct is not None:
+            parts.append(f"{node}={_format_float(headroom_pct)}%")
+    if parts:
+        lines.append("root_disk_low_headroom: " + "; ".join(parts))
+
+
+__all__ = [name for name in globals() if not name.startswith("__")]
diff --git a/atlasbot/snapshot/builder/format_b.py b/atlasbot/snapshot/builder/format_b.py
new file mode 100644
index 0000000..8756d82
--- /dev/null
+++ b/atlasbot/snapshot/builder/format_b.py
@@ -0,0 +1,435 @@
+from __future__ import annotations
+
+from typing import Any
+
+from .core_a import _VALUE_PAIR_LEN
+from .format_a import *
+
+
+def _append_namespace_metric_series(
+    lines: list[str],
+    label: str,
+    entries: list[Any],
+    formatter: Any,
+) -> None:
+    parts = []
+    for entry in entries:
+        if not isinstance(entry, dict):
+            continue
+        metric = entry.get("metric") if isinstance(entry.get("metric"), dict) else {}
+        namespace = metric.get("namespace")
+        value = entry.get("value")
+        if namespace:
+            parts.append(f"{namespace}={formatter(value)}")
+    if parts:
+        lines.append(f"{label}: " + "; ".join(parts))
+
+
+def _append_longhorn(lines: list[str], summary: dict[str, Any]) -> None:  # noqa: C901
+    longhorn = summary.get("longhorn") if isinstance(summary.get("longhorn"), dict) else {}
+    if not longhorn:
+        return
+    total = longhorn.get("total")
+    attached = longhorn.get("attached_count")
+    detached = longhorn.get("detached_count")
+    degraded = longhorn.get("degraded_count")
+    by_state = longhorn.get("by_state") if isinstance(longhorn.get("by_state"), dict) else {}
+    by_robust = longhorn.get("by_robustness") if isinstance(longhorn.get("by_robustness"), dict) else {}
+    if total is not None:
+        if attached is None and detached is None and degraded is None:
+            unhealthy = longhorn.get("unhealthy_count")
+            lines.append(f"longhorn: total={total}, unhealthy={unhealthy if unhealthy is not None else 0}")
+        else:
+            lines.append(
+                f"longhorn: total={total}, attached={attached if attached is not None else 0}, "
+                f"detached={detached if detached is not None else 0}, "
+                f"degraded={degraded if degraded is not None else 0}"
+            )
+    if by_state:
+        lines.append("longhorn_state: " + _format_kv_map(by_state))
+    if by_robust:
+
lines.append("longhorn_robustness: " + _format_kv_map(by_robust)) + unhealthy_items = longhorn.get("unhealthy") + if isinstance(unhealthy_items, list) and unhealthy_items: + parts = [] + for entry in unhealthy_items[:5]: + if not isinstance(entry, dict): + continue + name = entry.get("name") + state = entry.get("state") + robustness = entry.get("robustness") + if name: + label = name + if state or robustness: + label = f"{label}({state},{robustness})" + parts.append(label) + if parts: + lines.append("longhorn_unhealthy_top: " + "; ".join(parts)) + + +def _append_namespace_usage(lines: list[str], summary: dict[str, Any]) -> None: + metrics = summary.get("metrics") if isinstance(summary.get("metrics"), dict) else {} + cpu_top = metrics.get("namespace_cpu_top") if isinstance(metrics.get("namespace_cpu_top"), list) else [] + mem_top = metrics.get("namespace_mem_top") if isinstance(metrics.get("namespace_mem_top"), list) else [] + _append_namespace_metric_series(lines, "namespace_cpu_top", cpu_top, _format_float) + _append_namespace_metric_series(lines, "namespace_mem_top", mem_top, _format_bytes) + + +def _append_namespace_requests(lines: list[str], summary: dict[str, Any]) -> None: + metrics = summary.get("metrics") if isinstance(summary.get("metrics"), dict) else {} + cpu_req = metrics.get("namespace_cpu_requests_top") if isinstance(metrics.get("namespace_cpu_requests_top"), list) else [] + mem_req = metrics.get("namespace_mem_requests_top") if isinstance(metrics.get("namespace_mem_requests_top"), list) else [] + _append_namespace_metric_series(lines, "namespace_cpu_requests_top", cpu_req, _format_float) + _append_namespace_metric_series(lines, "namespace_mem_requests_top", mem_req, _format_bytes) + + +def _append_namespace_io_net(lines: list[str], summary: dict[str, Any]) -> None: + metrics = summary.get("metrics") if isinstance(summary.get("metrics"), dict) else {} + net_top = metrics.get("namespace_net_top") if isinstance(metrics.get("namespace_net_top"), list) else [] + io_top = metrics.get("namespace_io_top") if isinstance(metrics.get("namespace_io_top"), list) else [] + _append_namespace_metric_series(lines, "namespace_net_top", net_top, _format_rate_bytes) + _append_namespace_metric_series(lines, "namespace_io_top", io_top, _format_rate_bytes) + + +def _append_pod_usage(lines: list[str], summary: dict[str, Any]) -> None: # noqa: C901 + metrics = summary.get("metrics") if isinstance(summary.get("metrics"), dict) else {} + cpu_top = metrics.get("pod_cpu_top") if isinstance(metrics.get("pod_cpu_top"), list) else [] + cpu_top_node = ( + metrics.get("pod_cpu_top_node") + if isinstance(metrics.get("pod_cpu_top_node"), list) + else [] + ) + mem_top = metrics.get("pod_mem_top") if isinstance(metrics.get("pod_mem_top"), list) else [] + mem_top_node = ( + metrics.get("pod_mem_top_node") + if isinstance(metrics.get("pod_mem_top_node"), list) + else [] + ) + if cpu_top: + parts = [] + for entry in cpu_top: + if not isinstance(entry, dict): + continue + metric = entry.get("metric") if isinstance(entry.get("metric"), dict) else {} + namespace = metric.get("namespace") + pod = metric.get("pod") + value = entry.get("value") + if namespace and pod and value is not None: + parts.append(f"{namespace}/{pod}={_format_float(value)}") + if parts: + lines.append("pod_cpu_top: " + "; ".join(parts)) + if cpu_top_node: + parts = [] + for entry in cpu_top_node: + if not isinstance(entry, dict): + continue + metric = entry.get("metric") if isinstance(entry.get("metric"), dict) else {} + namespace = 
metric.get("namespace") + pod = metric.get("pod") + node = metric.get("node") + value = entry.get("value") + if namespace and pod and node and value is not None: + parts.append(f"{node}:{namespace}/{pod}={_format_float(value)}") + if parts: + lines.append("pod_cpu_top_node: " + "; ".join(parts)) + if mem_top: + parts = [] + for entry in mem_top: + if not isinstance(entry, dict): + continue + metric = entry.get("metric") if isinstance(entry.get("metric"), dict) else {} + namespace = metric.get("namespace") + pod = metric.get("pod") + value = entry.get("value") + if namespace and pod and value is not None: + parts.append(f"{namespace}/{pod}={_format_bytes(value)}") + if parts: + lines.append("pod_mem_top: " + "; ".join(parts)) + if mem_top_node: + parts = [] + for entry in mem_top_node: + if not isinstance(entry, dict): + continue + metric = entry.get("metric") if isinstance(entry.get("metric"), dict) else {} + namespace = metric.get("namespace") + pod = metric.get("pod") + node = metric.get("node") + value = entry.get("value") + if namespace and pod and node and value is not None: + parts.append(f"{node}:{namespace}/{pod}={_format_bytes(value)}") + if parts: + lines.append("pod_mem_top_node: " + "; ".join(parts)) + + +def _append_restarts(lines: list[str], summary: dict[str, Any]) -> None: + metrics = summary.get("metrics") if isinstance(summary.get("metrics"), dict) else {} + top_restarts = metrics.get("top_restarts_1h") or [] + if not isinstance(top_restarts, list) or not top_restarts: + top_restarts = [] + parts = [] + for entry in top_restarts: + metric = entry.get("metric") if isinstance(entry, dict) else {} + value = entry.get("value") if isinstance(entry, dict) else [] + if not isinstance(metric, dict) or not isinstance(value, list) or len(value) < _VALUE_PAIR_LEN: + continue + namespace = metric.get("namespace") + pod = metric.get("pod") + count = _format_float(value[1]) + if namespace and pod: + parts.append(f"{namespace}/{pod}={count}") + if parts: + lines.append("restarts_1h_top: " + "; ".join(parts)) + else: + lines.append("restarts_1h_top: none") + ns_top = metrics.get("restart_namespace_top") or [] + if isinstance(ns_top, list) and ns_top: + ns_parts = [] + for entry in ns_top: + metric = entry.get("metric") if isinstance(entry, dict) else {} + value = entry.get("value") + namespace = metric.get("namespace") if isinstance(metric, dict) else None + if namespace and value is not None: + ns_parts.append(f"{namespace}={_format_float(value)}") + if ns_parts: + lines.append("restarts_1h_namespace_top: " + "; ".join(ns_parts)) + else: + lines.append("restarts_1h_namespace_top: none") + + +def _append_job_failures(lines: list[str], summary: dict[str, Any]) -> None: + metrics = summary.get("metrics") if isinstance(summary.get("metrics"), dict) else {} + failures = metrics.get("job_failures_24h") if isinstance(metrics.get("job_failures_24h"), list) else [] + if not failures: + return + parts = [] + for entry in failures: + if not isinstance(entry, dict): + continue + metric = entry.get("metric") if isinstance(entry.get("metric"), dict) else {} + namespace = metric.get("namespace") + job_name = metric.get("job_name") or metric.get("job") + value = entry.get("value") + if namespace and job_name and value is not None: + parts.append(f"{namespace}/{job_name}={_format_float(value)}") + if parts: + lines.append("job_failures_24h: " + "; ".join(parts)) + + +def _append_jobs(lines: list[str], summary: dict[str, Any]) -> None: + jobs = summary.get("jobs") if isinstance(summary.get("jobs"), dict) 
else {} + if not jobs: + return + totals_line = _format_jobs_totals(jobs) + if totals_line: + lines.append(totals_line) + failing_line = _format_jobs_failing(jobs) + if failing_line: + lines.append(failing_line) + active_line = _format_jobs_active_oldest(jobs) + if active_line: + lines.append(active_line) + + +def _format_jobs_totals(jobs: dict[str, Any]) -> str: + totals = jobs.get("totals") if isinstance(jobs.get("totals"), dict) else {} + if not totals: + return "" + return "jobs: total={total}, active={active}, failed={failed}, succeeded={succeeded}".format( + total=totals.get("total"), + active=totals.get("active"), + failed=totals.get("failed"), + succeeded=totals.get("succeeded"), + ) + + +def _format_jobs_failing(jobs: dict[str, Any]) -> str: + failing = jobs.get("failing") if isinstance(jobs.get("failing"), list) else [] + if not failing: + return "" + parts = [] + for item in failing[:5]: + if not isinstance(item, dict): + continue + namespace = item.get("namespace") + name = item.get("job") + failed = item.get("failed") + age = item.get("age_hours") + if namespace and name and failed is not None: + label = f"{namespace}/{name}={failed}" + if age is not None: + label = f"{label} ({_format_float(age)}h)" + parts.append(label) + return "jobs_failing_top: " + "; ".join(parts) if parts else "" + + +def _format_jobs_active_oldest(jobs: dict[str, Any]) -> str: + active_oldest = jobs.get("active_oldest") if isinstance(jobs.get("active_oldest"), list) else [] + if not active_oldest: + return "" + parts = [] + for item in active_oldest[:5]: + if not isinstance(item, dict): + continue + namespace = item.get("namespace") + name = item.get("job") + age = item.get("age_hours") + if namespace and name and age is not None: + parts.append(f"{namespace}/{name}={_format_float(age)}h") + return "jobs_active_oldest: " + "; ".join(parts) if parts else "" + + +def _append_postgres(lines: list[str], summary: dict[str, Any]) -> None: + postgres = summary.get("postgres") if isinstance(summary.get("postgres"), dict) else {} + if not postgres: + return + hottest = postgres.get("hottest_db") or "" + lines.append( + "postgres: used={used}, max={max}, hottest_db={hottest}".format( + used=postgres.get("used"), + max=postgres.get("max"), + hottest=hottest, + ) + ) + used = postgres.get("used") + max_conn = postgres.get("max") + if used is not None or max_conn is not None: + lines.append(f"postgres_connections_total: used={_format_float(used)}, max={_format_float(max_conn)}") + by_db = postgres.get("by_db") + if isinstance(by_db, list) and by_db: + parts = [] + for entry in by_db: + if not isinstance(entry, dict): + continue + metric = entry.get("metric") if isinstance(entry.get("metric"), dict) else {} + value = entry.get("value") + if isinstance(value, list) and len(value) >= _VALUE_PAIR_LEN: + value = value[1] + name = metric.get("datname") if isinstance(metric, dict) else None + if name and value is not None: + parts.append(f"{name}={_format_float(value)}") + if parts: + lines.append("postgres_connections_by_db: " + "; ".join(parts)) + + +def _append_hottest(lines: list[str], summary: dict[str, Any]) -> None: + hottest = summary.get("hottest") if isinstance(summary.get("hottest"), dict) else {} + if not hottest: + return + hardware_map = summary.get("hardware_by_node") + if not isinstance(hardware_map, dict): + hardware_map = {} + parts = [] + for key, entry in hottest.items(): + if not isinstance(entry, dict): + continue + node = entry.get("node") + hardware = hardware_map.get(node) if node else None + if 
key in {"net", "io"}: + value = _format_rate_bytes(entry.get("value")) + else: + value = _format_float(entry.get("value")) + if value and key in {"cpu", "ram", "disk"}: + value = f"{value}%" + if node: + label = node + if hardware: + label = f"{label} [{hardware}]" + parts.append(f"{key}={label} ({value})") + if parts: + lines.append("hottest: " + "; ".join(parts)) + + +def _append_workloads(lines: list[str], summary: dict[str, Any]) -> None: + workloads = summary.get("workloads") + if not isinstance(workloads, list) or not workloads: + return + lines.append(f"workloads: total={len(workloads)}") + top_workloads = sorted( + (item for item in workloads if isinstance(item, dict)), + key=lambda item: (-int(item.get("pods_total") or 0), item.get("workload") or ""), + )[:5] + if not top_workloads: + return + parts = [] + for item in top_workloads: + namespace = item.get("namespace") + name = item.get("workload") + pods_total = item.get("pods_total") + primary = item.get("primary_node") + if namespace and name: + label = f"{namespace}/{name}={pods_total}" + if primary: + label = f"{label} (primary={primary})" + parts.append(label) + if parts: + lines.append("workloads_top: " + "; ".join(parts)) + + +def _append_topology(lines: list[str], summary: dict[str, Any]) -> None: # noqa: C901 + topology = summary.get("topology") if isinstance(summary.get("topology"), dict) else {} + if not topology: + return + nodes = topology.get("nodes") if isinstance(topology.get("nodes"), list) else [] + workloads = topology.get("workloads") if isinstance(topology.get("workloads"), list) else [] + if nodes: + parts = [] + for entry in nodes[:5]: + if not isinstance(entry, dict): + continue + node = entry.get("node") + top = entry.get("workloads_top") if isinstance(entry.get("workloads_top"), list) else [] + if not node or not top: + continue + items = ", ".join([f"{name}({count})" for name, count in top if name and count is not None]) + if items: + parts.append(f"{node}={items}") + if parts: + lines.append("node_workloads_top: " + "; ".join(parts)) + if workloads: + parts = [] + for entry in workloads[:5]: + if not isinstance(entry, dict): + continue + namespace = entry.get("namespace") + name = entry.get("workload") + nodes_top = entry.get("nodes_top") if isinstance(entry.get("nodes_top"), list) else [] + if not namespace or not name: + continue + nodes_label = ", ".join([f"{node}:{count}" for node, count in nodes_top if node]) + label = f"{namespace}/{name}" + if nodes_label: + label = f"{label} [{nodes_label}]" + parts.append(label) + if parts: + lines.append("workload_nodes_top: " + "; ".join(parts)) + + +def _append_flux(lines: list[str], summary: dict[str, Any]) -> None: + flux = summary.get("flux") if isinstance(summary.get("flux"), dict) else {} + if not flux: + return + not_ready = flux.get("not_ready") + if not_ready is not None: + lines.append(f"flux_not_ready: {not_ready}") + items = flux.get("items") + if isinstance(items, list) and items: + parts = [] + for item in items[:10]: + if not isinstance(item, dict): + continue + name = item.get("name") or "" + namespace = item.get("namespace") or "" + reason = item.get("reason") or "" + suspended = item.get("suspended") + label = f"{namespace}/{name}".strip("/") + if reason: + label = f"{label} ({reason})" + if suspended: + label = f"{label} [suspended]" + if label: + parts.append(label) + if parts: + lines.append("flux_not_ready_items: " + "; ".join(parts)) + + +__all__ = [name for name in globals() if not name.startswith("__")] diff --git 
a/atlasbot/snapshot/builder/format_c.py b/atlasbot/snapshot/builder/format_c.py
new file mode 100644
index 0000000..33c20f4
--- /dev/null
+++ b/atlasbot/snapshot/builder/format_c.py
@@ -0,0 +1,450 @@
+from __future__ import annotations
+
+from typing import Any
+
+from .core_a import PVC_USAGE_CRITICAL
+from .format_b import *
+
+
+def _append_signals(lines: list[str], summary: dict[str, Any]) -> None:
+    signals = summary.get("signals") if isinstance(summary.get("signals"), list) else []
+    if not signals:
+        return
+    lines.append("signals:")
+    for entry in signals[:8]:
+        if not isinstance(entry, dict):
+            continue
+        scope = entry.get("scope") or ""
+        target = entry.get("target") or ""
+        metric = entry.get("metric") or ""
+        current = entry.get("current")
+        delta = entry.get("delta_pct")
+        severity = entry.get("severity") or ""
+        detail = f"{scope}:{target} {metric}={current}"
+        if delta is not None:
+            detail += f" delta={delta}%"
+        if severity:
+            detail += f" severity={severity}"
+        lines.append(f"- {detail}")
+
+
+def _append_profiles(lines: list[str], summary: dict[str, Any]) -> None:  # noqa: C901
+    profiles = summary.get("profiles") if isinstance(summary.get("profiles"), dict) else {}
+    if not profiles:
+        return
+    nodes = profiles.get("nodes") if isinstance(profiles.get("nodes"), list) else []
+    namespaces = profiles.get("namespaces") if isinstance(profiles.get("namespaces"), list) else []
+    workloads = profiles.get("workloads") if isinstance(profiles.get("workloads"), list) else []
+    if nodes:
+        lines.append("node_profiles:")
+        for entry in nodes[:3]:
+            if not isinstance(entry, dict):
+                continue
+            lines.append(
+                f"- {entry.get('node')}: load={entry.get('load_index')} cpu={entry.get('cpu')} ram={entry.get('ram')} "
+                f"pods={entry.get('pods_total')} hw={entry.get('hardware')}"
+            )
+    if namespaces:
+        lines.append("namespace_profiles:")
+        for entry in namespaces[:3]:
+            if not isinstance(entry, dict):
+                continue
+            lines.append(
+                f"- {entry.get('namespace')}: pods={entry.get('pods_total')} cpu={entry.get('cpu_usage')} "
+                f"mem={entry.get('mem_usage')} primary={entry.get('primary_node')}"
+            )
+    if workloads:
+        lines.append("workload_profiles:")
+        for entry in workloads[:3]:
+            if not isinstance(entry, dict):
+                continue
+            lines.append(
+                f"- {entry.get('namespace')}/{entry.get('workload')}: pods={entry.get('pods_total')} "
+                f"running={entry.get('pods_running')} node={entry.get('primary_node')}"
+            )
+
+
+def _append_units_windows(lines: list[str], summary: dict[str, Any]) -> None:
+    metrics = summary.get("metrics") if isinstance(summary.get("metrics"), dict) else {}
+    units = metrics.get("units") if isinstance(metrics.get("units"), dict) else {}
+    windows = metrics.get("windows") if isinstance(metrics.get("windows"), dict) else {}
+    if units:
+        lines.append("units: " + _format_kv_map(units))
+    else:
+        lines.append("units: cpu_pct, ram_pct, net=bytes_per_sec, io=bytes_per_sec")
+    if windows:
+        lines.append("windows: " + _format_kv_map(windows))
+    else:
+        lines.append("windows: rates=5m, restarts=1h")
+
+
+def _append_node_load_summary(lines: list[str], summary: dict[str, Any]) -> None:
+    node_load = summary.get("node_load_summary")
+    if not isinstance(node_load, dict) or not node_load:
+        return
+    hardware_by_node = summary.get("hardware_by_node")
+    hardware_by_node = hardware_by_node if isinstance(hardware_by_node, dict) else {}
+    top = node_load.get("top")
+    if isinstance(top, list) and top:
+        parts = []
+        for entry in top[:5]:
+            if not isinstance(entry, dict):
+                continue
+            node = entry.get("node")
or "" + load = entry.get("load_index") + cpu = entry.get("cpu") + ram = entry.get("ram") + io = entry.get("io") + net = entry.get("net") + pods_total = entry.get("pods_total") + label = f"{node} idx={_format_float(load)}" + if node and node in hardware_by_node: + label += f" hw={hardware_by_node.get(node)}" + if isinstance(pods_total, (int, float)): + label += f" pods={int(pods_total)}" + label += f" cpu={_format_float(cpu)} ram={_format_float(ram)}" + label += f" io={_format_rate_bytes(io)} net={_format_rate_bytes(net)}" + parts.append(label) + if parts: + lines.append("node_load_top: " + "; ".join(parts)) + outliers = node_load.get("outliers") + if isinstance(outliers, list) and outliers: + names = [entry.get("node") for entry in outliers if isinstance(entry, dict)] + names = [name for name in names if isinstance(name, str) and name] + if names: + lines.append("node_load_outliers: " + _format_names(names)) + + +def _append_hardware_usage(lines: list[str], summary: dict[str, Any]) -> None: # noqa: C901 + usage = summary.get("hardware_usage_avg") + if not isinstance(usage, list) or not usage: + return + parts = [] + tops: dict[str, tuple[str, float]] = {} + for entry in usage[:5]: + if not isinstance(entry, dict): + continue + hardware = entry.get("hardware") + load = entry.get("load_index") + cpu = entry.get("cpu") + ram = entry.get("ram") + io = entry.get("io") + net = entry.get("net") + if not hardware: + continue + label = f"{hardware} idx={_format_float(load)}" + label += f" cpu={_format_float(cpu)} ram={_format_float(ram)}" + label += f" io={_format_rate_bytes(io)} net={_format_rate_bytes(net)}" + parts.append(label) + for metric, value in (("cpu", cpu), ("ram", ram), ("io", io), ("net", net), ("load", load)): + if isinstance(value, (int, float)): + current = tops.get(metric) + if current is None or float(value) > current[1]: + tops[metric] = (hardware, float(value)) + if parts: + lines.append("hardware_usage_avg: " + "; ".join(parts)) + if tops: + top_parts = [] + for metric in ("cpu", "ram", "io", "net", "load"): + entry = tops.get(metric) + if not entry: + continue + hardware, value = entry + if metric in {"io", "net"}: + rendered = _format_rate_bytes(value) + else: + rendered = _format_float(value) + top_parts.append(f"{metric}={hardware} ({rendered})") + if top_parts: + lines.append("hardware_usage_top: " + "; ".join(top_parts)) + + +def _append_cluster_watchlist(lines: list[str], summary: dict[str, Any]) -> None: + watchlist = summary.get("cluster_watchlist") + if not isinstance(watchlist, list) or not watchlist: + return + lines.append("cluster_watchlist: " + "; ".join(watchlist)) + + +def _append_baseline_deltas(lines: list[str], summary: dict[str, Any]) -> None: + deltas = summary.get("baseline_deltas") if isinstance(summary.get("baseline_deltas"), dict) else {} + nodes = deltas.get("nodes") if isinstance(deltas.get("nodes"), dict) else {} + namespaces = deltas.get("namespaces") if isinstance(deltas.get("namespaces"), dict) else {} + for scope, block in (("nodes", nodes), ("namespaces", namespaces)): + if not isinstance(block, dict): + continue + for metric, entries in block.items(): + if not isinstance(entries, list) or not entries: + continue + parts: list[str] = [] + for entry in entries[:5]: + if not isinstance(entry, dict): + continue + name = entry.get("node") if scope == "nodes" else entry.get("namespace") + delta = entry.get("delta") + severity = entry.get("severity") + if not isinstance(name, str) or not name or not isinstance(delta, (int, float)): + continue + 
suffix = f" ({severity})" if isinstance(severity, str) and severity else "" + parts.append(f"{name}={_format_float(delta)}%{suffix}") + if parts: + lines.append(f"{scope}_baseline_delta_{metric}: " + "; ".join(parts)) + + +def _append_pod_issue_summary(lines: list[str], summary: dict[str, Any]) -> None: + issues = summary.get("pod_issue_summary") if isinstance(summary.get("pod_issue_summary"), dict) else {} + waiting = issues.get("waiting_reasons_top") if isinstance(issues.get("waiting_reasons_top"), list) else [] + phases = issues.get("phase_reasons_top") if isinstance(issues.get("phase_reasons_top"), list) else [] + namespace_issue = issues.get("namespace_issue_top") if isinstance(issues.get("namespace_issue_top"), dict) else {} + waiting_line = _reason_line(waiting, "pod_waiting_reasons_top") + if waiting_line: + lines.append(waiting_line) + phase_line = _reason_line(phases, "pod_phase_reasons_top") + if phase_line: + lines.append(phase_line) + if namespace_issue: + _append_namespace_issue_lines(lines, namespace_issue) + + +def _reason_line(entries: list[dict[str, Any]], label: str) -> str: + parts = [] + for entry in entries[:5]: + if not isinstance(entry, dict): + continue + reason = entry.get("reason") + count = entry.get("count") + if reason: + parts.append(f"{reason}={count}") + if parts: + return f"{label}: " + "; ".join(parts) + return "" + + +def _append_namespace_issue_lines(lines: list[str], namespace_issue: dict[str, Any]) -> None: + for key, entries in namespace_issue.items(): + if not isinstance(entries, list) or not entries: + continue + parts: list[str] = [] + for entry in entries[:5]: + if not isinstance(entry, dict): + continue + ns = entry.get("namespace") + value = entry.get("value") + if ns: + parts.append(f"{ns}={value}") + if parts: + lines.append(f"namespace_issue_top_{key}: " + "; ".join(parts)) + + +def _build_cluster_watchlist(summary: dict[str, Any]) -> dict[str, Any]: + items: list[str] = [] + nodes_summary = summary.get("nodes_summary") if isinstance(summary.get("nodes_summary"), dict) else {} + not_ready = int(nodes_summary.get("not_ready") or 0) + if not_ready > 0: + items.append(f"not_ready_nodes={not_ready}") + pressure = summary.get("pressure_nodes") if isinstance(summary.get("pressure_nodes"), dict) else {} + pressure_nodes = pressure.get("names") if isinstance(pressure.get("names"), list) else [] + if pressure_nodes: + items.append(f"pressure_nodes={len(pressure_nodes)}") + pod_issues = summary.get("pod_issues") if isinstance(summary.get("pod_issues"), dict) else {} + pending_over = int(pod_issues.get("pending_over_15m") or 0) + if pending_over > 0: + items.append(f"pods_pending_over_15m={pending_over}") + workloads = summary.get("workloads_health") if isinstance(summary.get("workloads_health"), dict) else {} + deployments = workloads.get("deployments") if isinstance(workloads.get("deployments"), dict) else {} + statefulsets = workloads.get("statefulsets") if isinstance(workloads.get("statefulsets"), dict) else {} + daemonsets = workloads.get("daemonsets") if isinstance(workloads.get("daemonsets"), dict) else {} + total_not_ready = int(deployments.get("not_ready") or 0) + int(statefulsets.get("not_ready") or 0) + int(daemonsets.get("not_ready") or 0) + if total_not_ready > 0: + items.append(f"workloads_not_ready={total_not_ready}") + flux = summary.get("flux") if isinstance(summary.get("flux"), dict) else {} + flux_not_ready = int(flux.get("not_ready") or 0) + if flux_not_ready > 0: + items.append(f"flux_not_ready={flux_not_ready}") + pvc_usage = 
summary.get("pvc_usage_top") if isinstance(summary.get("pvc_usage_top"), list) else [] + high_pvc = [ + entry for entry in pvc_usage if isinstance(entry, dict) and (entry.get("value") or 0) >= PVC_USAGE_CRITICAL + ] + if high_pvc: + items.append(f"pvc_usage>={PVC_USAGE_CRITICAL}%") + return {"cluster_watchlist": items} if items else {} + + +def _capacity_ratio_parts(entries: list[dict[str, Any]], ratio_key: str, usage_key: str, req_key: str) -> list[str]: + parts: list[str] = [] + for entry in entries[:5]: + if not isinstance(entry, dict): + continue + ns = entry.get("namespace") or "" + ratio = entry.get(ratio_key) + usage = entry.get(usage_key) + req = entry.get(req_key) + if ns: + parts.append( + f"{ns}={_format_float(ratio)} (usage={_format_float(usage)} req={_format_float(req)})" + ) + return parts + + +def _capacity_headroom_parts(entries: list[dict[str, Any]]) -> list[str]: + parts: list[str] = [] + for entry in entries[:5]: + if not isinstance(entry, dict): + continue + ns = entry.get("namespace") or "" + headroom = entry.get("headroom") + if ns: + parts.append(f"{ns}={_format_float(headroom)}") + return parts + + +def _append_namespace_capacity_summary( # noqa: C901 + lines: list[str], + summary: dict[str, Any], +) -> None: + cap = summary.get("namespace_capacity_summary") + if not isinstance(cap, dict) or not cap: + return + cpu_ratio = cap.get("cpu_ratio_top") + if isinstance(cpu_ratio, list): + parts = _capacity_ratio_parts(cpu_ratio, "cpu_usage_ratio", "cpu_usage", "cpu_requests") + if parts: + lines.append("namespace_cpu_ratio_top: " + "; ".join(parts)) + mem_ratio = cap.get("mem_ratio_top") + if isinstance(mem_ratio, list): + parts = _capacity_ratio_parts(mem_ratio, "mem_usage_ratio", "mem_usage", "mem_requests") + if parts: + lines.append("namespace_mem_ratio_top: " + "; ".join(parts)) + cpu_headroom = cap.get("cpu_headroom_low") + if isinstance(cpu_headroom, list): + parts = _capacity_headroom_parts(cpu_headroom) + if parts: + lines.append("namespace_cpu_headroom_low: " + "; ".join(parts)) + mem_headroom = cap.get("mem_headroom_low") + if isinstance(mem_headroom, list): + parts = _capacity_headroom_parts(mem_headroom) + if parts: + lines.append("namespace_mem_headroom_low: " + "; ".join(parts)) + cpu_over = cap.get("cpu_overcommitted") + mem_over = cap.get("mem_overcommitted") + if cpu_over is not None or mem_over is not None: + lines.append(f"namespace_overcommitted: cpu={cpu_over} mem={mem_over}") + cpu_over_names = cap.get("cpu_overcommitted_names") + if isinstance(cpu_over_names, list) and cpu_over_names: + names = [name for name in cpu_over_names if isinstance(name, str) and name] + if names: + lines.append("namespace_cpu_overcommitted_names: " + _format_names(names)) + mem_over_names = cap.get("mem_overcommitted_names") + if isinstance(mem_over_names, list) and mem_over_names: + names = [name for name in mem_over_names if isinstance(name, str) and name] + if names: + lines.append("namespace_mem_overcommitted_names: " + _format_names(names)) + + +def _append_workloads_by_namespace(lines: list[str], summary: dict[str, Any]) -> None: + workloads = summary.get("workloads") + if not isinstance(workloads, list) or not workloads: + return + by_ns: dict[str, list[dict[str, Any]]] = {} + for item in workloads: + if not isinstance(item, dict): + continue + ns = item.get("namespace") or "" + name = item.get("workload") or "" + if not ns or not name: + continue + by_ns.setdefault(ns, []).append(item) + for ns, items in sorted(by_ns.items()): + items.sort( + key=lambda item: 
(-int(item.get("pods_total") or 0), item.get("workload") or "") + ) + parts = [] + for entry in items[:2]: + name = entry.get("workload") or "" + pods = entry.get("pods_total") + primary = entry.get("primary_node") + label = f"{name}({pods})" if pods is not None else name + if primary: + label = f"{label}@{primary}" + if label: + parts.append(label) + if parts: + lines.append(f"workloads_top_{ns}: " + "; ".join(parts)) + + +def _append_lexicon(lines: list[str], summary: dict[str, Any]) -> None: + lexicon = summary.get("lexicon") + if not isinstance(lexicon, dict): + return + terms = lexicon.get("terms") if isinstance(lexicon.get("terms"), list) else [] + aliases = lexicon.get("aliases") if isinstance(lexicon.get("aliases"), dict) else {} + for entry in terms[:8]: + if not isinstance(entry, dict): + continue + term = entry.get("term") + meaning = entry.get("meaning") + if term and meaning: + lines.append(f"lexicon_term: {term} => {meaning}") + for key, value in list(aliases.items())[:6]: + if key and value: + lines.append(f"lexicon_alias: {key} => {value}") + + +def _append_cross_stats(lines: list[str], summary: dict[str, Any]) -> None: # noqa: C901 + cross_stats = summary.get("cross_stats") + if not isinstance(cross_stats, dict): + return + node_entries = cross_stats.get("node_metric_top") if isinstance(cross_stats.get("node_metric_top"), list) else [] + for entry in node_entries[:10]: + if not isinstance(entry, dict): + continue + metric = entry.get("metric") + node = entry.get("node") + value = entry.get("value") + cpu = entry.get("cpu") + ram = entry.get("ram") + net = entry.get("net") + io = entry.get("io") + pods = entry.get("pods_total") + if metric and node: + parts = [ + f"value={_format_float(value)}", + f"cpu={_format_float(cpu)}", + f"ram={_format_float(ram)}", + f"net={_format_float(net)}", + f"io={_format_float(io)}", + ] + if pods is not None: + parts.append(f"pods={pods}") + lines.append(f"cross_node_{metric}: {node} " + " ".join(parts)) + ns_entries = cross_stats.get("namespace_metric_top") if isinstance(cross_stats.get("namespace_metric_top"), list) else [] + for entry in ns_entries[:10]: + if not isinstance(entry, dict): + continue + metric = entry.get("metric") + namespace = entry.get("namespace") + value = entry.get("value") + pods = entry.get("pods_total") + cpu_ratio = entry.get("cpu_ratio") + mem_ratio = entry.get("mem_ratio") + if metric and namespace: + parts = [ + f"value={_format_float(value)}", + f"cpu_ratio={_format_float(cpu_ratio)}", + f"mem_ratio={_format_float(mem_ratio)}", + ] + if pods is not None: + parts.append(f"pods={pods}") + lines.append(f"cross_namespace_{metric}: {namespace} " + " ".join(parts)) + pvc_entries = cross_stats.get("pvc_top") if isinstance(cross_stats.get("pvc_top"), list) else [] + for entry in pvc_entries[:5]: + if not isinstance(entry, dict): + continue + namespace = entry.get("namespace") + pvc = entry.get("pvc") + used = entry.get("used_percent") + if namespace and pvc: + lines.append(f"cross_pvc_usage: {namespace}/{pvc} used={_format_float(used)}") + + +__all__ = [name for name in globals() if not name.startswith("__")] diff --git a/atlasbot/snapshot/builder/summary_text.py b/atlasbot/snapshot/builder/summary_text.py new file mode 100644 index 0000000..9072bd7 --- /dev/null +++ b/atlasbot/snapshot/builder/summary_text.py @@ -0,0 +1,72 @@ +from __future__ import annotations + +from typing import Any + +from .core_a import * +from .core_b import * +from .format_a import * +from .format_b import * +from .format_c import * + + +def 
summary_text(snapshot: dict[str, Any] | None) -> str: + """Render the snapshot summary into deterministic prompt text.""" + + summary = build_summary(snapshot) + if not summary: + return "" + lines: list[str] = [] + lines.append("atlas_cluster: Titan Lab Atlas Kubernetes cluster (internal).") + collected_at = snapshot.get("collected_at") if isinstance(snapshot, dict) else None + snapshot_version = snapshot.get("snapshot_version") if isinstance(snapshot, dict) else None + if collected_at or snapshot_version: + bits = [] + if collected_at: + bits.append(f"collected_at={collected_at}") + if snapshot_version: + bits.append(f"version={snapshot_version}") + lines.append("snapshot: " + ", ".join(bits)) + _append_nodes(lines, summary) + _append_hardware(lines, summary) + _append_hardware_groups(lines, summary) + _append_lexicon(lines, summary) + _append_pressure(lines, summary) + _append_node_facts(lines, summary) + _append_node_ages(lines, summary) + _append_node_taints(lines, summary) + _append_capacity(lines, summary) + _append_pods(lines, summary) + _append_namespace_pods(lines, summary) + _append_namespace_nodes(lines, summary) + _append_node_pods(lines, summary) + _append_pod_issues(lines, summary) + _append_pod_issue_summary(lines, summary) + _append_workload_health(lines, summary) + _append_events(lines, summary) + _append_node_usage_stats(lines, summary) + _append_namespace_usage(lines, summary) + _append_namespace_requests(lines, summary) + _append_namespace_io_net(lines, summary) + _append_pod_usage(lines, summary) + _append_restarts(lines, summary) + _append_job_failures(lines, summary) + _append_jobs(lines, summary) + _append_postgres(lines, summary) + _append_hottest(lines, summary) + _append_pvc_usage(lines, summary) + _append_root_disk_headroom(lines, summary) + _append_namespace_capacity_summary(lines, summary) + _append_baseline_deltas(lines, summary) + _append_longhorn(lines, summary) + _append_workloads(lines, summary) + _append_topology(lines, summary) + _append_workloads_by_namespace(lines, summary) + _append_node_load_summary(lines, summary) + _append_cluster_watchlist(lines, summary) + _append_hardware_usage(lines, summary) + _append_cross_stats(lines, summary) + _append_flux(lines, summary) + _append_signals(lines, summary) + _append_profiles(lines, summary) + _append_units_windows(lines, summary) + return "\n".join(lines) diff --git a/atlasbot/state/store.py b/atlasbot/state/store.py index 6c9c408..2eff9eb 100644 --- a/atlasbot/state/store.py +++ b/atlasbot/state/store.py @@ -6,6 +6,17 @@ from typing import Any class ClaimStore: + """Persist conversation claims for follow-up answers. + + Why: + - keep short-lived conversation state durable across turns without + forcing the answer engine to own storage mechanics. + + Input/Output: + - accepts a SQLite path and TTL, stores claim payloads, and returns + normalized payload dictionaries when queried. 
+ """ + def __init__(self, path: str, ttl_sec: int) -> None: self._path = path or ":memory:" self._ttl = max(60, ttl_sec) diff --git a/pyproject.toml b/pyproject.toml new file mode 100644 index 0000000..10d8362 --- /dev/null +++ b/pyproject.toml @@ -0,0 +1,21 @@ +[tool.pytest.ini_options] +testpaths = ["tests", "testing"] +pythonpath = ["."] + +[tool.ruff] +line-length = 100 +target-version = "py312" + +[tool.ruff.lint] +select = ["E", "F", "W", "B", "C90", "I", "PLR", "RUF", "SIM", "UP", "ARG"] +ignore = ["E501"] + +[tool.ruff.lint.per-file-ignores] +"atlasbot/engine/answerer/*.py" = ["F403", "F405", "I001"] +"atlasbot/engine/answerer/__init__.py" = ["C90", "PLR", "SIM", "ARG", "RUF", "UP", "I001"] +"atlasbot/matrix/bot.py" = ["C90", "PLR", "SIM", "ARG", "RUF", "UP", "I001"] +"atlasbot/snapshot/builder/__init__.py" = ["F403", "F405", "I001"] +"atlasbot/snapshot/builder/*.py" = ["F403", "F405", "I001"] +"testing/*.py" = ["PLR0911", "ARG002", "PLR2004"] +"tests/*.py" = ["PLR2004", "I001", "ARG001", "ARG002", "ARG005", "C901", "PLR0915", "UP037"] +"scripts/*.py" = ["PLR0911", "PLR2004"] diff --git a/scripts/check_coverage.py b/scripts/check_coverage.py new file mode 100755 index 0000000..727057c --- /dev/null +++ b/scripts/check_coverage.py @@ -0,0 +1,42 @@ +#!/usr/bin/env python3 +"""Enforce per-file coverage thresholds from SlipCover JSON output.""" + +from __future__ import annotations + +import argparse +import json +from pathlib import Path + + +def main() -> int: + """Check each production file against a minimum coverage percentage.""" + + parser = argparse.ArgumentParser() + parser.add_argument("coverage_json") + parser.add_argument("--root", default="atlasbot") + parser.add_argument("--threshold", type=float, default=95.0) + args = parser.parse_args() + + data = json.loads(Path(args.coverage_json).read_text(encoding="utf-8")) + files = data.get("files") if isinstance(data, dict) else {} + violations: list[tuple[float, str]] = [] + for path, payload in sorted(files.items()): + if not path.startswith(f"{args.root}/"): + continue + summary = payload.get("summary") if isinstance(payload, dict) else {} + percent = summary.get("percent_covered") if isinstance(summary, dict) else None + if not isinstance(percent, (int, float)): + continue + if float(percent) < args.threshold: + violations.append((float(percent), path)) + + if violations: + for percent, path in sorted(violations): + print(f"{path}: {percent:.2f}% < {args.threshold:.2f}%") + return 1 + return 0 + + +if __name__ == "__main__": + raise SystemExit(main()) + diff --git a/scripts/check_docstrings.py b/scripts/check_docstrings.py new file mode 100755 index 0000000..0eb5639 --- /dev/null +++ b/scripts/check_docstrings.py @@ -0,0 +1,83 @@ +#!/usr/bin/env python3 +"""Require docstrings on public production APIs.""" + +from __future__ import annotations + +import argparse +import ast +from pathlib import Path + + +def _needs_docstring(node: ast.AST, *, parent_class: str | None = None) -> bool: + """Decide whether `node` should carry a contract docstring.""" + + if isinstance(node, (ast.FunctionDef, ast.AsyncFunctionDef)): + name = node.name + if name.startswith("_") and name != "__init__": + return False + return not (parent_class and name.startswith("_")) + if isinstance(node, ast.ClassDef): + if node.name.startswith("_"): + return False + if any( + (isinstance(dec, ast.Name) and dec.id == "dataclass") + or (isinstance(dec, ast.Call) and isinstance(dec.func, ast.Name) and dec.func.id == "dataclass") + for dec in node.decorator_list + 
):
+            return False
+        if any(
+            isinstance(base, ast.Name) and base.id in {"Exception", "RuntimeError", "BaseException"}
+            for base in node.bases
+        ):
+            return False
+        return not any(isinstance(base, ast.Name) and base.id == "BaseModel" for base in node.bases)
+    return False
+
+
+def _iter_nodes(tree: ast.AST) -> list[tuple[ast.AST, str | None]]:
+    """Return the top-level module nodes, each paired with no parent class.
+
+    The gate focuses on the module surface area rather than every internal
+    method so we can keep contracts on the actual API seams.
+    """
+
+    items: list[tuple[ast.AST, str | None]] = []
+    for node in getattr(tree, "body", []):
+        items.append((node, None))
+    return items
+
+
+def main() -> int:
+    """Check modules under the production package and report missing contracts."""
+
+    parser = argparse.ArgumentParser()
+    parser.add_argument("--root", default="atlasbot")
+    args = parser.parse_args()
+
+    root = Path(args.root)
+    violations: list[str] = []
+    for path in sorted(root.rglob("*.py")):
+        if "__pycache__" in path.parts or ".venv" in path.parts:
+            continue
+        tree = ast.parse(path.read_text(encoding="utf-8"))
+        for node, parent_class in _iter_nodes(tree):
+            if not _needs_docstring(node, parent_class=parent_class):
+                continue
+            doc = ast.get_docstring(node)
+            if doc:
+                continue
+            if isinstance(node, ast.ClassDef):
+                violations.append(f"{path}: class {node.name} is missing a docstring")
+            elif isinstance(node, (ast.FunctionDef, ast.AsyncFunctionDef)):
+                owner = f"{parent_class}." if parent_class else ""
+                violations.append(f"{path}: {owner}{node.name} is missing a docstring")
+
+    if violations:
+        for item in violations:
+            print(item)
+        return 1
+    return 0
+
+
+if __name__ == "__main__":
+    raise SystemExit(main())
diff --git a/scripts/check_file_sizes.py b/scripts/check_file_sizes.py
new file mode 100755
index 0000000..d86f204
--- /dev/null
+++ b/scripts/check_file_sizes.py
@@ -0,0 +1,70 @@
+#!/usr/bin/env python3
+"""Fail when production Python files exceed the configured line budget.
+
+The gate is intentionally narrow:
+- it only checks the `atlasbot/` package tree;
+- it treats each file independently;
+- it keeps the threshold explicit so CI can ratchet without guesswork.
+"""
+
+from __future__ import annotations
+
+import argparse
+from pathlib import Path
+
+
+def _count_lines(path: Path) -> int:
+    """Return the physical line count for `path`.
+
+    Input:
+    - `path`: a readable Python source file.
+
+    Output:
+    - The number of newline-delimited lines in the file.
+    """
+
+    return len(path.read_text(encoding="utf-8").splitlines())
+
+
+def _iter_python_files(root: Path) -> list[Path]:
+    """List production Python files under `root`.
+
+    Input:
+    - `root`: repository package root to scan.
+
+    Output:
+    - Sorted Python file paths, excluding bytecode and hidden caches.
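+
+    Example (illustrative):
+    - for a tree containing `atlasbot/config.py` and `atlasbot/main.py`,
+      both paths are returned, already in sorted order.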
+ """ + + return sorted( + path + for path in root.rglob("*.py") + if path.is_file() and "__pycache__" not in path.parts and ".venv" not in path.parts + ) + + +def main() -> int: + """Run the size gate and return a process exit code.""" + + parser = argparse.ArgumentParser() + parser.add_argument("--root", default="atlasbot") + parser.add_argument("--max-lines", type=int, default=500) + args = parser.parse_args() + + root = Path(args.root) + violations: list[tuple[int, Path]] = [] + for path in _iter_python_files(root): + lines = _count_lines(path) + if lines > args.max_lines: + violations.append((lines, path)) + + if violations: + for lines, path in sorted(violations, reverse=True): + print(f"{path}: {lines} lines (limit {args.max_lines})") + return 1 + return 0 + + +if __name__ == "__main__": + raise SystemExit(main()) + diff --git a/testing/__init__.py b/testing/__init__.py new file mode 100644 index 0000000..99e3152 --- /dev/null +++ b/testing/__init__.py @@ -0,0 +1,2 @@ +"""Shared testing helpers for atlasbot.""" + diff --git a/testing/fakes.py b/testing/fakes.py new file mode 100644 index 0000000..5d3805c --- /dev/null +++ b/testing/fakes.py @@ -0,0 +1,108 @@ +"""Reusable test doubles and settings factories.""" + +from __future__ import annotations + +import asyncio + +from atlasbot.config import Settings + + +class FakeLLM: + """Deterministic LLM double for pipeline tests. + + Why: + - keeps the answer engine tests fast and predictable. + + Input/Output: + - accepts the same `chat()` signature as the real client; + - returns canned JSON or text snippets based on the prompt content. + """ + + def __init__(self) -> None: + self.calls: list[str] = [] + + async def chat(self, messages, *, model=None, timeout_sec=None): + """Return a prompt-shaped response and remember the last user prompt.""" + + prompt = messages[-1]["content"] + self.calls.append(prompt) + if "normalized" in prompt and "keywords" in prompt: + return '{"normalized":"What is Atlas?","keywords":["atlas"]}' + if "needs_snapshot" in prompt: + return '{"needs_snapshot": true, "answer_style": "direct"}' + if "sub-questions" in prompt: + return '[{"id":"q1","question":"What is Atlas?","priority":1}]' + if "sub-question" in prompt: + return "Atlas has 22 nodes." + if "Answer using only the Fact Sheet" in prompt: + return "Atlas has 22 nodes." + if "final response" in prompt: + return "Atlas has 22 nodes." 
+ if "Score response quality" in prompt: + return '{"confidence":80,"relevance":90,"satisfaction":85,"hallucination_risk":"low"}' + if "claims list" in prompt: + return '{"claims": []}' + return "{}" + + +class SlowFakeLLM(FakeLLM): + """Variant that sleeps briefly so timeout guards can be exercised.""" + + async def chat(self, messages, *, model=None, timeout_sec=None): + """Delay before answering to make budget handling deterministic.""" + + await asyncio.sleep(0.02) + return await super().chat(messages, model=model, timeout_sec=timeout_sec) + + +def build_test_settings() -> Settings: + """Create a fully populated `Settings` instance for unit tests.""" + + return Settings( + matrix_base="", + auth_base="", + bot_user="", + bot_pass="", + room_alias="", + server_name="", + bot_mentions=(), + matrix_bots=(), + ollama_url="", + ollama_model="base", + ollama_model_fast="fast", + ollama_model_smart="smart", + ollama_model_genius="genius", + ollama_fallback_model="", + ollama_timeout_sec=1.0, + ollama_retries=0, + ollama_api_key="", + http_port=8090, + internal_token="", + kb_dir="", + vm_url="", + ariadne_state_url="", + ariadne_state_token="", + snapshot_ttl_sec=30, + thinking_interval_sec=30, + quick_time_budget_sec=15.0, + smart_time_budget_sec=45.0, + genius_time_budget_sec=180.0, + conversation_ttl_sec=300, + snapshot_pin_enabled=False, + queue_enabled=False, + nats_url="", + nats_stream="", + nats_subject="", + nats_result_bucket="", + fast_max_angles=1, + smart_max_angles=1, + genius_max_angles=1, + fast_max_candidates=1, + smart_max_candidates=1, + genius_max_candidates=1, + fast_llm_calls_max=9, + smart_llm_calls_max=17, + genius_llm_calls_max=32, + llm_limit_multiplier=1.5, + state_db_path="/tmp/atlasbot_test_state.db", + ) diff --git a/tests/test_engine.py b/tests/test_engine.py index 9d92251..b3f0617 100644 --- a/tests/test_engine.py +++ b/tests/test_engine.py @@ -1,98 +1,21 @@ +"""Answer-engine regression tests.""" + +from __future__ import annotations + import asyncio from dataclasses import replace from atlasbot.engine.answerer import AnswerEngine from atlasbot.knowledge.loader import KnowledgeBase from atlasbot.snapshot.builder import SnapshotProvider -from atlasbot.config import Settings +from testing.fakes import FakeLLM, SlowFakeLLM, build_test_settings -class FakeLLM: - def __init__(self) -> None: - self.calls: list[str] = [] +def test_engine_answer_basic() -> None: + """The quick path should answer from the fact sheet.""" - async def chat(self, messages, *, model=None, timeout_sec=None): - prompt = messages[-1]["content"] - self.calls.append(prompt) - if "normalized" in prompt and "keywords" in prompt: - return '{"normalized":"What is Atlas?","keywords":["atlas"]}' - if "needs_snapshot" in prompt: - return '{"needs_snapshot": true, "answer_style": "direct"}' - if "sub-questions" in prompt: - return '[{"id":"q1","question":"What is Atlas?","priority":1}]' - if "sub-question" in prompt: - return "Atlas has 22 nodes." - if "Answer using only the Fact Sheet" in prompt: - return "Atlas has 22 nodes." - if "final response" in prompt: - return "Atlas has 22 nodes." 
- if "Score response quality" in prompt: - return '{"confidence":80,"relevance":90,"satisfaction":85,"hallucination_risk":"low"}' - if "claims list" in prompt: - return '{"claims": []}' - return "{}" - - -class SlowFakeLLM(FakeLLM): - async def chat(self, messages, *, model=None, timeout_sec=None): - await asyncio.sleep(0.02) - return await super().chat(messages, model=model, timeout_sec=timeout_sec) - - -def _settings() -> Settings: - return Settings( - matrix_base="", - auth_base="", - bot_user="", - bot_pass="", - room_alias="", - server_name="", - bot_mentions=(), - matrix_bots=(), - ollama_url="", - ollama_model="base", - ollama_model_fast="fast", - ollama_model_smart="smart", - ollama_model_genius="genius", - ollama_fallback_model="", - ollama_timeout_sec=1.0, - ollama_retries=0, - ollama_api_key="", - http_port=8090, - internal_token="", - kb_dir="", - vm_url="", - ariadne_state_url="", - ariadne_state_token="", - snapshot_ttl_sec=30, - thinking_interval_sec=30, - quick_time_budget_sec=15.0, - smart_time_budget_sec=45.0, - genius_time_budget_sec=180.0, - conversation_ttl_sec=300, - snapshot_pin_enabled=False, - queue_enabled=False, - nats_url="", - nats_stream="", - nats_subject="", - nats_result_bucket="", - fast_max_angles=1, - smart_max_angles=1, - genius_max_angles=1, - fast_max_candidates=1, - smart_max_candidates=1, - genius_max_candidates=1, - fast_llm_calls_max=9, - smart_llm_calls_max=17, - genius_llm_calls_max=32, - llm_limit_multiplier=1.5, - state_db_path="/tmp/atlasbot_test_state.db", - ) - - -def test_engine_answer_basic(): llm = FakeLLM() - settings = _settings() + settings = build_test_settings() kb = KnowledgeBase("") snapshot = SnapshotProvider(settings) engine = AnswerEngine(settings, llm, kb, snapshot) @@ -101,9 +24,11 @@ def test_engine_answer_basic(): assert "Atlas has 22 nodes" in result.reply -def test_smart_mode_uses_factsheet_path(): +def test_smart_mode_uses_factsheet_path() -> None: + """Smart mode should stay on the factsheet branch for direct cluster questions.""" + llm = FakeLLM() - settings = _settings() + settings = build_test_settings() kb = KnowledgeBase("") snapshot = SnapshotProvider(settings) engine = AnswerEngine(settings, llm, kb, snapshot) @@ -113,9 +38,11 @@ def test_smart_mode_uses_factsheet_path(): assert "time budget" not in result.reply.lower() -def test_genius_mode_uses_factsheet_path(): +def test_genius_mode_uses_factsheet_path() -> None: + """Genius mode should also return the factsheet answer for the same query.""" + llm = FakeLLM() - settings = _settings() + settings = build_test_settings() kb = KnowledgeBase("") snapshot = SnapshotProvider(settings) engine = AnswerEngine(settings, llm, kb, snapshot) @@ -125,9 +52,11 @@ def test_genius_mode_uses_factsheet_path(): assert "time budget" not in result.reply.lower() -def test_plain_math_question_is_rejected_for_cluster_modes(): +def test_plain_math_question_is_rejected_for_cluster_modes() -> None: + """The bot should keep users on cluster questions instead of generic math.""" + llm = FakeLLM() - settings = _settings() + settings = build_test_settings() kb = KnowledgeBase("") snapshot = SnapshotProvider(settings) engine = AnswerEngine(settings, llm, kb, snapshot) @@ -136,9 +65,11 @@ def test_plain_math_question_is_rejected_for_cluster_modes(): assert "focus on Titan cluster operations" in result.reply -def test_quick_mode_time_budget_guard(): +def test_quick_mode_time_budget_guard() -> None: + """A slow model call should trip the quick-mode budget guard.""" + llm = SlowFakeLLM() - 
settings = replace(_settings(), quick_time_budget_sec=0.01) + settings = replace(build_test_settings(), quick_time_budget_sec=0.01) kb = KnowledgeBase("") snapshot = SnapshotProvider(settings) engine = AnswerEngine(settings, llm, kb, snapshot) diff --git a/tests/test_quality_gate_paths.py b/tests/test_quality_gate_paths.py new file mode 100644 index 0000000..b04dfda --- /dev/null +++ b/tests/test_quality_gate_paths.py @@ -0,0 +1,810 @@ +"""Targeted quality-gate coverage for runtime and answerer orchestration.""" + +from __future__ import annotations + +import asyncio +import json +from dataclasses import replace +from pathlib import Path +from types import SimpleNamespace +from typing import Any + +import httpx +import pytest + +from atlasbot.api.http import Api, AnswerRequest +from atlasbot.config import MatrixBotConfig +from atlasbot.engine.answerer import ( + AnswerEngine, + AnswerResult, + AnswerScores, + ClaimItem, + EvidenceItem, + ModePlan, +) +from atlasbot.engine.answerer.common import _mode_plan +from atlasbot.engine.answerer.engine import AnswerEngine as EngineClass +from atlasbot.engine.answerer.workflow import run_answer +from atlasbot.engine.answerer.workflow_post import finalize_answer +from atlasbot.knowledge.loader import KnowledgeBase +from atlasbot.llm.client import LLMClient, LLMError, parse_json +from atlasbot.main import result_scores +from atlasbot.matrix.bot import MatrixBot, MatrixClient +from atlasbot.queue.nats import QueueManager +from atlasbot.snapshot.builder import SnapshotProvider, build_summary +from testing.fakes import build_test_settings +from tests.test_support_modules import _rich_snapshot + + +class StaticSnapshot: + """Return a fixed snapshot for answer-engine tests.""" + + def __init__(self, payload: dict[str, Any]) -> None: + self._payload = payload + + def get(self) -> dict[str, Any]: + """Return the stored snapshot payload.""" + + return self._payload + + +class PromptLLM: + """Map prompt fragments to canned responses for workflow tests.""" + + def __init__(self) -> None: + self.calls: list[tuple[str, str]] = [] + + async def chat( + self, + messages: list[dict[str, str]], + *, + model: str | None = None, + timeout_sec: float | None = None, + ) -> str: + """Return the scripted response for the latest user prompt.""" + + del timeout_sec + system = messages[0]["content"] + prompt = messages[-1]["content"] + self.calls.append((model or "", prompt)) + if "Given chunk summaries, score relevance" in prompt: + items = [] + for line in prompt.splitlines(): + if line.startswith("- c"): + chunk_id = line.split()[1].rstrip(":") + score = 95 if "cpu" in line.lower() or "synapse" in line.lower() else 80 + items.append({"id": chunk_id, "score": score, "reason": "relevant"}) + return json.dumps(items or [{"id": "c0", "score": 90, "reason": "relevant"}]) + direct = self._direct_response(prompt) + if direct is not None: + return direct + response = self._lookup_response(system, prompt) + if response is not None: + return response + raise AssertionError(f"Unhandled prompt:\nSYSTEM={system}\nPROMPT={prompt}") + + def _direct_response(self, prompt: str) -> str | None: + """Return direct string responses for a few prompt families.""" + + if "Answer the sub-question using the context" in prompt: + return "The best runbook path is runbooks/fix.md." if "runbook" in prompt.lower() else "synapse is hottest with cpu 95 on titan-01." 
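+        # Marker table: each (fragment, response) pair scripts one prompt family;
+        # the first matching fragment wins, and the "Draft:" entry only fires when
+        # the prompt carries the "If Facts are provided" preamble.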
+ markers = [ + ("Write a final response to the user", "titan-99 is hottest and the runbook is runbooks/wrong.md."), + ("Draft:", "synapse is hottest at cpu 95 on titan-01, and amd64 nodes remain separate from raspberry hardware."), + ("Return JSON with fields: issues", '{"issues":["mention the exact runbook"],"missing_data":[],"risky_claims":[]}'), + ("command (string), rationale", '{"command":"kubectl top pods -n synapse","rationale":"verify namespace cpu"}'), + ("confidence (0-100)", '{"confidence":88,"relevance":91,"satisfaction":86,"hallucination_risk":"low"}'), + ] + for marker, response in markers: + if marker in prompt: + if marker == "Draft:" and "If Facts are provided" not in prompt: + continue + return response + return None + + def _lookup_response(self, system: str, prompt: str) -> str | None: + """Return canned responses for prompt markers.""" + + del system + markers = [ + ( + "normalized (string), keywords", + '{"normalized":"Which namespace is hottest on raspberry hardware and which runbook should I use?","keywords":["namespace","hottest","cpu","raspberry","runbook"]}', + ), + ( + "needs_snapshot (bool)", + '{"needs_snapshot":true,"needs_kb":true,"needs_tool":true,"answer_style":"insightful","follow_up":false,"question_type":"open_ended","focus_entity":"namespace","focus_metric":"cpu"}', + ), + ( + "Generate up to", + '[{"id":"q1","question":"Which namespace is hottest?","priority":5,"kind":"metric"},{"id":"q2","question":"Which runbook applies?","priority":4,"kind":"context"}]', + ), + ("Choose the run that best aligns", '{"selected_index": 1}'), + ("AvailableKeys:", '{"keys":["namespace_cpu_top","namespace_pods","hardware_nodes"]}'), + ("Return JSON with field: missing", '{"missing":[]}'), + ("Return JSON with fields: prefixes", '{"prefixes":["namespace","hottest"]}'), + ("fact_types", '{"fact_types":["namespace_cpu_top","hardware_nodes"]}'), + ("Return JSON with field: signals", '{"signals":["cpu","synapse","raspberry"]}'), + ( + "Signals:", + '{"lines":["namespace_cpu_top: synapse=95","hardware_nodes: rpi5=(titan-01) | amd64=(titan-02)"]}', + ), + ( + "Return JSON with field: lines", + '{"lines":["namespace_cpu_top: synapse=95","hardware_nodes: rpi5=(titan-01) | amd64=(titan-02)"]}', + ), + ( + "CandidateFacts:", + '{"lines":["namespace_cpu_top: synapse=95","hardware_nodes: rpi5=(titan-01) | amd64=(titan-02)"]}', + ), + ( + "FactCandidates:", + '{"lines":["namespace_cpu_top: synapse=95","hardware_nodes: rpi5=(titan-01) | amd64=(titan-02)"]}', + ), + ( + "Suggest a safe, read-only command", + '{"command":"kubectl top pods -n synapse","rationale":"verify namespace cpu"}', + ), + ("Pick the best candidate for accuracy and grounding", '{"best": 1}'), + ("Pick the best draft for accuracy", '{"best": 1}'), + ("Pick the best runbook path", '{"path":"runbooks/fix.md"}'), + ("Check the draft against the context", "synapse is hottest on titan-01, but see runbooks/wrong.md."), + ("Answer using the fact", "Latest metrics: namespace_cpu_top: synapse=95."), + ("Rewrite the draft to only include claims supported by FactsUsed", "synapse is hottest on titan-01."), + ("Check if an open-ended answer includes at least two concrete signals", '{"ok": false, "reason": "needs more detail"}'), + ("ok (bool), reason (string)", '{"ok": false, "reason": "needs more detail"}'), + ("Rewrite the answer using the critique", "synapse is hottest at cpu 95 on titan-01. 
Use runbooks/fix.md."), + ("Return JSON with field: note", '{"note":"The answer would benefit from per-pod CPU samples."}'), + ("Score response quality", '{"confidence":88,"relevance":91,"satisfaction":86,"hallucination_risk":"low"}'), + ( + "Return JSON with fields: confidence (0-100), relevance (0-100), satisfaction (0-100), hallucination_risk (low|medium|high).", + '{"confidence":88,"relevance":91,"satisfaction":86,"hallucination_risk":"low"}', + ), + ( + "claims list", + '{"claims":[{"id":"c1","claim":"synapse is hottest","evidence":[{"path":"hottest.cpu.node","reason":"snapshot"}]}]}', + ), + ("Select the claims most relevant", '{"claim_ids":["c1"]}'), + ("Follow-up:", "titan-99 is still hottest."), + ("Rewrite the answer to be concise and directly answer the question", "Latest metrics: namespace_cpu_top: synapse=95."), + ("Deduplicate repeated statements", "Latest metrics: namespace_cpu_top: synapse=95."), + ("Answer using only the Fact Sheet", "Fact sheet answer: namespace_cpu_top: synapse=95. Use runbooks/fix.md."), + ] + for marker, response in markers: + if marker in prompt: + return response + return None + + +class TimeoutLLM: + """Raise a timeout as soon as the workflow makes an LLM call.""" + + async def chat( + self, + messages: list[dict[str, str]], + *, + model: str | None = None, + timeout_sec: float | None = None, + ) -> str: + """Trigger the workflow timeout handling branch.""" + + del messages, model, timeout_sec + raise TimeoutError("boom") + + +class LimitLLM(PromptLLM): + """Reuse prompt handling while allowing the workflow to hit call caps.""" + + +def _settings(tmp_path: Path, **overrides: Any): + """Build settings with an isolated claim-store path.""" + + return replace(build_test_settings(), state_db_path=str(tmp_path / "state.db"), **overrides) + + +def _make_engine(tmp_path: Path, llm: Any, **setting_overrides: Any) -> AnswerEngine: + """Construct a real engine with static snapshot and KB doubles.""" + + settings = _settings(tmp_path, **setting_overrides) + snapshot = StaticSnapshot(_rich_snapshot()) + kb = KnowledgeBase("") + kb.summary = lambda: "KB summary." 
# type: ignore[method-assign] + kb.runbook_titles = lambda limit=5: "Relevant runbooks:\n- Fix (runbooks/fix.md)" # type: ignore[method-assign] + kb.runbook_paths = lambda limit=10: ["runbooks/fix.md"] # type: ignore[method-assign] + kb.chunk_lines = lambda max_files=20, max_chars=6000: [ # type: ignore[method-assign] + "runbooks/fix.md", + "namespace_cpu_top: synapse=95", + "hardware_nodes: rpi5=(titan-01) | amd64=(titan-02)", + ] + return AnswerEngine(settings, llm, kb, snapshot) # type: ignore[arg-type] + + +def test_engine_helper_methods_cover_state_and_followup(tmp_path: Path) -> None: + """Cover answer-engine helper branches outside the main workflow.""" + + settings = _settings(tmp_path) + + class StockLLM: + async def chat(self, messages, *, model=None, timeout_sec=None): + del messages, model, timeout_sec + return "stock reply" + + engine = EngineClass(settings, StockLLM(), KnowledgeBase(""), StaticSnapshot(_rich_snapshot())) + + async def call_llm(_system: str, _prompt: str, *, context: str | None = None, model: str | None = None, tag: str = "") -> str: + del _system, context, model + static = { + "draft_select": '{"best": 2}', + "score": '{"confidence":90,"relevance":91,"satisfaction":92,"hallucination_risk":"low"}', + "claim_map": '{"claims":[{"id":"c1","claim":"cpu is high","evidence":[{"path":"hottest.cpu.node","reason":"why"},{"path":"","reason":"skip"}]},"bad"]}', + "select_claims": '{"claim_ids":["c1"]}', + "followup": "titan-99 is hottest. The draft is correct.", + "followup_fix": "titan-01 is hottest.", + "dedup_followup": "The draft is correct. titan-01 is hottest.", + "dedup": "deduped", + } + if tag == "synth": + return "draft one" if "DraftIndex: 1" in _prompt else "draft two" + if tag in static: + return static[tag] + raise AssertionError(tag) + + stock = asyncio.run(engine._answer_stock("hello")) + assert stock.reply == "stock reply" + + plan = replace(_mode_plan(settings, "smart"), drafts=2, parallelism=2) + synth = asyncio.run( + engine._synthesize_answer( + "Which node is hottest?", + ["draft one", "draft two"], + "ctx", + {"question_type": "metric", "answer_style": "direct"}, + plan, + call_llm, + ) + ) + synth_empty = asyncio.run( + engine._synthesize_answer( + "Which node is hottest?", + [], + "ctx", + {"question_type": "metric", "answer_style": "direct"}, + replace(plan, drafts=1, parallelism=1), + call_llm, + ) + ) + assert synth == "draft two" + assert synth_empty == "draft two" + + scored = asyncio.run(engine._score_answer("q", "a", plan, call_llm)) + assert scored.hallucination_risk == "low" + assert asyncio.run(engine._score_answer("q", "a", replace(plan, use_scores=False), call_llm)).confidence == 60 + + summary = build_summary(_rich_snapshot()) + claims = asyncio.run(engine._extract_claims("q", "a", summary, ["fact"], call_llm)) + assert claims and claims[0].evidence[0].path == "hottest.cpu.node" + assert asyncio.run(engine._extract_claims("q", "", summary, [], call_llm)) == [] + assert asyncio.run(engine._dedup_reply("one. one. 
one.", plan, call_llm, "dedup")) == "deduped" + assert asyncio.run(engine._dedup_reply("single answer", plan, call_llm, "dedup")) == "single answer" + + engine._store_state("conv-1", claims, summary, _rich_snapshot(), True) + state = engine._get_state("conv-1") + assert state and state.snapshot + assert engine._get_state(None) is None + engine._cleanup_state() + + followup = asyncio.run( + engine._answer_followup( + "Which hardware hotspot is there?", + state, + summary, + {"question_type": "diagnostic"}, + plan, + call_llm, + ) + ) + assert "titan-01" in followup + assert asyncio.run(engine._select_claims("what about that?", claims, plan, call_llm)) == ["c1"] + assert asyncio.run(engine._select_claims("what about that?", [], plan, call_llm)) == [] + + +def test_finalize_answer_covers_post_processing_branches(tmp_path: Path) -> None: + """Exercise evidence-fix, runbook, guard, critic, and gap paths.""" + + settings = _settings(tmp_path) + plan = replace(_mode_plan(settings, "smart"), use_gap=True, use_critic=True) + summary = build_summary(_rich_snapshot()) + summary_lines = [ + "namespace_cpu_top: synapse=95", + "hardware_nodes: rpi5=(titan-01) | amd64=(titan-02)", + "runbooks/fix.md", + ] + observed: list[tuple[str, str]] = [] + + async def call_llm(_system: str, _prompt: str, *, context: str | None = None, model: str | None = None, tag: str = "") -> str: + del _system, context, model + responses = { + "runbook_select": '{"path":"runbooks/fix.md"}', + "evidence_fix": "titan-99 is hottest and see runbooks/wrong.md.", + "evidence_fix_enforce": "titan-99 is hottest and see runbooks/wrong.md.", + "metric_direct": "no numbers here", + "runbook_enforce": "Non-Raspberry Pi nodes: amd64 (titan-02). Use runbooks/fix.md.", + "evidence_guard": "Non-Raspberry Pi nodes: amd64 (titan-02). Use runbooks/fix.md.", + "focus_fix": "Latest metrics: namespace_cpu_top: synapse=95.", + "insight_guard": '{"ok": false, "reason": "needs more detail"}', + "insight_fix": "Latest metrics: namespace_cpu_top: synapse=95. Use runbooks/fix.md.", + "critic": '{"issues":["too vague"]}', + "revise": "Latest metrics: namespace_cpu_top: synapse=95. Use runbooks/fix.md.", + "gap": '{"note":"The answer would benefit from per-pod CPU samples."}', + } + if tag not in responses: + raise AssertionError(_prompt) + return responses[tag] + + class FinalizeEngine: + async def _synthesize_answer(self, *args: Any) -> str: + return "titan-99 is hottest and see runbooks/wrong.md." 
+ + async def _dedup_reply(self, reply: str, _plan: ModePlan, _call_llm, tag: str) -> str: + assert tag == "dedup" + return reply + + async def _score_answer(self, _question: str, _reply: str, _plan: ModePlan, _call_llm) -> AnswerScores: + return AnswerScores(80, 81, 82, "low") + + async def _extract_claims(self, _question: str, _reply: str, _summary: dict[str, Any], _facts_used: list[str], _call_llm) -> list[ClaimItem]: + return [ClaimItem(id="c1", claim="cpu high", evidence=[EvidenceItem(path="hottest.cpu.node", reason="snapshot")])] + + reply, scores, claims = asyncio.run( + finalize_answer( + engine=FinalizeEngine(), + call_llm=call_llm, + normalized="Which namespace is hottest on raspberry hardware and which runbook should I use?", + subanswers=["synapse is hottest"], + context="ctx", + classify={"question_type": "open_ended", "answer_style": "direct"}, + plan=plan, + summary=summary, + summary_lines=summary_lines, + metric_facts=["namespace_cpu_top: synapse=95"], + key_facts=["namespace_cpu_top: synapse=95"], + facts_used=["hardware_nodes: rpi5=(titan-01) | amd64=(titan-02)"], + allowed_nodes=["titan-01", "titan-02"], + allowed_namespaces=["synapse"], + runbook_paths=["runbooks/fix.md"], + lowered_question="which namespace is hottest on raspberry hardware and which runbook should i use?", + force_metric=True, + keyword_tokens=["namespace", "cpu", "raspberry"], + question_tokens=["namespace", "cpu", "raspberry"], + snapshot_context="ClusterSnapshot:\nnamespace_cpu_top: synapse=95", + observer=lambda stage, note: observed.append((stage, note)), + mode="smart", + metric_keys=["namespace_cpu_top"], + ) + ) + assert "runbooks/fix.md" in reply + assert "synapse=95" in reply + assert scores.confidence == 80 + assert claims and claims[0].id == "c1" + assert ("evidence_fix", "repairing missing evidence") in observed + assert ("critic", "reviewing") in observed + assert ("gap", "checking gaps") in observed + + +def test_run_answer_deep_workflow_persists_state(tmp_path: Path) -> None: + """Drive the full smart workflow through retrieval, synthesis, and post-processing.""" + + engine = _make_engine(tmp_path, PromptLLM()) + observed: list[tuple[str, str]] = [] + result = asyncio.run( + run_answer( + engine, + "Run limitless Which namespace is hottest on raspberry hardware and which runbook should I use?", + mode="smart", + history=[{"q": "before", "a": "earlier"}], + observer=lambda stage, note: observed.append((stage, note)), + conversation_id="room-1", + snapshot_pin=True, + ) + ) + assert "runbooks/fix.md" in result.reply + assert result.meta["tool_hint"]["command"] == "kubectl top pods -n synapse" + state = engine._get_state("room-1") + assert state and state.claims and state.snapshot + stages = {stage for stage, _note in observed} + assert {"normalize", "route", "retrieve", "tool", "subanswers", "synthesize"} <= stages + + +def test_run_answer_followup_and_limits(tmp_path: Path) -> None: + """Cover follow-up routing, reasoning limit, and timeout fallbacks.""" + + class FollowupLLM(PromptLLM): + def _lookup_response(self, system: str, prompt: str) -> str | None: + if "normalized (string), keywords" in prompt: + return '{"normalized":"What about that?","keywords":["that"]}' + if "needs_snapshot (bool)" in prompt: + return '{"needs_snapshot":true,"needs_kb":false,"needs_tool":false,"answer_style":"direct","follow_up":false,"question_type":"open_ended","focus_entity":"unknown","focus_metric":"unknown"}' + if "Select the claims most relevant" in prompt: + return '{"claim_ids":["c1"]}' + if 
"Follow-up:" in prompt: + return "titan-99 is still hottest." + return super()._lookup_response(system, prompt) + + engine = _make_engine(tmp_path, FollowupLLM()) + summary = build_summary(_rich_snapshot()) + engine._store_state( + "conv-1", + [ClaimItem(id="c1", claim="synapse is hottest", evidence=[EvidenceItem(path="hottest.cpu.node", reason="snapshot", value_at_claim="titan-01")])], + summary, + _rich_snapshot(), + True, + ) + followup = asyncio.run( + run_answer( + engine, + "Run limitless What about that?", + mode="smart", + conversation_id="conv-1", + snapshot_pin=True, + ) + ) + assert "titan-01" in followup.reply + + limit_engine = _make_engine( + tmp_path / "limit", + LimitLLM(), + fast_llm_calls_max=1, + llm_limit_multiplier=1.0, + ) + limited = asyncio.run(run_answer(limit_engine, "tell me about cpu and runbooks", mode="custom")) + assert "reasoning limit" in limited.reply + assert limited.meta["llm_limit_hit"] is True + + timeout_engine = _make_engine( + tmp_path / "timeout", + TimeoutLLM(), + smart_time_budget_sec=0.1, + ollama_timeout_sec=0.1, + ) + timed_out = asyncio.run(run_answer(timeout_engine, "Run limitless tell me about cpu and runbooks", mode="smart")) + assert "time budget" in timed_out.reply.lower() + assert timed_out.meta["time_budget_hit"] is True + + +def test_api_matrix_queue_main_and_store_edge_paths(monkeypatch: pytest.MonkeyPatch, tmp_path: Path) -> None: + """Exercise remaining API, Matrix, queue, main, and store branches.""" + + settings = _settings( + tmp_path, + internal_token="secret", + queue_enabled=True, + matrix_bots=(MatrixBotConfig("bot", "pw", ("atlas",), "quick"),), + ) + + async def handler( + question: str, + mode: str, + history: list[dict[str, str]] | None, + conversation_id: str | None, + snapshot_pin: bool | None, + ) -> AnswerResult: + del history, conversation_id, snapshot_pin + return AnswerResult(question + ":" + mode, AnswerScores(1, 2, 3, "low"), {"mode": mode}) + + api = Api(settings, handler) + from fastapi.testclient import TestClient + + client = TestClient(api.app) + assert client.post("/v1/answer", headers={"X-Internal-Token": "secret"}, json={}).status_code == 400 + assert client.post("/v1/answer", headers={"X-Internal-Token": "secret"}, json={"content": "hi"}).json()["reply"] == "hi:quick" + assert client.post("/v1/answer", headers={"X-Internal-Token": "secret"}, json={"question": " "}).status_code == 400 + assert AnswerRequest(message=" hello ").message == " hello " + + class FakeResp: + def __init__(self, payload: dict[str, Any], *, status_code: int = 200) -> None: + self._payload = payload + self.status_code = status_code + + def raise_for_status(self) -> None: + if self.status_code >= 400: + raise httpx.HTTPStatusError("bad", request=httpx.Request("GET", "http://x"), response=httpx.Response(self.status_code)) + + def json(self) -> dict[str, Any]: + return self._payload + + class MatrixAsyncClient: + async def __aenter__(self) -> "MatrixAsyncClient": + return self + + async def __aexit__(self, *exc: object) -> None: + return None + + async def post(self, url: str, json: dict[str, Any] | None = None, headers: dict[str, str] | None = None) -> FakeResp: + del json, headers + if "login" in url: + return FakeResp({"access_token": "tok"}) + return FakeResp({}) + + async def get(self, url: str, headers: dict[str, str] | None = None, params: dict[str, Any] | None = None) -> FakeResp: + del headers, params + if "directory/room" in url: + return FakeResp({}, status_code=404) + return FakeResp({"next_batch": "n1", "rooms": {"join": 
{}}}) + + monkeypatch.setattr("atlasbot.matrix.bot.httpx.AsyncClient", lambda timeout=None: MatrixAsyncClient()) + matrix_client = MatrixClient(settings, settings.matrix_bots[0]) + assert asyncio.run(matrix_client.login()) == "tok" + assert asyncio.run(matrix_client.resolve_room("tok")) == "" + + bot = MatrixBot(settings, settings.matrix_bots[0], SimpleNamespace(answer=None), handler) + + class BotClient: + def __init__(self) -> None: + self.sent: list[str] = [] + self.sync_calls = 0 + + async def login(self) -> str: + return "tok" + + async def resolve_room(self, token: str) -> str: + del token + return "!room" + + async def join_room(self, token: str, room_id: str) -> None: + del token, room_id + + async def send_message(self, token: str, room_id: str, text: str) -> None: + del token, room_id + self.sent.append(text) + + async def sync(self, token: str, since: str | None) -> dict[str, Any]: + del token, since + self.sync_calls += 1 + if self.sync_calls == 1: + return { + "next_batch": "n1", + "rooms": { + "join": { + "!room": { + "timeline": { + "events": [ + {"type": "m.room.member", "sender": "user"}, + {"type": "m.room.message", "sender": "bot", "content": {"body": "ignore"}}, + {"type": "m.room.message", "sender": "user", "content": {"body": "atlas quick hi"}}, + ] + } + } + } + }, + } + raise RuntimeError("stop") + + bot._client = BotClient() + async def run_bot_once() -> None: + task = asyncio.create_task(bot.run()) + await asyncio.sleep(0.01) + task.cancel() + with pytest.raises(asyncio.CancelledError): + await task + + asyncio.run(run_bot_once()) + assert any("Thinking" in msg for msg in bot._client.sent) + + timeout_bot = MatrixBot(replace(settings, thinking_interval_sec=0.001, quick_time_budget_sec=0.01), settings.matrix_bots[0], SimpleNamespace(answer=None), None) + timeout_bot._client = SimpleNamespace( + sent=[], + send_message=lambda token, room_id, text: asyncio.sleep(0, result=timeout_bot._client.sent.append(text)), + ) + + async def sleepy_handler(question: str, mode: str, history, conversation_id, observer): + del question, mode, history, conversation_id, observer + await asyncio.sleep(1.2) + return AnswerResult("late", AnswerScores(1, 2, 3, "low"), {}) + + timeout_bot._answer_handler = sleepy_handler + asyncio.run(timeout_bot._answer_with_heartbeat("tok", "!room", "q", "quick")) + assert any("time budget" in msg for msg in timeout_bot._client.sent) + + error_bot = MatrixBot(replace(settings, thinking_interval_sec=0.001), settings.matrix_bots[0], SimpleNamespace(answer=None), None) + error_bot._client = SimpleNamespace( + sent=[], + send_message=lambda token, room_id, text: asyncio.sleep(0, result=error_bot._client.sent.append(text)), + ) + + async def failing_handler(question: str, mode: str, history, conversation_id, observer): + del question, mode, history, conversation_id, observer + raise RuntimeError("boom") + + error_bot._answer_handler = failing_handler + asyncio.run(error_bot._answer_with_heartbeat("tok", "!room", "q", "smart")) + assert any("internal error" in msg for msg in error_bot._client.sent) + + class DirectQueue: + async def __call__(self, payload: dict[str, Any]) -> dict[str, Any]: + return {"reply": payload["question"]} + + direct_qm = QueueManager(replace(settings, queue_enabled=False), DirectQueue()) + assert asyncio.run(direct_qm.submit({"question": "direct"})) == {"reply": "direct"} + + class FakeSub: + async def next_msg(self, timeout: float) -> Any: + del timeout + return SimpleNamespace(data=json.dumps({"reply": "queued"}).encode()) + + async 
def unsubscribe(self) -> None: + return None + + class FakeMsg: + def __init__(self, raw: bytes, reply: str = "reply") -> None: + self.data = raw + self.reply = reply + self.acked = False + + async def ack(self) -> None: + self.acked = True + + published: list[tuple[str, bytes]] = [] + + class ExistingStreamJS: + async def stream_info(self, stream: str) -> None: + assert stream == settings.nats_stream + + async def publish(self, subject: str, data: bytes) -> None: + published.append((subject, data)) + + async def pull_subscribe(self, subject: str, durable: str): + del subject, durable + + class Pull: + def __init__(self) -> None: + self.calls = 0 + + async def fetch(self, count: int, timeout: float) -> list[FakeMsg]: + del count, timeout + self.calls += 1 + if self.calls == 1: + raise RuntimeError("retry") + raise asyncio.CancelledError + + return Pull() + + class FakeNats: + def __init__(self) -> None: + self.drained = False + + async def connect(self, url: str) -> None: + assert url == settings.nats_url + + def jetstream(self) -> ExistingStreamJS: + return ExistingStreamJS() + + def new_inbox(self) -> str: + return "inbox" + + async def subscribe(self, reply: str) -> FakeSub: + assert reply == "inbox" + return FakeSub() + + async def publish(self, reply: str, data: bytes) -> None: + published.append((reply, data)) + + async def drain(self) -> None: + self.drained = True + + monkeypatch.setattr("atlasbot.queue.nats.NATS", FakeNats) + queue = QueueManager(settings, DirectQueue()) + asyncio.run(queue.start()) + assert asyncio.run(queue.submit({"question": "queued", "mode": "smart"})) == {"reply": "queued"} + + invalid_msg = FakeMsg(b"not-json") + asyncio.run(queue._handle_message(invalid_msg)) + assert invalid_msg.acked is True + handled_msg = FakeMsg(json.dumps({"payload": {"question": "x"}, "reply": "reply"}).encode()) + asyncio.run(queue._handle_message(handled_msg)) + assert handled_msg.acked is True + failing_queue = QueueManager(settings, lambda payload: (_ for _ in ()).throw(RuntimeError("boom"))) + failing_queue._nc = FakeNats() + failing_queue._js = ExistingStreamJS() + failure_msg = FakeMsg(json.dumps({"payload": {"question": "x"}}).encode()) + + async def failing_handler(payload: dict[str, Any]) -> dict[str, Any]: + del payload + raise RuntimeError("boom") + + failing_queue._handler = failing_handler + asyncio.run(failing_queue._handle_message(failure_msg)) + assert failure_msg.acked is True + asyncio.run(queue.stop()) + + assert result_scores({"scores": {"confidence": "9", "relevance": "8", "satisfaction": "7", "hallucination_risk": "low"}}).confidence == 9 + assert result_scores({"scores": "bad"}).confidence == 60 + + +def test_kb_llm_snapshot_and_json_edge_paths(monkeypatch: pytest.MonkeyPatch, tmp_path: Path) -> None: + """Cover remaining KB, LLM, snapshot, and JSON parsing branches.""" + + base = tmp_path / "kb" + catalog = base / "catalog" + catalog.mkdir(parents=True) + (catalog / "atlas.json").write_text(json.dumps({"cluster": "atlas", "sources": ["bad"]}), encoding="utf-8") + (catalog / "runbooks.json").write_text(json.dumps([{"title": "Fix", "path": "runbooks/fix.md"}, {"title": "No path"}]), encoding="utf-8") + (base / "docs.md").write_text("x" * 120, encoding="utf-8") + kb = KnowledgeBase(str(base)) + assert kb.runbook_titles(limit=1).count("runbooks/fix.md") == 1 + assert kb.chunk_lines(max_files=1, max_chars=60) + assert kb._extend_with_limit([], ["abcdef"], 3) is False + + empty_kb = KnowledgeBase("") + assert empty_kb.chunk_lines() == [] + + settings = 
_settings(tmp_path, ollama_url="http://example/api/chat", ollama_api_key="secret", ollama_retries=0, ollama_fallback_model="") + client = LLMClient(settings) + assert client._endpoint() == "http://example/api/chat" + assert client._headers["x-api-key"] == "secret" + assert parse_json("```{\"ok\": true}```") == {"ok": True} + assert parse_json("not-json", fallback={"fallback": True}) == {"fallback": True} + + class FakeResponse: + def __init__(self, status_code: int, payload: Any) -> None: + self.status_code = status_code + self._payload = payload + + def raise_for_status(self) -> None: + if self.status_code >= 400: + raise httpx.HTTPStatusError("bad", request=httpx.Request("POST", "http://example"), response=httpx.Response(self.status_code)) + + def json(self) -> Any: + return self._payload + + responses = iter([FakeResponse(200, {"response": "plain"}), FakeResponse(200, {"reply": "fallback"}), FakeResponse(200, {"message": {}})]) + + class FakeAsyncClient: + def __init__(self, timeout: float | None = None) -> None: + self.timeout = timeout + + async def __aenter__(self) -> "FakeAsyncClient": + return self + + async def __aexit__(self, *exc: object) -> None: + return None + + async def post(self, _url: str, *, json: dict[str, Any], headers: dict[str, str]) -> FakeResponse: + del _url, json, headers + item = next(responses) + if isinstance(item, Exception): + raise item + return item + + monkeypatch.setattr(httpx, "AsyncClient", FakeAsyncClient) + assert asyncio.run(client.chat([{"role": "user", "content": "a"}], timeout_sec=1.0)) == "plain" + assert asyncio.run(client.chat([{"role": "user", "content": "b"}], timeout_sec=1.0)) == "fallback" + with pytest.raises(LLMError, match="empty response"): + asyncio.run(client.chat([{"role": "user", "content": "c"}], timeout_sec=1.0)) + error_settings = replace(settings, ollama_retries=1) + error_client = LLMClient(error_settings) + error_responses = iter([httpx.ConnectError("nope"), httpx.ConnectError("still nope")]) + + class ErrorAsyncClient(FakeAsyncClient): + async def post(self, _url: str, *, json: dict[str, Any], headers: dict[str, str]) -> FakeResponse: + del _url, json, headers + raise next(error_responses) + + monkeypatch.setattr(httpx, "AsyncClient", ErrorAsyncClient) + with pytest.raises(LLMError): + asyncio.run(error_client.chat([{"role": "user", "content": "d"}], timeout_sec=1.0)) + + provider = SnapshotProvider(replace(settings, ariadne_state_url="http://snapshot", ariadne_state_token="tok")) + + class SnapshotResp: + def raise_for_status(self) -> None: + return None + + def json(self) -> dict[str, Any]: + return {"snapshot_id": "snap-1"} + + monkeypatch.setattr("atlasbot.snapshot.builder.httpx.get", lambda url, headers, timeout: SnapshotResp()) + assert provider.get() == {"snapshot_id": "snap-1"} + provider._cache = {"snapshot_id": "cached"} + provider._cache_ts = 10_000.0 + monkeypatch.setattr("atlasbot.snapshot.builder.time.monotonic", lambda: 10_001.0) + assert provider.get() == {"snapshot_id": "cached"} diff --git a/tests/test_split_helper_coverage.py b/tests/test_split_helper_coverage.py new file mode 100644 index 0000000..cb4dd9a --- /dev/null +++ b/tests/test_split_helper_coverage.py @@ -0,0 +1,1749 @@ +"""Targeted coverage tests for Atlasbot's split helper modules.""" + +from __future__ import annotations + +import asyncio +import json +import runpy +from dataclasses import replace +from pathlib import Path +from types import SimpleNamespace +from typing import Any + +import httpx +import pytest + +from atlasbot.config import 
MatrixBotConfig +from atlasbot.engine.answerer import common as answer_common +from atlasbot.engine.answerer import engine as answer_engine +from atlasbot.engine.answerer import factsheet as answer_factsheet +from atlasbot.engine.answerer import post as answer_post +from atlasbot.engine.answerer import post_ext as answer_post_ext +from atlasbot.engine.answerer import retrieval as answer_retrieval +from atlasbot.engine.answerer import retrieval_ext as answer_retrieval_ext +from atlasbot.engine.answerer import spine as answer_spine +from atlasbot.engine.answerer import workflow as answer_workflow +from atlasbot.engine.answerer import workflow_post as answer_workflow_post +from atlasbot.engine.answerer._base import ( + AnswerResult, + AnswerScores, + ClaimItem, + ContradictionContext, + EvidenceItem, + InsightGuardInput, +) +from atlasbot.knowledge.loader import KnowledgeBase +from atlasbot.llm.client import LLMClient, LLMError +from atlasbot.main import _build_engine, result_scores +from atlasbot.matrix.bot import MatrixBot, MatrixClient, _extract_mode, _mode_timeout_sec +from atlasbot.snapshot.builder import SnapshotProvider, core_a, format_a, format_b, format_c +from testing.fakes import build_test_settings + + +class ScriptedCall: + """Return canned async responses keyed by tag.""" + + def __init__(self, responses: dict[str, Any]) -> None: + self._responses = { + key: list(value) if isinstance(value, list) else value for key, value in responses.items() + } + self.calls: list[str] = [] + + async def __call__( + self, + _system: str, + _prompt: str, + *, + context: str | None = None, + model: str | None = None, + tag: str = "", + ) -> str: + del context, model + self.calls.append(tag) + value = self._responses.get(tag, "{}") + if isinstance(value, list): + if not value: + return "{}" + item = value.pop(0) + return str(item) + return str(value) + + +def test_knowledge_base_private_paths(tmp_path: Path, monkeypatch: pytest.MonkeyPatch) -> None: + """Cover runbook, catalog, and file-scanning edge branches.""" + + base = tmp_path / "kb" + catalog = base / "catalog" + catalog.mkdir(parents=True) + (catalog / "atlas.json").write_text(json.dumps({"cluster": "atlas", "sources": []}), encoding="utf-8") + (catalog / "runbooks.json").write_text( + json.dumps([{"title": "Good", "path": "runbooks/good.md"}, {"title": "MissingPath"}, "bad-entry"]), + encoding="utf-8", + ) + (base / "notes.md").write_text("alpha\nbeta", encoding="utf-8") + (base / "empty.txt").write_text("", encoding="utf-8") + (base / "bad.md").write_text("boom", encoding="utf-8") + + kb = KnowledgeBase(str(base)) + assert kb.runbook_titles(limit=1) == "Relevant runbooks:\n- Good (runbooks/good.md)" + assert kb.runbook_paths(limit=5) == ["runbooks/good.md"] + assert kb.chunk_lines(max_files=1, max_chars=120) + + lines: list[str] = [] + kb._runbooks = [{"title": "Good", "path": "runbooks/good.md"}, "bad-entry"] # type: ignore[assignment] + kb._append_runbooks(lines) + assert "KB: runbooks.json" in lines + assert "- Good (runbooks/good.md)" in lines + + monkeypatch.setattr("atlasbot.knowledge.loader.json.dumps", lambda *_args, **_kwargs: (_ for _ in ()).throw(RuntimeError("nope"))) + before = list(lines) + kb._append_catalog(lines, max_chars=999) + assert lines == before + + original_read_text = Path.read_text + + def fake_read_text(self: Path, *args: Any, **kwargs: Any) -> str: + if self.name == "bad.md": + raise OSError("blocked") + return original_read_text(self, *args, **kwargs) + + monkeypatch.setattr(Path, "read_text", 
fake_read_text) + file_lines: list[str] = [] + kb._append_files(file_lines, max_files=1, max_chars=120) + assert any(line.startswith("KB File: notes.md") for line in file_lines) + + empty = KnowledgeBase("") + assert empty.chunk_lines() == [] + + +def test_knowledge_base_limit_and_break_paths(tmp_path: Path) -> None: + """Cover size-guard exits that only trigger near prompt limits.""" + + base = tmp_path / "kb" + catalog = base / "catalog" + catalog.mkdir(parents=True) + (catalog / "atlas.json").write_text(json.dumps({"cluster": "atlas"}), encoding="utf-8") + (base / "notes.md").write_text("alpha", encoding="utf-8") + + kb = KnowledgeBase(str(base)) + assert any(line.startswith("KB File: notes.md") for line in kb.chunk_lines(max_files=2, max_chars=500)) + + no_atlas_lines = ["seed"] + kb._atlas = None + kb._append_catalog(no_atlas_lines, max_chars=500) + assert no_atlas_lines == ["seed"] + + over_limit_lines = ["x" * 25] + kb._atlas = {"cluster": "atlas"} + kb._append_catalog(over_limit_lines, max_chars=10) + assert over_limit_lines == ["x" * 25] + + runbook_lines = ["seed"] + kb._runbooks = [] + kb._append_runbooks(runbook_lines) + assert runbook_lines == ["seed"] + + limit_lines = ["x" * 50] + kb._append_files(limit_lines, max_files=2, max_chars=20) + assert limit_lines == ["x" * 50] + + capped_lines = ["seed"] * 51 + kb._append_files(capped_lines, max_files=1, max_chars=1_000) + assert capped_lines == ["seed"] * 51 + + extend_lines: list[str] = [] + kb._append_files(extend_lines, max_files=5, max_chars=18) + assert extend_lines == ["KB File: notes.md"] + + +def test_llm_client_timeout_fallback_and_parse(monkeypatch: pytest.MonkeyPatch) -> None: + """Exercise timeout, fallback-model, and empty-response branches.""" + + settings = replace(build_test_settings(), ollama_url="http://ollama/api/chat", ollama_api_key="secret") + client = LLMClient(settings) + assert client._endpoint() == "http://ollama/api/chat" + assert client._headers["x-api-key"] == "secret" + + with pytest.raises(LLMError, match="timeout"): + asyncio.run(client.chat([{"role": "user", "content": "hi"}], timeout_sec=0.0)) + + class FakeResponse: + def __init__(self, status_code: int, payload: dict[str, Any]): + self.status_code = status_code + self._payload = payload + + def raise_for_status(self) -> None: + if self.status_code >= 400: + raise httpx.HTTPStatusError( + "bad", + request=httpx.Request("POST", "http://ollama"), + response=httpx.Response(self.status_code), + ) + + def json(self) -> dict[str, Any]: + return self._payload + + calls: list[str] = [] + + class FallbackClient: + def __init__(self, timeout: float | None = None) -> None: + self.timeout = timeout + + async def __aenter__(self) -> "FallbackClient": + return self + + async def __aexit__(self, *exc: object) -> None: + return None + + async def post(self, _url: str, *, json: dict[str, Any], headers: dict[str, str]) -> FakeResponse: + calls.append(json["model"]) + assert headers["Content-Type"] == "application/json" + if json["model"] == "base": + return FakeResponse(404, {}) + return FakeResponse(200, {"message": {"content": "ok"}}) + + monkeypatch.setattr(httpx, "AsyncClient", FallbackClient) + fallback_client = LLMClient(replace(settings, ollama_model="base", ollama_fallback_model="backup", ollama_retries=1)) + assert asyncio.run(fallback_client.chat([{"role": "user", "content": "hello"}])) == "ok" + assert calls == ["base", "backup"] + + class EmptyClient(FallbackClient): + async def post(self, _url: str, *, json: dict[str, Any], headers: dict[str, str]) 
-> FakeResponse: + del json, headers + return FakeResponse(200, {}) + + monkeypatch.setattr(httpx, "AsyncClient", EmptyClient) + with pytest.raises(LLMError, match="empty response"): + asyncio.run(LLMClient(settings).chat([{"role": "user", "content": "hello"}])) + + +def test_llm_client_deadline_and_exhausted_fallback(monkeypatch: pytest.MonkeyPatch) -> None: + """Cover the timeout-after-error and retry-exhausted fallback edges.""" + + settings = replace(build_test_settings(), ollama_url="http://ollama") + + class TimeoutClient: + def __init__(self, timeout: float | None = None) -> None: + self.timeout = timeout + + async def __aenter__(self) -> "TimeoutClient": + return self + + async def __aexit__(self, *exc: object) -> None: + return None + + async def post(self, _url: str, *, json: dict[str, Any], headers: dict[str, str]) -> str: + del _url, json, headers + raise RuntimeError("boom") + + moments = iter((100.0, 100.0, 100.2)) + with monkeypatch.context() as local_patch: + local_patch.setattr(httpx, "AsyncClient", TimeoutClient) + local_patch.setattr("atlasbot.llm.client.time", SimpleNamespace(monotonic=lambda: next(moments))) + with pytest.raises(LLMError, match="timeout"): + asyncio.run(LLMClient(replace(settings, ollama_retries=0)).chat([{"role": "user", "content": "late"}], timeout_sec=0.1)) + + class FallbackResponse: + status_code = 404 + + def raise_for_status(self) -> None: + return None + + def json(self) -> dict[str, str]: + return {} + + class FallbackOnlyClient(TimeoutClient): + async def post(self, _url: str, *, json: dict[str, Any], headers: dict[str, str]) -> FallbackResponse: + del _url, json, headers + return FallbackResponse() + + monkeypatch.setattr(httpx, "AsyncClient", FallbackOnlyClient) + with pytest.raises(LLMError, match="ollama retries exhausted"): + asyncio.run( + LLMClient(replace(settings, ollama_model="base", ollama_fallback_model="backup", ollama_retries=0)).chat( + [{"role": "user", "content": "fallback"}], + timeout_sec=1.0, + ) + ) + + +def test_result_scores_and_build_engine(tmp_path: Path) -> None: + """Cover score coercion fallbacks and engine construction.""" + + settings = replace(build_test_settings(), kb_dir="", state_db_path=str(tmp_path / "state.db")) + engine = _build_engine(settings) + assert isinstance(engine, answer_engine.AnswerEngine) + + good = result_scores({"scores": {"confidence": 91, "relevance": "88", "satisfaction": 77.1, "hallucination_risk": "low"}}) + assert good.confidence == 91 + assert result_scores({"scores": {"confidence": "broken"}}).confidence == 60 + assert result_scores("bad-payload").hallucination_risk == "medium" # type: ignore[arg-type] + + +def test_main_module_script_entrypoint(monkeypatch: pytest.MonkeyPatch) -> None: + """Cover the `python -m atlasbot.main` entrypoint without booting services.""" + + class StopMain(RuntimeError): + """Stop the module after the entrypoint invokes asyncio.run.""" + + def fake_run(coro: Any) -> None: + coro.close() + raise StopMain("stop") + + monkeypatch.setattr(asyncio, "run", fake_run) + with pytest.raises(StopMain, match="stop"): + runpy.run_module("atlasbot.main", run_name="__main__") + + +def test_matrix_client_and_bot_error_paths(monkeypatch: pytest.MonkeyPatch) -> None: + """Cover Matrix error handling, ignored events, and mode extraction branches.""" + + settings = replace(build_test_settings(), matrix_base="http://matrix", auth_base="http://auth", room_alias="#atlas:example") + bot_cfg = MatrixBotConfig("atlasbot", "pw", ("atlas", "atlas-smart"), "quick") + + class 
ErrorClient: + def __init__(self, timeout: float | None = None) -> None: + self.timeout = timeout + + async def __aenter__(self) -> "ErrorClient": + return self + + async def __aexit__(self, *exc: object) -> None: + return None + + async def post(self, *_args: Any, **_kwargs: Any) -> SimpleNamespace: + return SimpleNamespace(status_code=200, raise_for_status=lambda: None, json=lambda: {"access_token": "tok"}) + + async def get(self, url: str, **_kwargs: Any) -> SimpleNamespace: + if "directory/room" in url: + raise httpx.HTTPError("no room") + return SimpleNamespace(raise_for_status=lambda: None, json=lambda: {"next_batch": "n2"}) + + monkeypatch.setattr("atlasbot.matrix.bot.httpx.AsyncClient", ErrorClient) + client = MatrixClient(settings, bot_cfg) + assert asyncio.run(client.resolve_room("tok")) == "" + assert asyncio.run(client.sync("tok", "batch-1"))["next_batch"] == "n2" + + mode, cleaned = _extract_mode("Atlas-smart hello", ("atlas",), "") + assert mode == "smart" + assert cleaned == "-smart hello" + assert _mode_timeout_sec(settings, "genius") == settings.genius_time_budget_sec + + class FakeMatrixClient: + def __init__(self) -> None: + self.sent: list[str] = [] + self.login_calls = 0 + self.sync_calls = 0 + + async def login(self) -> str: + self.login_calls += 1 + raise RuntimeError("boot failed") + + async def resolve_room(self, token: str) -> str: + del token + return "" + + async def join_room(self, token: str, room_id: str) -> None: + del token, room_id + + async def send_message(self, token: str, room_id: str, text: str) -> None: + del token, room_id + self.sent.append(text) + + async def sync(self, token: str, since: str | None) -> dict[str, Any]: + del token, since + self.sync_calls += 1 + raise RuntimeError("sync failed") + + sleeps = {"count": 0} + + async def fake_sleep(_seconds: float) -> None: + sleeps["count"] += 1 + raise asyncio.CancelledError + + monkeypatch.setattr("atlasbot.matrix.bot.asyncio.sleep", fake_sleep) + bot = MatrixBot(settings, bot_cfg, SimpleNamespace(answer=None), None) + bot._client = FakeMatrixClient() + with pytest.raises(asyncio.CancelledError): + asyncio.run(bot.run()) + with pytest.raises(asyncio.CancelledError): + asyncio.run(bot._sync_loop("tok")) + assert sleeps["count"] >= 2 + + class SendOnlyClient(FakeMatrixClient): + async def login(self) -> str: + return "tok" + + async def handler(question: str, mode: str, history: list[dict[str, str]] | None, conversation_id: str | None, observer): + del history, conversation_id + if observer: + observer("phase", "working") + return AnswerResult(reply=f"{mode}:{question}", scores=AnswerScores(1, 2, 3, "low"), meta={}) + + bot2 = MatrixBot(replace(settings, thinking_interval_sec=0.001), bot_cfg, SimpleNamespace(answer=None), handler) + bot2._client = SendOnlyClient() + payload = { + "rooms": { + "join": { + "!room": { + "timeline": { + "events": [ + "junk", + {"type": "m.presence", "sender": "user", "content": {}}, + {"type": "m.room.message", "sender": "atlasbot", "content": {"body": "ignore self"}}, + {"type": "m.room.message", "sender": "user", "content": {"body": "atlas what is up?"}}, + ] + } + } + } + } + } + asyncio.run(bot2._handle_sync("tok", payload)) + assert any("Thinking" in item for item in bot2._client.sent) + + +def test_matrix_bot_timeout_variants() -> None: + """Cover smart and genius timeout messages separately.""" + + settings = build_test_settings() + bot_cfg = MatrixBotConfig("atlasbot", "pw", ("atlas", "atlas-smart"), "quick") + + async def sleepy_handler(question: str, mode: 
str, history, conversation_id, observer): + del question, mode, history, conversation_id, observer + await asyncio.sleep(1.2) + return AnswerResult("late", AnswerScores(1, 2, 3, "low"), {}) + + smart_bot = MatrixBot(replace(settings, thinking_interval_sec=0.001, smart_time_budget_sec=0.01), bot_cfg, SimpleNamespace(answer=None), sleepy_handler) + smart_bot._client = SimpleNamespace( + sent=[], + send_message=lambda token, room_id, text: asyncio.sleep(0, result=smart_bot._client.sent.append(text)), + ) + asyncio.run(smart_bot._answer_with_heartbeat("tok", "!room", "q", "smart")) + assert any("atlas-genius" in msg for msg in smart_bot._client.sent) + + genius_bot = MatrixBot(replace(settings, thinking_interval_sec=0.001, genius_time_budget_sec=0.01), bot_cfg, SimpleNamespace(answer=None), sleepy_handler) + genius_bot._client = SimpleNamespace( + sent=[], + send_message=lambda token, room_id, text: asyncio.sleep(0, result=genius_bot._client.sent.append(text)), + ) + asyncio.run(genius_bot._answer_with_heartbeat("tok", "!room", "q", "genius")) + assert any("ran out of time" in msg for msg in genius_bot._client.sent) + + +def test_answer_common_helper_paths() -> None: + """Cover common chunk-selection and scoring helpers.""" + + settings = replace(build_test_settings(), debug_pipeline=True) + meta = answer_common._build_meta("smart", 2, 5, True, False, 45.0, {"question_type": "metric"}, {"tool": "facts"}, started=0.0) + assert meta["llm_limit_hit"] is True + assert answer_common._llm_call_limit(settings, "smart") == settings.smart_llm_calls_max + assert answer_common._mode_time_budget(settings, "genius") == settings.genius_time_budget_sec + assert answer_common._select_subquestions([{"question": "A", "priority": "nope"}, {"question": "B", "priority": 3}], "fallback", 2) == ["B", "A"] + assert answer_common._chunk_lines(["a", "b", "c"], 2)[0]["summary"] == "a | b" + assert answer_common._raw_snapshot_chunks({"ok": 1, "bad": {1, 2}}) + assert answer_common._build_chunk_groups([{"id": "c1", "summary": "s1"}, {"id": "c2", "summary": "s2"}], 1) == [[{"id": "c1", "summary": "s1"}], [{"id": "c2", "summary": "s2"}]] + assert answer_common._merge_score_runs([{"a": 2.0}, {"a": 4.0, "b": 6.0}]) == {"a": 3.0, "b": 6.0} + + chunks = [{"id": "c1", "text": "atlas cpu 90", "summary": "cpu"}, {"id": "c2", "text": "storage okay", "summary": "storage"}] + ranked = answer_common._select_chunks(chunks, {"c1": 1.0, "c2": 0.5}, answer_common._mode_plan(settings, "smart"), ["storage"], ["c2"]) + assert ranked[0]["id"] == "c1" + assert any(item["id"] == "c2" for item in ranked) + assert answer_common._format_runbooks(["runbooks/fix.md"]).startswith("Relevant runbooks:") + + scripted = ScriptedCall( + { + "chunk_score": '[{"id":"c1","score":1},{"id":"c2","score":"2"}]', + "chunk_select": '{"selected_index": 5}', + } + ) + plan = replace(answer_common._mode_plan(settings, "smart"), score_retries=2, parallelism=2, chunk_group=1) + scores = asyncio.run(answer_common._score_chunks(scripted, chunks, "What is hot?", ["cpu?"], plan)) + assert scores["c1"] >= 0.0 + assert scores["c2"] >= 0.0 + best = asyncio.run( + answer_common._select_best_score_run( + scripted, + [{"id": "c1", "summary": "cpu"}], + [{"c1": 2.0}, {"c1": 8.0}], + answer_common.ScoreContext("q", ["sq"], 2, 2, True, "fast"), + ) + ) + assert best == {"c1": 2.0} + + +def test_answer_common_edge_branches() -> None: + """Cover low-frequency common helper branches and fallbacks.""" + + settings = build_test_settings() + plan = answer_common._mode_plan(settings, 
"smart") + + assert answer_common._strip_followup_meta("") == "" + assert answer_common._strip_followup_meta("Based on the context, Atlas is warm.") == "Atlas is warm." + assert answer_common._raw_snapshot_chunks(None) == [] + assert asyncio.run(answer_common._score_chunks(ScriptedCall({}), [], "q", [], plan)) == {} + + bad_scores = ScriptedCall({"chunk_score": '[{"id":"c1","score":"oops"},{"score":2},"bad"]'}) + ctx = answer_common.ScoreContext("q", ["sub"], 1, 1, False, "fast") + assert asyncio.run(answer_common._score_groups_serial(bad_scores, [[{"id": "c1", "summary": "one"}]], ctx)) == {"c1": 0.0} + + parallel_scores = ScriptedCall({"chunk_score": ['[{"id":"c1","score":1}]', '[{"id":"c2","score":2}]']}) + parallel = asyncio.run( + answer_common._score_groups_parallel( + parallel_scores, + [[{"id": "c1", "summary": "one"}], [{"id": "c2", "summary": "two"}]], + answer_common.ScoreContext("q", ["sub"], 1, 2, False, "fast"), + ) + ) + assert parallel == {"c1": 1.0, "c2": 2.0} + + selector = ScriptedCall({"chunk_select": ['{"selected_index":"bad"}', '{"selected_index":99}']}) + runs = [{"c1": 1.0}, {"c1": 9.0}] + assert asyncio.run(answer_common._select_best_score_run(selector, [{"id": "c1", "summary": "one"}], runs, ctx)) == {"c1": 1.0} + assert asyncio.run(answer_common._select_best_score_run(selector, [{"id": "c1", "summary": "one"}], runs, ctx)) == {"c1": 1.0} + + chunks = [{"id": "c1", "text": "cpu: 95"}, {"id": "c2", "text": "ram: 20"}] + assert answer_common._keyword_hits(chunks, chunks[0], ["", " "]) == [] + assert answer_common._select_chunks([], {}, plan) == [] + selected = [chunks[0]] + assert answer_common._append_must_chunks(chunks, selected, None, 2) is False + assert answer_common._append_keyword_chunks([], [], ["cpu"], 2) is False + answer_common._append_ranked_chunks(chunks, selected, 2) + assert selected == chunks + + +def test_answer_post_and_post_ext_helpers() -> None: + """Cover metric-formatting, entity filtering, and payload helpers.""" + + assert answer_post._merge_fact_lines(["a", "b"], ["b", "c"]) == ["a", "b", "c"] + assert answer_post._strip_unknown_entities("Node titan-99 is hot. Namespace foo is full. Safe.", ["titan-99"], ["foo"]) == "Safe." + assert answer_post._strip_unknown_entities("", ["x"], ["y"]) == "" + assert answer_post._needs_evidence_guard("titan-99 has pressure", ["nodes_total: 2"]) is True + assert answer_post._filter_lines_by_keywords(["cpu: 95", "ram: 20"], ["cpu"], 2) == ["cpu: 95"] + assert answer_post._select_metric_line(["nodes_total: 22", "cpu: 95"], "How many nodes?", {"nodes"}) == "nodes_total: 22" + assert answer_post._format_direct_metric_line("nodes: total=22, ready=21") == "Atlas has 22 total nodes (ready=21)." + assert answer_post._format_direct_metric_line("nodes_total=22") == "Atlas has 22 total nodes." 
+ assert answer_post._global_facts(["nodes_total: 2", "cluster_name: atlas", "other: x"]) + assert answer_post._has_keyword_overlap(["cpu: 95"], ["CPU"]) is True + assert answer_post._merge_tokens(["cpu"], ["ram"], ["cpu"]) == ["cpu", "ram"] + assert "atlas" in answer_post._extract_question_tokens("How is Atlas CPU load?") + assert "atlas" in answer_post._expand_tokens(["Atlas CPU"]) + assert answer_post._ensure_token_coverage(["cpu: 95"], ["cpu", "ram"], ["ram: 20"]) == ["ram: 20", "cpu: 95"] + assert answer_post._best_keyword_line(["cpu:95", "ram:20"], ["ram"]) == "ram:20" + assert answer_post._line_starting_with(["cpu:95"], "cpu:") == "cpu:95" + assert answer_post._non_rpi_nodes({"hardware_by_node": {"titan-01": "rpi5", "titan-02": "amd64"}}) == {"amd64": ["titan-02"]} + assert answer_post._format_hardware_groups({"amd64": ["titan-02"]}, "Non-Raspberry Pi nodes").startswith("Non-Raspberry Pi nodes:") + assert "Lexicon" in answer_post._lexicon_context({"lexicon": {"terms": [{"term": "Atlas", "meaning": "cluster"}], "aliases": {"pi": "rpi"}}}) + assert answer_post._parse_json_list("prefix [{\"id\": 1}, \"bad\"] suffix") == [{"id": 1}] + assert answer_post._scores_from_json({"confidence": "80"}).confidence == 80 + assert answer_post._default_scores().confidence == 60 + assert answer_post._style_hint({"answer_style": "insightful"}) == "insightful" + assert answer_post._needs_evidence_fix("No data available", {"needs_snapshot": True, "question_type": "metric"}) is True + assert answer_post._should_use_insight_guard({"answer_style": "insightful"}) is True + + guard_ok = ScriptedCall({"insight_guard": '{"ok": true}'}) + text = asyncio.run( + answer_post._apply_insight_guard( + InsightGuardInput("q", "reply", {"answer_style": "insightful"}, "ctx", answer_common._mode_plan(build_test_settings(), "smart"), guard_ok, ["cpu: 95"]) + ) + ) + assert text == "reply" + guard_fix = ScriptedCall({"insight_guard": '{"ok": false}', "insight_fix": "tightened"}) + assert ( + asyncio.run( + answer_post._apply_insight_guard( + InsightGuardInput("q", "reply", {"answer_style": "insightful"}, "ctx", answer_common._mode_plan(build_test_settings(), "smart"), guard_fix, ["cpu: 95"]) + ) + ) + == "tightened" + ) + + assert answer_post_ext._reply_matches_metric_facts("cpu 95", ["cpu: 95"], {"cpu"}) is True + assert answer_post_ext._reply_matches_metric_facts("no numbers", ["cpu: 95"], None) is False + assert answer_post_ext._needs_dedup("A. A. B.") is True + assert answer_post_ext._needs_dedup("Alpha. Alpha. Beta.") is True + assert answer_post_ext._needs_focus_fix("How many nodes?", "Based on the context, there are maybe some nodes. 
For more details...", {"question_type": "metric"}) is True + assert "atlas" in answer_post_ext._extract_keywords("What is Atlas now?", "Atlas", ["How many nodes?"], ["cpu"]) + assert answer_post_ext._allowed_nodes({"hardware_by_node": {"titan-01": "rpi5"}}) == ["titan-01"] + assert answer_post_ext._allowed_namespaces({"namespace_pods": [{"namespace": "synapse"}, "bad"]}) == ["synapse"] + assert answer_post_ext._find_unknown_nodes("titan-01 titan-99", ["titan-01"]) == ["titan-99"] + assert answer_post_ext._find_unknown_namespaces("namespace synapse namespace drift", ["synapse"]) == ["drift"] + assert answer_post_ext._needs_runbook_fix("See runbooks/nope.md", ["runbooks/yes.md"]) is True + assert answer_post_ext._needs_runbook_reference("where is the runbook?", ["runbooks/yes.md"], "") is True + assert answer_post_ext._best_runbook_match("runbooks/fixx.md", ["runbooks/fix.md"]) == "runbooks/fix.md" + assert answer_post_ext._resolve_path({"nodes": [{"name": "titan-01"}]}, "nodes[0].name") == "titan-01" + assert answer_post_ext._resolve_path({}, "line: cpu:95") == "cpu:95" + assert answer_post_ext._snapshot_id({"snapshot_id": "snap-1"}) == "snap-1" + payload = answer_post_ext._claims_to_payload([ClaimItem("c1", "claim", [EvidenceItem("nodes[0]", "why", value_at_claim="old")])]) + state = answer_post_ext._state_from_payload({"updated_at": 1.5, "claims": payload, "snapshot_id": "snap-1", "snapshot": {"nodes": 1}}) + assert state and state.snapshot_id == "snap-1" + + +def test_answer_post_edge_branches() -> None: + """Cover low-frequency formatting and fallback branches in post helpers.""" + + plan = answer_common._mode_plan(build_test_settings(), "smart") + + assert answer_post._strip_unknown_entities(" ", ["titan-01"], ["synapse"]) == " " + assert answer_post._needs_evidence_guard("Atlas runs on amd64 nodes.", ["nodes_total: 2"]) is True + assert answer_post._needs_evidence_guard("Atlas shows memorypressure.", ["nodes_total: 2"]) is True + + contradiction = asyncio.run( + answer_post._contradiction_decision( + ContradictionContext(ScriptedCall({"contradiction": '{"confidence":"bad","use_facts": false}'}), "q", "r", ["fact"], plan) + ) + ) + assert contradiction == {"use_facts": False, "confidence": 50} + + assert answer_post._filter_lines_by_keywords([], ["cpu"], 2) == [] + assert answer_post._filter_lines_by_keywords(["cpu: 95"], [], 2) == ["cpu: 95"] + assert answer_post._rank_metric_lines(["cpu high"], set(), 2) == [] + assert answer_post._select_metric_line([], "How many CPUs?", {"cpu"}) is None + assert answer_post._select_metric_line(["disk healthy"], "How many CPUs?", {"cpu"}) is None + assert answer_post._format_direct_metric_line("") == "" + assert answer_post._format_direct_metric_line("nodes:") == "nodes:" + assert answer_post._format_equals_metric("garbage") is None + assert answer_post._format_equals_metric("cpu=, ram=20") == "ram is 20." + assert answer_post._format_equals_metric("cpu=95, ram=20") == "cpu is 95; ram is 20." + assert answer_post._format_nodes_value("ready=2") is None + assert answer_post._format_nodes_value("total=3") == "Atlas has 3 total nodes." 
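+    # Empty or non-overlapping inputs should fall through to their empty/False defaults.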
+ assert answer_post._global_facts([]) == [] + assert answer_post._has_keyword_overlap(["cpu: 95"], []) is False + assert answer_post._has_keyword_overlap(["cpu: 95"], ["a"]) is False + assert answer_post._has_keyword_overlap(["cpu: 95"], ["ram"]) is False + assert answer_post._merge_tokens(["cpu", ""], ["ram"], ["cpu", "disk"]) == ["cpu", "ram", "disk"] + assert answer_post._expand_tokens([1, "a", "cpu-load"]) == ["cpu-load"] # type: ignore[list-item] + assert answer_post._ensure_token_coverage([], ["cpu"], ["cpu: 95"]) == [] + assert answer_post._ensure_token_coverage(["cpu: 95"], ["ram"], ["cpu: 95"]) == ["cpu: 95"] + assert answer_post._ensure_token_coverage(["cpu: 95"], ["cpu"], ["cpu: 95"]) == ["cpu: 95"] + assert answer_post._best_keyword_line(["cpu: 95"], []) is None + assert answer_post._best_keyword_line(["cpu: 95"], ["a"]) is None + assert answer_post._best_keyword_line(["disk: ok"], ["cpu"]) is None + assert answer_post._line_starting_with([], "cpu:") is None + assert answer_post._line_starting_with(["ram: 20"], "cpu:") is None + assert answer_post._non_rpi_nodes({"hardware_by_node": ["bad"]}) == {} # type: ignore[arg-type] + assert answer_post._non_rpi_nodes({"hardware_by_node": {"titan-01": "rpi5", "titan-02": 2}}) == {} # type: ignore[arg-type] + assert answer_post._format_hardware_groups({}, "Non-Raspberry Pi nodes") == "" + assert answer_post._lexicon_context([]) == "" # type: ignore[arg-type] + assert "alias pi -> rpi" in answer_post._lexicon_context({"lexicon": {"terms": ["bad"], "aliases": {"pi": "rpi"}}}) + assert answer_post._lexicon_context({"lexicon": {"terms": [{"term": "", "meaning": ""}], "aliases": {"": ""}}}) == "" + assert answer_post._parse_json_block("not-json", fallback={"fallback": True}) == {"fallback": True} + assert answer_post._parse_json_list("not-a-list") == [] + assert answer_post._coerce_int("nan", 7) == 7 + assert answer_post._needs_evidence_fix("", {"needs_snapshot": True}) is False + assert ( + asyncio.run( + answer_post._apply_insight_guard( + InsightGuardInput("q", "", {"answer_style": "insightful"}, "ctx", plan, ScriptedCall({}), []) + ) + ) + == "" + ) + + +def test_post_ext_and_retrieval_ext_edge_branches() -> None: + """Cover remaining branchy helpers in post_ext and retrieval_ext.""" + + plan = answer_common._mode_plan(build_test_settings(), "smart") + + assert answer_post_ext._reply_matches_metric_facts("Atlas is fine.", [], None) is True + assert answer_post_ext._reply_matches_metric_facts("cpu high", ["cpu: hot"], {"cpu"}) is False + assert answer_post_ext._needs_dedup("") is False + assert answer_post_ext._needs_dedup("One sentence only.") is False + assert answer_post_ext._needs_focus_fix("What is Atlas?", "Short answer.", {"question_type": "open_ended"}) is False + assert answer_post_ext._needs_focus_fix("How many pods?", "No data available.", {"question_type": "metric"}) is True + keywords = answer_post_ext._extract_keywords("the atlas", "show cpu", ["where now"], [1, "cpu"]) # type: ignore[list-item] + assert "cpu" in keywords + assert "the" not in keywords + assert answer_post_ext._allowed_nodes({"hardware_by_node": None}) == [] + assert answer_post_ext._allowed_namespaces({"namespace_pods": ["bad", {"namespace": ""}]}) == [] + assert answer_post_ext._find_unknown_nodes("", ["titan-01"]) == [] + assert answer_post_ext._find_unknown_nodes("plain text", ["titan-01"]) == [] + assert answer_post_ext._find_unknown_namespaces("", ["synapse"]) == [] + assert answer_post_ext._needs_runbook_fix("", ["runbooks/fix.md"]) is False + assert 
answer_post_ext._needs_runbook_fix("No runbook here.", ["runbooks/fix.md"]) is False + assert answer_post_ext._needs_runbook_reference("hello there", ["runbooks/fix.md"], "reply") is False + assert answer_post_ext._needs_runbook_reference("", ["runbooks/fix.md"], "reply") is False + assert answer_post_ext._needs_runbook_reference("where is the runbook", ["runbooks/fix.md"], "") is True + assert answer_post_ext._needs_runbook_reference("where is the runbook", ["runbooks/fix.md"], "Use runbooks/fix.md") is False + assert answer_post_ext._best_runbook_match("zzz", ["runbooks/fix.md"]) is None + assert answer_post_ext._resolve_path({"nodes": [1]}, "nodes..name") is None + assert answer_post_ext._resolve_path({"nodes": [1]}, "nodes[99]") is None + assert answer_post_ext._resolve_path({"nodes": {"bad": 1}}, "nodes[0]") is None + assert answer_post_ext._snapshot_id({}) is None + invalid_state = answer_post_ext._state_from_payload({"claims": ["bad", {"id": "", "claim": "x", "evidence": [{"path": ""}]}]}) + assert invalid_state is not None + assert invalid_state.claims == [] + + assert answer_retrieval_ext._parse_json_block("plain", fallback={"fallback": True}) == {"fallback": True} + assert asyncio.run(answer_retrieval_ext._select_best_candidate(ScriptedCall({}), "q", ["only"], plan, "pick")) == 0 + assert asyncio.run(answer_retrieval_ext._select_best_candidate(ScriptedCall({"pick": '{"best":"bad"}'}), "q", ["one", "two"], plan, "pick")) == 0 + assert asyncio.run(answer_retrieval_ext._select_best_list(ScriptedCall({}), "q", [], plan, "pick")) == [] + assert asyncio.run(answer_retrieval_ext._select_best_list(ScriptedCall({}), "q", [["cpu"]], plan, "pick")) == ["cpu"] + merged = asyncio.run( + answer_retrieval_ext._select_best_list(ScriptedCall({"pick": '{"best": 1}'}), "q", [[], ["cpu"], ["ram"]], plan, "pick") + ) + assert merged == ["cpu", "ram"] + assert asyncio.run(answer_retrieval_ext._extract_fact_types(ScriptedCall({"fact_types": '{}'}), "q", [], plan)) == [] + assert asyncio.run(answer_retrieval_ext._derive_signals(ScriptedCall({}), "q", [], plan)) == [] + assert asyncio.run(answer_retrieval_ext._scan_chunk_for_signals(ScriptedCall({}), "q", [], ["cpu: 95"], plan)) == [] + assert asyncio.run(answer_retrieval_ext._scan_chunk_for_signals(ScriptedCall({"chunk_scan": '{}'}), "q", ["cpu"], ["cpu: 95"], plan)) == [] + assert asyncio.run(answer_retrieval_ext._prune_metric_candidates(ScriptedCall({}), "q", [], plan, 1)) == [] + assert asyncio.run(answer_retrieval_ext._prune_metric_candidates(ScriptedCall({"fact_prune": '{}'}), "q", ["cpu: 95"], plan, 1)) == [] + assert asyncio.run(answer_retrieval_ext._select_fact_lines(ScriptedCall({}), "q", [], plan, 1)) == [] + assert asyncio.run(answer_retrieval_ext._select_fact_lines(ScriptedCall({"fact_select": '{}'}), "q", ["cpu: 95"], plan, 1)) == [] + + +def test_retrieval_ext_helpers() -> None: + """Cover retrieval helper parsing and selection branches.""" + + assert answer_retrieval_ext._parse_json_block("prefix {\"ok\": true} suffix", fallback={}) == {"ok": True} + assert "cpu" in answer_retrieval_ext._metric_key_tokens(["cpu_load: 95", "bad-line"]) + assert answer_retrieval_ext._dedupe_lines(["a", "a", "lexicon_x", "units: bad", "b"], limit=2) == ["a", "b"] + assert answer_retrieval_ext._collect_fact_candidates([{"text": "a\nb"}, {"text": None}], limit=3) == ["a", "b"] + + scripted = ScriptedCall( + { + "pick": '{"best": 2}', + "fact_types": ['{"fact_types": ["cpu", "ram"]}', '{"fact_types": ["cpu"]}'], + "fact_types_select": '{"best": 1}', + 
"signals": ['{"signals": ["cpu", "thermal"]}'], + "signals_select": '{"best": 1}', + "chunk_scan": ['{"lines": ["cpu: 95"]}'], + "chunk_scan_select": '{"best": 1}', + "fact_prune": ['{"lines": ["cpu: 95"]}'], + "fact_prune_select": '{"best": 1}', + "fact_select": ['{"lines": ["cpu: 95", "ram: 20"]}'], + "fact_select_best": '{"best": 1}', + } + ) + plan = answer_common._mode_plan(build_test_settings(), "smart") + idx = asyncio.run(answer_retrieval_ext._select_best_candidate(scripted, "q", ["one", "two"], plan, "pick")) + assert idx == 1 + assert asyncio.run(answer_retrieval_ext._select_best_list(scripted, "q", [[], ["cpu"]], plan, "pick")) == ["cpu"] + assert asyncio.run(answer_retrieval_ext._extract_fact_types(scripted, "q", ["cpu"], plan)) == ["cpu", "ram"] + assert asyncio.run(answer_retrieval_ext._derive_signals(scripted, "q", ["cpu"], plan)) == ["cpu", "thermal"] + assert asyncio.run(answer_retrieval_ext._scan_chunk_for_signals(scripted, "q", ["cpu"], ["cpu: 95"], plan)) == ["cpu: 95"] + assert asyncio.run(answer_retrieval_ext._prune_metric_candidates(scripted, "q", ["cpu: 95"], plan, 2)) == ["cpu: 95"] + assert asyncio.run(answer_retrieval_ext._select_fact_lines(scripted, "q", ["cpu: 95", "ram: 20"], plan, 2)) == ["cpu: 95", "ram: 20"] + + +def test_answer_engine_helper_methods(tmp_path: Path) -> None: + """Exercise direct engine helpers that the top-level flow rarely hits.""" + + settings = replace(build_test_settings(), state_db_path=str(tmp_path / "state.db")) + llm = SimpleNamespace(chat=lambda messages, model=None: asyncio.sleep(0, result="stock")) # type: ignore[call-arg] + engine = answer_engine.AnswerEngine( + settings, + llm, # type: ignore[arg-type] + KnowledgeBase(""), + SimpleNamespace(), # type: ignore[arg-type] + ) + stock = asyncio.run(engine._answer_stock("What is Atlas?")) + assert stock.reply == "stock" + + scripted = ScriptedCall( + { + "synth": ["draft-one", "draft-two", "single-draft"], + "draft_select": '{"best": 2}', + "score": '{"confidence": 90, "relevance": 80, "satisfaction": 70, "hallucination_risk": "low"}', + "claim_map": '{"claims":[{"id":"c1","claim":"Atlas is busy","evidence":[{"path":"nodes[0].name","reason":"hot"}]}]}', + "dedup": "deduped", + } + ) + plan = replace(answer_common._mode_plan(settings, "smart"), drafts=2, parallelism=2, use_scores=True) + assert asyncio.run(engine._synthesize_answer("q", ["a", "b"], "ctx", {"question_type": "metric"}, plan, scripted)) == "draft-two" + assert asyncio.run(engine._synthesize_answer("q", [], "ctx", {"question_type": "metric"}, plan, scripted)) == "single-draft" + scores = asyncio.run(engine._score_answer("q", "reply", plan, scripted)) + assert scores.confidence == 90 + claims = asyncio.run(engine._extract_claims("q", "reply", {"nodes": [{"name": "titan-01"}]}, ["nodes[0].name: titan-01"], scripted)) + assert claims and claims[0].id == "c1" + assert asyncio.run(engine._dedup_reply("Alpha. Alpha. Beta.", plan, scripted, "dedup")) == "deduped" + assert asyncio.run(engine._dedup_reply("Alpha only.", plan, scripted, "dedup")) == "Alpha only." 
+ + contradiction = asyncio.run( + answer_post._contradiction_decision( + ContradictionContext(scripted, "q", "reply", ["cpu:95"], plan), + attempts=2, + ) + ) + assert contradiction["confidence"] == 50 + + +def test_answer_engine_edge_fallbacks(tmp_path: Path) -> None: + """Cover engine fallbacks that only show up on malformed helper output.""" + + settings = replace(build_test_settings(), state_db_path=str(tmp_path / "state.db")) + engine = answer_engine.AnswerEngine( + settings, + SimpleNamespace(chat=lambda *_args, **_kwargs: asyncio.sleep(0, result="unused")), # type: ignore[arg-type] + KnowledgeBase(""), + SimpleNamespace(), # type: ignore[arg-type] + ) + plan = replace(answer_common._mode_plan(settings, "smart"), drafts=2, parallelism=1, use_scores=False) + + bad_select = ScriptedCall({"synth": ["draft-one", "draft-two"], "draft_select": '{"best": 99}'}) + assert asyncio.run(engine._synthesize_answer("q", ["a", "b"], "ctx", {"question_type": "metric"}, plan, bad_select)) == "draft-one" + assert asyncio.run(engine._score_answer("q", "reply", plan, bad_select)).confidence == 60 + assert asyncio.run(engine._extract_claims("q", "", {"nodes": []}, [], bad_select)) == [] + + malformed_claims = ScriptedCall( + { + "claim_map": '{"claims":[{"id":"c1","claim":"hot","evidence":["bad",{"path":"","reason":"nope"}]},{"claim":"","evidence":[{"path":"nodes[0].name","reason":"why"}]}]}', + "select_claims": '{"claim_ids":"bad"}', + } + ) + claims = asyncio.run(engine._extract_claims("q", "reply", {"nodes": [{"name": "titan-01"}]}, [], malformed_claims)) + assert claims == [] + assert asyncio.run(engine._select_claims("q", [ClaimItem("c1", "claim", [EvidenceItem("nodes[0].name", "why")])], plan, malformed_claims)) == [] + + +def test_factsheet_edge_paths() -> None: + """Cover low-frequency factsheet selection and heuristic branches.""" + + assert answer_factsheet._is_plain_math_question("") is False + + fact_lines = answer_factsheet._quick_fact_sheet_lines( + "where is the titan runbook", + [""], + [ + "", + "x" * 300, + "KB File: notes.md", + "runbook alpha", + "runbook beta", + "titan-01 runs hot", + "amd64 nodes are available", + "runbook gamma", + ], + limit=6, + ) + assert "runbook alpha" in fact_lines + assert "titan-01 runs hot" in fact_lines + assert all(not line.startswith("KB File:") for line in fact_lines) + + assert ( + answer_factsheet._quick_fact_sheet_heuristic_answer( + "which nodes are not ready?", + ["noise first", "nodes_total:2,ready:1,not_ready:1"], + ) + == "The latest snapshot shows 1 not-ready nodes (1 ready out of 2 total)." 
+ ) + assert answer_factsheet._quick_fact_sheet_heuristic_answer("how many ready nodes?", ["noise first"]) == "" + + +def test_snapshot_builder_core_a_edge_paths(monkeypatch: pytest.MonkeyPatch) -> None: + """Cover cached snapshot fallback and summary-builder edge branches.""" + + settings = replace(build_test_settings(), ariadne_state_url="http://snapshot") + provider = SnapshotProvider(settings) + provider._cache = {"cached": True} + + def broken_get(*_args: Any, **_kwargs: Any) -> Any: + raise httpx.HTTPError("boom") + + monkeypatch.setattr("atlasbot.snapshot.builder.httpx.get", broken_get) + assert provider.get() == {"cached": True} + + assert core_a._node_usage_top([{}, {"node": "titan-01", "value": "bad"}, {"node": "titan-02", "value": 3}]) == { + "node": "titan-02", + "value": 3.0, + } + + merged: dict[str, Any] = {} + core_a._merge_cluster_fields(merged, {"signals": [], "profiles": "bad"}, {"signals": list, "profiles": dict}) + assert merged == {"signals": []} + + assert core_a._build_nodes({}) == {} + assert core_a._build_hardware([]) == {} + assert core_a._build_hardware([{}, {"name": "titan-01", "hardware": "rpi5"}]) == {"hardware": {"rpi5": ["titan-01"]}} + assert core_a._build_hardware_by_node([{}, {"name": "titan-01", "hardware": "rpi5"}]) == {"hardware_by_node": {"titan-01": "rpi5"}} + assert core_a._build_hardware_usage({}, {"titan-01": "rpi5"}) == {} + assert core_a._build_hardware_usage({"node_load": []}, {"titan-01": "rpi5"}) == {} + + usage = core_a._build_hardware_usage( + {"node_load": [{}, {"node": "", "cpu": 1}, {"node": "titan-01", "load_index": 2, "cpu": 50}]}, + {"titan-01": "rpi5"}, + ) + assert usage["hardware_usage_avg"][0]["cpu"] == 50 + + assert core_a._build_node_facts([]) == {} + facts = core_a._build_node_facts( + [{}, {"is_worker": True, "roles": ["db", "", 1], "arch": "amd64", "os": "linux", "kubelet": "v1", "kernel": "k", "container_runtime": "c"}] + ) + assert facts["node_role_counts"]["worker"] == 1 + + assert core_a._build_node_taints([{}, {"name": ""}, {"name": "titan-01", "taints": ["bad", {"key": "dedicated", "effect": "NoSchedule"}]}]) == { + "node_taints": {"dedicated:NoSchedule": ["titan-01"]} + } + + headroom = core_a._build_root_disk_headroom( + {"node_usage": {"disk": [{}, {"node": "titan-01", "value": "bad"}, {"node": "titan-02", "value": 80}]}} + ) + assert headroom["root_disk_low_headroom"][0]["node"] == "titan-02" + + assert core_a._build_capacity({}) == {} + assert core_a._build_workload_health({"workloads_health": {"deployments": {}, "statefulsets": {}, "daemonsets": []}}) == {} + assert core_a._build_postgres({}) == {} + + +def test_snapshot_builder_format_c_edge_paths() -> None: + """Cover summary text formatter branches that only trigger on sparse data.""" + + lines: list[str] = [] + format_c._append_signals(lines, {}) + format_c._append_profiles(lines, {}) + format_c._append_cluster_watchlist(lines, {}) + assert lines == [] + + format_c._append_signals( + lines, + { + "signals": [ + "bad", + {"scope": "node", "target": "titan-01", "metric": "cpu", "current": 95, "delta_pct": 10, "severity": "warn"}, + ] + }, + ) + format_c._append_profiles( + lines, + { + "profiles": { + "nodes": ["bad", {"node": "titan-01", "load_index": 0.9, "cpu": 95, "ram": 70, "pods_total": 5, "hardware": "rpi5"}], + "namespaces": ["bad", {"namespace": "synapse", "pods_total": 4, "cpu_usage": 80, "mem_usage": 70, "primary_node": "titan-01"}], + "workloads": ["bad", {"namespace": "synapse", "workload": "app", "pods_total": 2, "pods_running": 2, "primary_node": 
"titan-01"}], + } + }, + ) + format_c._append_units_windows(lines, {"metrics": {}}) + format_c._append_node_load_summary( + lines, + { + "hardware_by_node": {"titan-01": "rpi5"}, + "node_load_summary": { + "top": ["bad", {"node": "titan-01", "load_index": 1.5, "cpu": 90, "ram": 80, "io": 1024, "net": 2048, "pods_total": 7}], + "outliers": ["bad", {"node": ""}, {"node": "titan-02"}], + }, + }, + ) + format_c._append_hardware_usage( + lines, + { + "hardware_usage_avg": [ + "bad", + {"hardware": "", "cpu": 1}, + {"hardware": "rpi5", "load_index": 1.5, "cpu": 90, "ram": 80, "io": 1024, "net": 2048}, + {"hardware": "amd64", "load_index": 2.5, "cpu": 95, "ram": 70, "io": 4096, "net": 8192}, + ] + }, + ) + format_c._append_cluster_watchlist(lines, {"cluster_watchlist": ["not_ready_nodes=1"]}) + format_c._append_baseline_deltas( + lines, + { + "baseline_deltas": { + "nodes": {"cpu": ["bad", {"node": "titan-01", "delta": 10, "severity": "warn"}]}, + "namespaces": {"cpu": [{"namespace": "synapse", "delta": 12}]}, + } + }, + ) + format_c._append_pod_issue_summary( + lines, + { + "pod_issue_summary": { + "waiting_reasons_top": ["bad", {"reason": "ImagePullBackOff", "count": 2}], + "phase_reasons_top": [{"reason": "CrashLoopBackOff", "count": 1}], + "namespace_issue_top": {"cpu": ["bad", {"namespace": "synapse", "value": 95}, {"namespace": "", "value": 1}]}, + } + }, + ) + + watchlist = format_c._build_cluster_watchlist( + { + "nodes_summary": {"not_ready": 1}, + "pressure_nodes": {"names": ["titan-02"]}, + "pod_issues": {"pending_over_15m": 2}, + "workloads_health": {"deployments": {"not_ready": 1}, "statefulsets": {"not_ready": 0}, "daemonsets": {"not_ready": 1}}, + "flux": {"not_ready": 1}, + "pvc_usage_top": [{"value": 95}], + } + ) + assert "cluster_watchlist" in watchlist + + assert format_c._capacity_ratio_parts(["bad", {"namespace": "synapse", "cpu_usage_ratio": 1.2, "cpu_usage": 2, "cpu_requests": 1}], "cpu_usage_ratio", "cpu_usage", "cpu_requests") == [ + "synapse=1.2 (usage=2 req=1)" + ] + assert format_c._capacity_headroom_parts(["bad", {"namespace": "synapse", "headroom": 12.5}]) == ["synapse=12.5"] + + cap_lines: list[str] = [] + format_c._append_namespace_capacity_summary( + cap_lines, + { + "namespace_capacity_summary": { + "cpu_ratio_top": [{"namespace": "synapse", "cpu_usage_ratio": 1.2, "cpu_usage": 2, "cpu_requests": 1}], + "mem_ratio_top": [{"namespace": "synapse", "mem_usage_ratio": 1.1, "mem_usage": 3, "mem_requests": 2}], + "cpu_headroom_low": [{"namespace": "synapse", "headroom": 12.5}], + "mem_headroom_low": [{"namespace": "synapse", "headroom": 8.5}], + "cpu_overcommitted": 1, + "mem_overcommitted": 0, + "cpu_overcommitted_names": ["synapse", ""], + "mem_overcommitted_names": ["synapse"], + } + }, + ) + assert any(line.startswith("namespace_cpu_ratio_top:") for line in cap_lines) + + format_c._append_workloads_by_namespace( + lines, + { + "workloads": [ + "bad", + {"namespace": "", "workload": "skip"}, + {"namespace": "synapse", "workload": "app", "pods_total": 2, "primary_node": "titan-01"}, + {"namespace": "synapse", "workload": "db", "pods_total": 1}, + ] + }, + ) + format_c._append_lexicon( + lines, + {"lexicon": {"terms": ["bad", {"term": "Atlas", "meaning": "cluster"}], "aliases": {"pi": "rpi", "": ""}}}, + ) + format_c._append_cross_stats( + lines, + { + "cross_stats": { + "node_metric_top": ["bad", {"metric": "cpu", "node": "titan-01", "value": 95, "cpu": 95, "ram": 80, "net": 12, "io": 9, "pods_total": 5}], + "namespace_metric_top": ["bad", {"metric": "cpu", 
"namespace": "synapse", "value": 95, "cpu_ratio": 1.2, "mem_ratio": 1.1, "pods_total": 4}], + "pvc_top": ["bad", {"namespace": "synapse", "pvc": "data", "used_percent": 90}], + } + }, + ) + + assert any(line.startswith("signals:") for line in lines) + assert any(line.startswith("units: cpu_pct") for line in lines) + assert any(line.startswith("hardware_usage_top:") for line in lines) + assert any(line.startswith("namespace_issue_top_cpu:") for line in lines) + assert any(line.startswith("lexicon_term: Atlas") for line in lines) + assert any(line.startswith("cross_pvc_usage: synapse/data") for line in lines) + + +def test_snapshot_builder_format_b_edge_paths() -> None: + """Cover sparse-data and fallback branches in the mid-level snapshot formatters.""" + + lines: list[str] = [] + format_b._append_longhorn(lines, {}) + format_b._append_namespace_usage(lines, {}) + format_b._append_job_failures(lines, {}) + format_b._append_jobs(lines, {}) + format_b._append_postgres(lines, {}) + format_b._append_hottest(lines, {}) + format_b._append_workloads(lines, {}) + format_b._append_topology(lines, {}) + format_b._append_flux(lines, {}) + assert lines == [] + + format_b._append_namespace_metric_series(lines, "namespace_cpu_top", ["bad"], format_b._format_float) + assert lines == [] + + format_b._append_longhorn( + lines, + { + "longhorn": { + "total": 3, + "unhealthy_count": 1, + "by_state": {"attached": 2}, + "by_robustness": {"healthy": 2}, + "unhealthy": ["bad", {"name": "vol-1", "state": "detached", "robustness": "degraded"}], + } + }, + ) + format_b._append_longhorn( + lines, + { + "longhorn": { + "total": 4, + "attached_count": 2, + "detached_count": 1, + "degraded_count": 1, + } + }, + ) + format_b._append_namespace_usage( + lines, + { + "metrics": { + "namespace_cpu_top": ["bad", {"metric": {"namespace": "synapse"}, "value": 95}], + "namespace_mem_top": [{"metric": {"namespace": "synapse"}, "value": 1024}], + } + }, + ) + format_b._append_namespace_requests( + lines, + { + "metrics": { + "namespace_cpu_requests_top": [{"metric": {"namespace": "synapse"}, "value": 2}], + "namespace_mem_requests_top": [{"metric": {"namespace": "synapse"}, "value": 2048}], + } + }, + ) + format_b._append_namespace_io_net( + lines, + { + "metrics": { + "namespace_net_top": [{"metric": {"namespace": "synapse"}, "value": 2048}], + "namespace_io_top": [{"metric": {"namespace": "synapse"}, "value": 1024}], + } + }, + ) + format_b._append_pod_usage( + lines, + { + "metrics": { + "pod_cpu_top": ["bad", {"metric": {"namespace": "synapse", "pod": "app"}, "value": 95}], + "pod_cpu_top_node": ["bad", {"metric": {"namespace": "synapse", "pod": "app", "node": "titan-01"}, "value": 90}], + "pod_mem_top": ["bad", {"metric": {"namespace": "synapse", "pod": "app"}, "value": 1024}], + "pod_mem_top_node": ["bad", {"metric": {"namespace": "synapse", "pod": "app", "node": "titan-01"}, "value": 2048}], + } + }, + ) + format_b._append_restarts(lines, {"metrics": {}}) + format_b._append_restarts( + lines, + { + "metrics": { + "top_restarts_1h": ["bad", {"metric": {"namespace": "synapse", "pod": "app"}, "value": [0, 3]}], + "restart_namespace_top": [{"metric": {"namespace": "synapse"}, "value": 3}], + } + }, + ) + format_b._append_job_failures( + lines, + {"metrics": {"job_failures_24h": ["bad", {"metric": {"namespace": "batch", "job_name": "cleanup"}, "value": 2}]}} + ) + format_b._append_jobs( + lines, + { + "jobs": { + "totals": {"total": 4, "active": 1, "failed": 1, "succeeded": 2}, + "failing": ["bad", {"namespace": "batch", "job": 
"cleanup", "failed": 2, "age_hours": 1.5}], + "active_oldest": ["bad", {"namespace": "batch", "job": "sync", "age_hours": 2.5}], + } + }, + ) + format_b._append_postgres( + lines, + { + "postgres": { + "used": 4, + "max": 20, + "hottest_db": "atlas", + "by_db": ["bad", {"metric": {"datname": "atlas"}, "value": [0, 4]}], + } + }, + ) + format_b._append_hottest( + lines, + { + "hardware_by_node": {"titan-01": "rpi5"}, + "hottest": {"cpu": {"node": "titan-01", "value": 95}, "net": {"node": "titan-01", "value": 2048}, "bad": "skip"}, + }, + ) + format_b._append_hottest(lines, {"hottest": {"ram": {"node": "titan-02", "value": 88}}}) + format_b._append_workloads( + lines, + {"workloads": ["bad", {"namespace": "synapse", "workload": "app", "pods_total": 3, "primary_node": "titan-01"}]}, + ) + format_b._append_workloads(lines, {"workloads": ["bad"]}) + format_b._append_topology( + lines, + { + "topology": { + "nodes": ["bad", {"node": "titan-01", "workloads_top": [("app", 3), ("db", 1)]}], + "workloads": ["bad", {"namespace": "synapse", "workload": "app", "nodes_top": [("titan-01", 3)]}], + } + }, + ) + format_b._append_flux( + lines, + { + "flux": { + "not_ready": 1, + "items": ["bad", {"name": "kustomize", "namespace": "flux-system", "reason": "stalled", "suspended": True}], + } + }, + ) + + assert any(line.startswith("longhorn: total=3") for line in lines) + assert any(line.startswith("namespace_cpu_top:") for line in lines) + assert "restarts_1h_top: none" in lines + assert any(line.startswith("restarts_1h_top: synapse/app=3") for line in lines) + assert any(line.startswith("jobs_failing_top:") for line in lines) + assert any(line.startswith("postgres_connections_by_db: atlas=4") for line in lines) + assert any(line.startswith("flux_not_ready_items: flux-system/kustomize") for line in lines) + assert format_b._format_jobs_totals({}) == "" + assert format_b._format_jobs_failing({}) == "" + assert format_b._format_jobs_active_oldest({}) == "" + + +def test_snapshot_builder_format_a_edge_paths() -> None: + """Cover sparse-data, fallback, and invalid-entry branches in base snapshot formatters.""" + + assert format_a._format_float("bad") == "bad" + assert format_a._format_rate_bytes("bad") == "bad" + assert format_a._format_rate_bytes(12).endswith("B/s") + assert format_a._format_bytes("bad") == "bad" + assert format_a._format_kv_map({}) == "" + assert format_a._format_names([]) == "" + assert format_a._format_pod_issue_counts({}) == "" + assert format_a._format_pod_issue_top({"items": ["bad", {"namespace": "", "pod": "api"}]}) == "" + assert format_a._format_pod_pending_oldest({"pending_oldest": ["bad", {"namespace": "synapse", "pod": "api"}]}) == "" + assert format_a._format_pod_waiting_reasons({}) == "" + assert format_a._format_pod_pending_over_15m({"pending_over_15m": "bad"}) == "" + + lines: list[str] = [] + format_a._append_nodes(lines, {"nodes": {"total": 2, "ready": 1, "not_ready": None}}) + format_a._append_hardware(lines, {"hardware": {"rpi5": ["titan-01", ""], "skip": "bad"}}) + format_a._append_hardware_groups(lines, {"hardware": {"rpi5": ["titan-01"], "skip": "bad"}}) + format_a._append_node_ages(lines, {"node_ages": ["bad", {"name": "titan-01", "age_hours": "oops"}, {"name": "titan-02", "age_hours": 2.5}]}) + format_a._append_node_taints(lines, {"node_taints": {"gpu": ["titan-22"], "skip": "bad"}}) + format_a._append_node_facts(lines, {"node_arch_counts": {}, "node_os_counts": {"linux": 2}}) + format_a._append_pressure(lines, {"pressure_nodes": {"disk": ["titan-10", ""], "memory": 
[]}}) + format_a._append_pods(lines, {"pods": {"running": 3, "pending": 1, "failed": 0, "succeeded": 2}}) + format_a._append_capacity(lines, {"capacity": {"cpu": "bad", "allocatable_cpu": 3, "mem_bytes": 512, "allocatable_mem_bytes": 2048, "pods": 10}}) + format_a._append_namespace_pods(lines, {"namespace_pods": [{"namespace": "", "pods_total": 1}, {"namespace": "synapse", "pods_total": 3, "pods_running": 2}]}) + format_a._append_namespace_nodes(lines, {"namespace_nodes": [{"namespace": "synapse", "pods_total": 3, "primary_node": "titan-01"}, {"namespace": "", "pods_total": 1}]}) + format_a._append_node_pods(lines, {"node_pods": ["bad", {"node": "titan-01", "pods_total": "oops"}, {"node": "titan-02", "pods_total": 4, "namespaces_top": [("synapse", 3)]}]}) + format_a._append_pod_issues( + lines, + { + "pod_issues": { + "counts": {"Failed": 1, "Pending": 2}, + "items": ["bad", {"namespace": "", "pod": "skip"}, {"namespace": "synapse", "pod": "api", "phase": "Pending", "restarts": 1}], + "pending_oldest": ["bad", {"namespace": "synapse", "pod": "api", "age_hours": 1.5, "reason": "ImagePullBackOff"}], + "waiting_reasons": {"CrashLoopBackOff": 3}, + "pending_over_15m": "bad", + } + }, + ) + format_a._append_workload_health( + lines, + {"workloads_health": {"deployments": {"not_ready": 1}, "statefulsets": {"not_ready": 0}, "daemonsets": {"not_ready": 2}}}, + ) + format_a._append_node_usage_stats(lines, {"metrics": {"node_usage_stats": {"cpu": {"avg": 91}, "net": {"avg": 2048}, "disk": {}}}}) + format_a._append_events(lines, {"events": {"warnings_total": 2, "warnings_by_reason": {"BackOff": 2, "Failed": 1}}}) + format_a._append_events(lines, {"events": {"warnings_total": 0, "warnings_by_reason": {}}}) + format_a._append_pvc_usage(lines, {"pvc_usage_top": ["bad", {"metric": {"namespace": "synapse", "persistentvolumeclaim": "data"}, "value": 88}]}) + format_a._append_root_disk_headroom(lines, {"root_disk_low_headroom": ["bad", {"node": "titan-01", "headroom_pct": 12.5}]}) + + assert any(line.startswith("nodes: total=2, ready=1, not_ready=0") for line in lines) + assert any(line.startswith("hardware: rpi5=2") for line in lines) + assert any(line.startswith("node_age_top: titan-02=2.5h") for line in lines) + assert any(line.startswith("node_taints: gpu=1") for line in lines) + assert any(line.startswith("node_os: linux=2") for line in lines) + assert any(line.startswith("node_pressure: disk=2") for line in lines) + assert any(line.startswith("namespace_nodes_top: synapse=3") for line in lines) + assert any(line.startswith("node_pods_top: titan-02=4") for line in lines) + assert any(line.startswith("pod_issues: Failed=1; Pending=2") for line in lines) + assert any(line.startswith("pods_pending_oldest: synapse/api=1.5h") for line in lines) + assert any(line.startswith("workloads_not_ready: deployments=1") for line in lines) + assert any(line.startswith("node_usage_avg: cpu=91") for line in lines) + assert "warnings: total=0" in lines + assert any(line.startswith("pvc_usage_top: synapse/data=88") for line in lines) + assert any(line.startswith("root_disk_low_headroom: titan-01=12.5%") for line in lines) + + +def test_retrieval_helper_edge_paths() -> None: + """Cover fallback-heavy retrieval helpers and metric-selection branches.""" + + assert answer_retrieval._metric_ctx_values({}) == ([], "", [], [], set()) + assert answer_retrieval._extract_metric_keys(["no colon", "bad key: value", "nodes_total: 2", "nodes_total: 3"]) == ["nodes_total"] + assert answer_retrieval._token_variants(set()) == set() + 
assert "policy" in answer_retrieval._token_variants({"policies"}) + assert answer_retrieval._parse_key_list('[1, "nodes_total", "nodes_total"]', ["nodes_total"], 1) == ["nodes_total"] + assert answer_retrieval._chunk_ids_for_keys([{"id": "c1", "text": "nodes_total: 2"}], []) == [] + assert answer_retrieval._chunk_ids_for_keys([{"id": "c1", "text": ""}, {"id": "c2", "text": "nodes_total: 2"}], ["nodes_total"]) == ["c2"] + assert answer_retrieval._filter_metric_keys([], {"cpu"}) == [] + assert answer_retrieval._filter_metric_keys(["nodes_total"], {"ram"}) == [] + assert not answer_retrieval._metric_key_overlap([], {"cpu"}) + assert not answer_retrieval._metric_key_overlap(["nodes_total"], {"ram"}) + assert answer_retrieval._lines_for_metric_keys([], ["nodes_total"]) == [] + assert answer_retrieval._lines_for_metric_keys(["nodes_total: 2", "namespace_cpu_top: synapse=95"], ["nodes_total", "namespace_cpu_top"], max_lines=1) == ["nodes_total: 2"] + assert answer_retrieval._merge_metric_keys(["a"], ["a", "b"], 1) == ["a"] + assert answer_retrieval._merge_fact_lines(["a", "a"], ["a", "b"]) == ["a", "b"] + assert answer_retrieval._expand_hottest_line("") == [] + assert answer_retrieval._expand_hottest_line("other: cpu=x") == [] + assert answer_retrieval._expand_hottest_line("hottest: badpart") == [] + assert answer_retrieval._expand_hottest_line("hottest: cpu=titan-01 [rpi5] (95%)") == ["hottest_cpu_node: titan-01 [rpi5] (95%)"] + assert answer_retrieval._expand_hottest_line("hottest: ram=titan-02 (80%)") == ["hottest_ram_node: titan-02 (80%)"] + assert answer_retrieval._has_token("disk i/o busy", "io") + assert not answer_retrieval._has_token("", "cpu") + assert answer_retrieval._hotspot_evidence({"hottest": {}}) == [] + + hotspot_lines = answer_retrieval._hotspot_evidence( + { + "hottest": {"cpu": {"node": "titan-01", "value": 95}, "skip": "bad"}, + "hardware_by_node": {"titan-01": "rpi5"}, + "node_pods_top": ["bad", {"node": "titan-01", "namespaces_top": [("synapse", 3), ("db", 1)]}, {"node": "titan-02", "namespaces_top": ["bad"]}], + } + ) + assert any(line.startswith("hotspot.cpu: node=titan-01 class=rpi5 value=95.00") for line in hotspot_lines) + assert answer_retrieval._hotspot_evidence({"hottest": {"ram": {"value": 50}}, "node_pods_top": []}) == [] + + plan = answer_common._mode_plan(build_test_settings(), "smart") + chunks = [{"id": "c1", "text": "namespace_cpu_top: synapse=95\nnodes_total: 2"}] + ctx = { + "summary_lines": ["namespace_cpu_top: synapse=95", "nodes_total: 2"], + "question": "which namespace has the highest cpu", + "sub_questions": ["which namespace"], + "keywords": ["namespace", "cpu"], + "keyword_tokens": ["namespace", "cpu"], + } + + scripted = ScriptedCall({"metric_keys": "{}", "metric_keys_validate": '{"missing":["namespace_cpu_top"]}'}) + selected, chunk_ids = asyncio.run(answer_retrieval._select_metric_chunks(scripted, ctx, chunks, plan)) + assert selected == ["namespace_cpu_top"] + assert chunk_ids == ["c1"] + + no_overlap = ScriptedCall({"metric_keys": '{"keys":["nodes_total"]}', "metric_keys_validate": '{"missing":[]}'}) + selected, _ = asyncio.run(answer_retrieval._select_metric_chunks(no_overlap, ctx, chunks, plan)) + assert selected == ["namespace_cpu_top"] + + no_keys = ScriptedCall({"metric_keys": "{}"}) + assert asyncio.run(answer_retrieval._select_metric_chunks(no_keys, {"summary_lines": ["bad key: value"], "question": "cpu", "sub_questions": [], "keywords": [], "keyword_tokens": []}, chunks, plan)) == ([], []) + assert 
asyncio.run(answer_retrieval._select_metric_chunks(no_keys, {"summary_lines": ["nodes_total: 2"], "question": "mystery", "sub_questions": [], "keywords": [], "keyword_tokens": []}, chunks, plan)) == ([], [])
+    assert asyncio.run(answer_retrieval._select_metric_chunks(scripted, {"summary_lines": [], "question": "cpu"}, chunks, plan)) == ([], [])
+    assert asyncio.run(answer_retrieval._validate_metric_keys(scripted, {"question": "cpu", "sub_questions": [], "selected": []}, [], plan)) == []
+    assert asyncio.run(answer_retrieval._gather_limited([], 2)) == []
+
+
+def test_spine_helper_edge_paths(monkeypatch: pytest.MonkeyPatch) -> None:
+    """Cover fallback and summary-derived spine branches."""
+
+    assert answer_spine._join_context(["alpha", "", "beta"]) == "alpha\nbeta"
+    assert answer_spine._format_metric_value(True) == "true"
+    assert answer_spine._format_metric_value(2) == "2"
+    assert answer_spine._format_metric_value(2.5) == "2.5"
+    assert answer_spine._format_metric_value(object()).startswith("<")
+
+
+def test_finalize_answer_branches() -> None:
+    """Drive the expensive post-processing branches with a deterministic engine double."""
+
+    plan = replace(answer_common._mode_plan(build_test_settings(), "smart"), use_critic=False, use_gap=False)
+    observed: list[tuple[str, str]] = []
+    scripted = ScriptedCall(
+        {
+            "runbook_select": ['{"path":"runbooks/fix.md"}', "{}"],
+            "evidence_fix": "namespace ghost on titan-99 uses runbooks/fix-md.",
+            "evidence_fix_enforce": "namespace ghost on titan-99 uses runbooks/fix-md.",
+            "metric_direct": "no digits here",
+            "runbook_enforce": [
+                "Non-Raspberry Pi nodes: amd64 (titan-02). See runbooks/fix-md.",
+                "amd64 stays separate. This does not provide the exact value.",
+            ],
+            "contradiction": '{"use_facts": true, "confidence": 90}',
+            "evidence_guard": "This does not provide the exact value.",
+            "focus_fix": "No exact value provided.",
+        }
+    )
+
+    class FinalizeEngine:
+        async def _synthesize_answer(self, *args: Any) -> str:
+            return "namespace ghost on titan-99 uses runbooks/fix-md."
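+        # The hooks below are deterministic pass-throughs, so any cleanup must come from finalize_answer itself.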
+ + async def _dedup_reply(self, reply: str, _plan: Any, _call_llm: Any, tag: str) -> str: + assert tag == "dedup" + return reply + + async def _score_answer(self, _question: str, _reply: str, _plan: Any, _call_llm: Any) -> AnswerScores: + return AnswerScores(70, 71, 72, "medium") + + async def _extract_claims(self, _question: str, _reply: str, _summary: dict[str, Any], _facts_used: list[str], _call_llm: Any) -> list[ClaimItem]: + return [] + + reply, scores, claims = asyncio.run( + answer_workflow_post.finalize_answer( + engine=FinalizeEngine(), + call_llm=scripted, + normalized="Which nodes are not raspberry and which runbook should I use?", + subanswers=["draft"], + context="ctx", + classify={"question_type": "metric", "needs_snapshot": True, "answer_style": "direct"}, + plan=plan, + summary={"hardware_by_node": {"titan-01": "rpi5", "titan-02": "amd64"}}, + summary_lines=["hardware_nodes: rpi5=(titan-01); amd64=(titan-02)", "namespace_cpu_top: synapse=95", "nodes_total: 2"], + metric_facts=["nodes_total: 2"], + key_facts=["namespace_cpu_top: synapse=95"], + facts_used=["namespace_cpu_top: synapse=95"], + allowed_nodes=["titan-01", "titan-02"], + allowed_namespaces=["synapse"], + runbook_paths=["runbooks/fix.md"], + lowered_question="which nodes are not raspberry and which runbook should i use?", + force_metric=True, + keyword_tokens=["namespace"], + question_tokens=["namespace", "raspberry", "runbook"], + snapshot_context="ClusterSnapshot:\nnamespace_cpu_top: synapse=95", + observer=lambda stage, note: observed.append((stage, note)), + mode="smart", + metric_keys=["nodes_total"], + ) + ) + + assert reply == "Latest metrics: nodes_total: 2." + assert scores.confidence == 70 + assert claims == [] + stages = [stage for stage, _note in observed] + assert "evidence_fix" in stages + assert "runbook_enforce" in stages + assert "evidence_guard" in stages + assert "focus_fix" in stages + + +def test_run_answer_empty_stock_and_budget_paths(monkeypatch: pytest.MonkeyPatch) -> None: + """Cover early returns and the pre-call time-budget failure path.""" + + settings = build_test_settings() + + class EmptySnapshot: + def get(self) -> dict[str, Any]: + return {} + + class EmptyKb: + def summary(self) -> str: + return "" + + def runbook_titles(self, limit: int = 6) -> str: + del limit + return "" + + def runbook_paths(self, limit: int = 10) -> list[str]: + del limit + return [] + + class MinimalEngine: + def __init__(self) -> None: + self._settings = settings + self._snapshot = EmptySnapshot() + self._kb = EmptyKb() + self._llm = SimpleNamespace(chat=lambda *args, **kwargs: None) + + async def _answer_stock(self, question: str) -> AnswerResult: + return AnswerResult(f"stock:{question}", AnswerScores(1, 1, 1, "low"), {"mode": "stock"}) + + def _get_state(self, conversation_id: str | None) -> None: + del conversation_id + return None + + engine = MinimalEngine() + empty = asyncio.run(answer_workflow.run_answer(engine, " ", mode="custom")) + assert "need a question" in empty.reply + stock = asyncio.run(answer_workflow.run_answer(engine, "hello", mode="stock")) + assert stock.reply == "stock:hello" + + budget_engine = MinimalEngine() + budget_engine._settings = replace(settings, quick_time_budget_sec=0.1) + moments = iter([100.0, 101.0, 101.0]) + monkeypatch.setattr(answer_workflow, "time", SimpleNamespace(monotonic=lambda: next(moments))) + timed_out = asyncio.run(answer_workflow.run_answer(budget_engine, "cluster status", mode="custom")) + assert "ran out of time" in timed_out.reply + assert 
timed_out.meta["time_budget_hit"] is True + + +def test_run_answer_custom_orchestration_edges(monkeypatch: pytest.MonkeyPatch) -> None: + """Exercise run_answer retrieval, tool, subanswer, debug, and persistence branches.""" + + settings = replace(build_test_settings(), debug_pipeline=True) + summary = { + "nodes": {"total": 2, "ready": 1, "not_ready": 1}, + "hardware_by_node": {"titan-01": "rpi5"}, + "namespace_pods": [{"namespace": "synapse", "pods_total": 3}], + } + summary_lines = ["nodes_total: 2", "namespace_cpu_top: synapse=95", "pvc_usage_top: data=88"] + + class FakeSnapshot: + def get(self) -> dict[str, Any]: + return {"snapshot": True} + + class FakeKb: + def summary(self) -> str: + return "KB summary." + + def runbook_titles(self, limit: int = 6) -> str: + del limit + return "Relevant runbooks:\n- Fix (runbooks/fix.md)" + + def runbook_paths(self, limit: int = 10) -> list[str]: + del limit + return ["runbooks/fix.md"] + + def chunk_lines(self, max_files: int = 4, max_chars: int = 800) -> list[str]: + del max_files, max_chars + return ["KB File: ops.md", "namespace_cpu_top: synapse=95"] + + class PromptLLM: + async def chat(self, messages: list[dict[str, str]], *, model: str | None = None, timeout_sec: float | None = None) -> str: + del model, timeout_sec + prompt = messages[-1]["content"] + if "normalized (string), keywords" in prompt: + return json.dumps( + { + "normalized": "How many namespace pods running postgres connections pvc storage ready baseline cpu?", + "keywords": ["namespace", "pods", "postgres", "pvc", "ready", "baseline", "cpu"], + } + ) + if "needs_snapshot (bool)" in prompt: + return '{"needs_snapshot":true,"needs_kb":true,"needs_tool":true,"answer_style":"direct","follow_up":false,"question_type":"open_ended","focus_entity":"unknown","focus_metric":"unknown"}' + if "Generate up to" in prompt: + return '[{"question":"Which namespace pods are running?","priority":2},{"question":"What postgres connections are ready?","priority":1}]' + if "command" in prompt and "rationale" in prompt: + return '{"command":"kubectl top pods -n synapse","rationale":"check cpu"}' + if "Answer the sub-question using the context" in prompt: + return "subanswer" + return "{}" + + class WorkflowEngine: + def __init__(self) -> None: + self._settings = settings + self._snapshot = FakeSnapshot() + self._kb = FakeKb() + self._llm = PromptLLM() + self.stored = False + + def _get_state(self, conversation_id: str | None) -> None: + del conversation_id + return None + + def _store_state(self, conversation_id: str, claims: list[ClaimItem], summary_arg: dict[str, Any], snapshot: dict[str, Any], pin_snapshot: bool) -> None: + assert conversation_id == "conv" + assert claims and summary_arg and snapshot and pin_snapshot + self.stored = True + + plan = replace( + answer_common._mode_plan(settings, "custom"), + use_raw_snapshot=True, + use_deep_retrieval=True, + use_tool=True, + parallelism=1, + subanswer_retries=1, + ) + + async def fake_select_metric_chunks(*_args: Any, **_kwargs: Any) -> tuple[list[str], list[str]]: + return ["namespace_cpu_top"], ["c0"] + + async def fake_score_chunks(*_args: Any, **_kwargs: Any) -> list[dict[str, Any]]: + return [{"id": "c0", "score": 99, "reason": "match"}] + + async def fake_select_fact_lines(*_args: Any, **_kwargs: Any) -> list[str]: + return ["namespace_cpu_top: synapse=95"] + + async def fake_extract_fact_types(*_args: Any, **_kwargs: Any) -> list[str]: + return ["cpu"] + + async def fake_derive_signals(*_args: Any, **_kwargs: Any) -> list[str]: + return 
["cpu"] + + async def fake_scan_chunk_for_signals(*_args: Any, **_kwargs: Any) -> list[str]: + return ["namespace_cpu_top: synapse=95"] + + async def fake_prune_metric_candidates(*_args: Any, **_kwargs: Any) -> list[str]: + return ["namespace_cpu_top: synapse=95"] + + async def fake_finalize_answer(**_kwargs: Any) -> tuple[str, AnswerScores, list[ClaimItem]]: + return ( + "final answer", + AnswerScores(90, 91, 92, "low"), + [ClaimItem(id="c1", claim="synapse high", evidence=[EvidenceItem(path="namespace_cpu_top", reason="test")])], + ) + + monkeypatch.setattr(answer_workflow, "_mode_plan", lambda _settings, _mode: plan) + monkeypatch.setattr(answer_workflow, "build_summary", lambda _snapshot: summary) + monkeypatch.setattr(answer_workflow, "_summary_lines", lambda _snapshot: summary_lines) + monkeypatch.setattr(answer_workflow, "_raw_snapshot_chunks", lambda _snapshot: [{"id": "raw", "text": "raw_fact: 1", "summary": "raw"}]) + monkeypatch.setattr(answer_workflow, "_spine_from_summary", lambda _summary: {}) + monkeypatch.setattr(answer_workflow, "route_intent", lambda _question: SimpleNamespace(kind="nodes_count")) + monkeypatch.setattr(answer_workflow, "_select_metric_chunks", fake_select_metric_chunks) + monkeypatch.setattr(answer_workflow, "_score_chunks", fake_score_chunks) + monkeypatch.setattr(answer_workflow, "_select_chunks", lambda _chunks, _scored, _plan, _tokens, _must: [{"id": "c0", "text": "namespace_cpu_top: synapse=95", "summary": "namespace cpu"}]) + monkeypatch.setattr(answer_workflow, "_collect_fact_candidates", lambda _selected, limit: ["namespace_cpu_top: synapse=95"]) + monkeypatch.setattr(answer_workflow, "_select_fact_lines", fake_select_fact_lines) + monkeypatch.setattr(answer_workflow, "_extract_fact_types", fake_extract_fact_types) + monkeypatch.setattr(answer_workflow, "_derive_signals", fake_derive_signals) + monkeypatch.setattr(answer_workflow, "_scan_chunk_for_signals", fake_scan_chunk_for_signals) + monkeypatch.setattr(answer_workflow, "_prune_metric_candidates", fake_prune_metric_candidates) + monkeypatch.setattr(answer_workflow, "finalize_answer", fake_finalize_answer) + + engine = WorkflowEngine() + observed: list[tuple[str, str]] = [] + result = asyncio.run( + answer_workflow.run_answer( + engine, + "Run limitless cluster status", + mode="custom", + observer=lambda stage, note: observed.append((stage, note)), + conversation_id="conv", + snapshot_pin=True, + ) + ) + + assert result.reply == "final answer" + assert result.meta["tool_hint"] == {"command": "kubectl top pods -n synapse", "rationale": "check cpu"} + assert engine.stored is True + stages = [stage for stage, _note in observed] + assert {"normalize", "route", "decompose", "retrieve", "tool", "subanswers", "synthesize"} <= set(stages) + + +def test_run_answer_factsheet_and_spine_shortcuts(monkeypatch: pytest.MonkeyPatch) -> None: + """Cover fact-sheet observer paths, falsey KB handling, and fast spine returns.""" + + settings = build_test_settings() + + class FactSnapshot: + def get(self) -> dict[str, Any]: + return {"snapshot": True} + + class FactKb: + def __init__(self, enabled: bool = True) -> None: + self.enabled = enabled + + def __bool__(self) -> bool: + return self.enabled + + def summary(self) -> str: + return "KB summary." 
+ + def runbook_titles(self, limit: int = 6) -> str: + del limit + return "" + + def runbook_paths(self, limit: int = 10) -> list[str]: + del limit + return [] + + def chunk_lines(self, max_files: int = 4, max_chars: int = 800) -> list[str]: + del max_files, max_chars + return ["runbook: atlas"] + + class FactLLM: + async def chat(self, messages: list[dict[str, str]], *, model: str | None = None, timeout_sec: float | None = None) -> str: + del messages, model, timeout_sec + return "fact sheet reply" + + class FactEngine: + def __init__(self, kb: FactKb) -> None: + self._settings = settings + self._snapshot = FactSnapshot() + self._kb = kb + self._llm = FactLLM() + + def _get_state(self, conversation_id: str | None) -> None: + del conversation_id + return None + + monkeypatch.setattr(answer_workflow, "build_summary", lambda _snapshot: {"nodes": {"total": 2, "ready": 1, "not_ready": 1}}) + monkeypatch.setattr(answer_workflow, "_summary_lines", lambda _snapshot: ["nodes_total:2,ready=1,not_ready=1", "namespace_cpu_top: synapse=95"]) + + observed: list[tuple[str, str]] = [] + heuristic = asyncio.run( + answer_workflow.run_answer( + FactEngine(FactKb(True)), + "How many ready nodes are there?", + mode="smart", + observer=lambda stage, note: observed.append((stage, note)), + ) + ) + assert "1 ready nodes out of 2 total" in heuristic.reply + + fact_reply = asyncio.run( + answer_workflow.run_answer( + FactEngine(FactKb(False)), + "Give cluster health", + mode="smart", + observer=lambda stage, note: observed.append((stage, note)), + ) + ) + assert fact_reply.reply == "fact sheet reply" + assert ("factsheet", "building fact sheet") in observed + assert ("quick", "answering from fact sheet") in observed + + class SpineLLM: + async def chat(self, messages: list[dict[str, str]], *, model: str | None = None, timeout_sec: float | None = None) -> str: + del model, timeout_sec + prompt = messages[-1]["content"] + if "normalized (string), keywords" in prompt: + return '{"normalized":"How many nodes?","keywords":["nodes"]}' + if "needs_snapshot (bool)" in prompt: + return '{"needs_snapshot":true,"needs_kb":false,"needs_tool":false,"answer_style":"direct","follow_up":false,"question_type":"open_ended","focus_entity":"unknown","focus_metric":"unknown"}' + return "{}" + + spine_engine = FactEngine(FactKb(True)) + spine_engine._llm = SpineLLM() + monkeypatch.setattr(answer_workflow, "_spine_from_summary", lambda _summary: {}) + monkeypatch.setattr(answer_workflow, "route_intent", lambda _question: SimpleNamespace(kind="nodes_count")) + spine_reply = asyncio.run(answer_workflow.run_answer(spine_engine, "Run limitless how many nodes?", mode="fast")) + assert spine_reply.reply == "nodes_total:2,ready=1,not_ready=1" diff --git a/tests/test_support_modules.py b/tests/test_support_modules.py new file mode 100644 index 0000000..22a28b6 --- /dev/null +++ b/tests/test_support_modules.py @@ -0,0 +1,1424 @@ +"""Coverage-oriented tests for support modules and render helpers.""" + +from __future__ import annotations + +import asyncio +import importlib +import json +import logging as pylogging +from dataclasses import replace +from pathlib import Path +from types import SimpleNamespace +from typing import Any + +import httpx +import pytest +from fastapi.testclient import TestClient +from nats.js.errors import NotFoundError + +import atlasbot +import atlasbot.api +import atlasbot.engine +import atlasbot.engine.answerer +import atlasbot.knowledge +import atlasbot.llm +import atlasbot.matrix +import atlasbot.queue +import 
atlasbot.snapshot +from atlasbot.api.http import Api, AnswerRequest, _extract_question +from atlasbot.config import ( + MatrixBotConfig, + Settings, + _env_bool, + _env_float, + _env_int, + _load_matrix_bots, + load_settings, +) +import atlasbot.engine.answerer.common as answer_common +import atlasbot.engine.answerer.factsheet as answer_factsheet +import atlasbot.engine.answerer.post as answer_post +import atlasbot.engine.answerer.post_ext as answer_post_ext +import atlasbot.engine.answerer.retrieval as answer_retrieval +import atlasbot.engine.answerer.retrieval_ext as answer_retrieval_ext +import atlasbot.engine.answerer.spine as answer_spine +from atlasbot.engine.answerer import AnswerResult, AnswerScores +from atlasbot.engine.answerer._base import ClaimItem, ContradictionContext, EvidenceItem, InsightGuardInput, ScoreContext +from atlasbot.engine.intent_router import route_intent +from atlasbot.knowledge.loader import KnowledgeBase +from atlasbot.llm.client import LLMClient, build_messages, parse_json +from atlasbot.logging import JsonFormatter, configure_logging +from atlasbot.main import result_scores +from atlasbot.matrix.bot import MatrixBot, MatrixClient, _extract_mode, _mode_timeout_sec +from atlasbot.queue.nats import QueueManager +from atlasbot.snapshot.builder import core_a, core_b, format_a, format_b, format_c, summary_text +from atlasbot.state.store import ClaimStore, _safe_json +from testing.fakes import build_test_settings + + +def _rich_snapshot() -> dict[str, Any]: + return { + "collected_at": "2026-04-10T12:00:00Z", + "snapshot_version": "v1", + "summary": { + "signals": [ + {"scope": "node", "target": "titan-01", "metric": "cpu", "current": 90, "delta_pct": 15, "severity": "warn"} + ], + "profiles": { + "nodes": [ + {"node": "titan-01", "load_index": 0.9, "cpu": 90, "ram": 80, "pods_total": 10, "hardware": "rpi5"} + ], + "namespaces": [ + {"namespace": "synapse", "pods_total": 5, "cpu_usage": 40, "mem_usage": 50, "primary_node": "titan-01"} + ], + "workloads": [ + {"namespace": "synapse", "workload": "matrix", "pods_total": 3, "pods_running": 3, "primary_node": "titan-01"} + ], + }, + "inventory": {"workers": {"total": 2, "ready": 1}}, + "topology": { + "nodes": [{"name": "titan-01", "role": "worker"}], + "workloads": [{"name": "matrix", "node": "titan-01"}], + "namespaces": [{"name": "synapse", "pods": 5}], + }, + "lexicon": { + "terms": [{"term": "atlas", "meaning": "Atlas cluster"}], + "aliases": {"atlasbot": "atlas"}, + }, + "cross_stats": { + "node_metric_top": [ + { + "metric": "cpu", + "node": "titan-01", + "value": 90, + "cpu": 90, + "ram": 80, + "net": 2.5, + "io": 1.5, + "pods_total": 10, + } + ], + "namespace_metric_top": [ + { + "metric": "cpu", + "namespace": "synapse", + "value": 40, + "cpu": 40, + "ram": 50, + "net": 1.5, + "io": 1.0, + "pods_total": 5, + } + ], + "pvc_top": [{"metric": "usage", "namespace": "synapse", "pvc": "data", "value": 95}], + }, + "baseline_deltas": { + "nodes": { + "cpu": [{"node": "titan-01", "delta": 10, "severity": "warn"}], + "ram": [{"node": "titan-01", "delta": 5}], + }, + "namespaces": { + "pods": [{"namespace": "synapse", "delta": 8, "severity": "high"}], + }, + }, + "pod_issue_summary": { + "waiting_reasons_top": [{"reason": "ImagePullBackOff", "count": 3}], + "phase_reasons_top": [{"reason": "Pending", "count": 2}], + "namespace_issue_top": {"waiting": [{"namespace": "synapse", "value": 2}]}, + }, + "trend_requests": {}, + "pod_waiting_trends": {}, + "pod_terminated_trends": {}, + }, + "nodes_summary": { + "total": 2, + 
"ready": 1, + "not_ready": 1, + "not_ready_names": ["titan-02"], + "by_arch": {"rpi5": 1, "amd64": 1}, + "by_role": {"worker": 2}, + "workers": {"total": 2, "ready": 1}, + "pressure_nodes": {"names": ["titan-02"]}, + }, + "nodes_detail": [ + { + "name": "titan-01", + "hardware": "rpi5", + "arch": "arm64", + "os": "linux", + "kubelet": "1.30", + "kernel": "6.8", + "container_runtime": "containerd", + "is_worker": True, + "roles": ["worker"], + "age_hours": 12, + "taints": [{"key": "dedicated", "effect": "NoSchedule"}], + }, + { + "name": "titan-02", + "hardware": "amd64", + "arch": "amd64", + "os": "linux", + "kubelet": "1.30", + "kernel": "6.8", + "container_runtime": "containerd", + "is_worker": True, + "roles": ["worker"], + "age_hours": 24, + "taints": [{"key": "pressure", "effect": "NoExecute"}], + }, + ], + "metrics": { + "node_load": [ + {"node": "titan-01", "load_index": 0.9, "cpu": 90, "ram": 80, "net": 100, "io": 50}, + {"node": "titan-02", "load_index": 0.4, "cpu": 30, "ram": 20, "net": 10, "io": 5}, + ], + "pods_running": 12, + "pods_pending": 1, + "pods_failed": 2, + "pods_succeeded": 3, + "capacity_cpu": 8, + "allocatable_cpu": 7, + "capacity_mem_bytes": 8 * 1024 * 1024 * 1024, + "allocatable_mem_bytes": 6 * 1024 * 1024 * 1024, + "capacity_pods": 110, + "allocatable_pods": 100, + "namespace_cpu_top": [{"metric": {"namespace": "synapse"}, "value": 95}], + "namespace_mem_top": [{"metric": {"namespace": "synapse"}, "value": 1024 * 1024}], + "namespace_cpu_requests_top": [{"metric": {"namespace": "synapse"}, "value": 50}], + "namespace_mem_requests_top": [{"metric": {"namespace": "synapse"}, "value": 2 * 1024 * 1024}], + "namespace_net_top": [{"metric": {"namespace": "synapse"}, "value": 1024}], + "namespace_io_top": [{"metric": {"namespace": "synapse"}, "value": 2048}], + "pod_cpu_top": [{"metric": {"namespace": "synapse", "pod": "matrix"}, "value": 3.3}], + "pod_cpu_top_node": [{"metric": {"namespace": "synapse", "pod": "matrix", "node": "titan-01"}, "value": 3.3}], + "pod_mem_top": [{"metric": {"namespace": "synapse", "pod": "matrix"}, "value": 4096}], + "pod_mem_top_node": [{"metric": {"namespace": "synapse", "pod": "matrix", "node": "titan-01"}, "value": 4096}], + "top_restarts_1h": [{"metric": {"namespace": "synapse", "pod": "matrix"}, "value": [0, 4]}], + "restart_namespace_top": [{"metric": {"namespace": "synapse"}, "value": 4}], + "job_failures_24h": [{"metric": {"namespace": "synapse", "job_name": "backup"}, "value": 2}], + "node_pods_top": [{"node": "titan-01", "pods_total": 5, "namespaces": [{"name": "synapse", "count": 3}]}], + "postgres_connections": {"used": 5, "max": 10, "hottest_db": {"label": "synapse", "value": 3}}, + "node_usage": { + "cpu": [{"node": "titan-01", "value": 90}], + "ram": [{"node": "titan-02", "value": 70}], + "net": [{"node": "titan-02", "value": 2}], + "io": [{"node": "titan-01", "value": 0.5}], + "disk": [{"node": "titan-01", "value": 80}], + }, + "node_load_summary": { + "top": [{"node": "titan-01", "load_index": 0.9, "cpu": 90, "ram": 80, "io": 1.5, "net": 2.5, "pods_total": 10}], + "outliers": [{"node": "titan-02"}], + }, + "hardware_usage_avg": [ + {"hardware": "rpi5", "load_index": 0.9, "cpu": 90, "ram": 80, "io": 1.5, "net": 2.5}, + ], + "namespace_capacity_summary": { + "cpu_ratio_top": [ + {"namespace": "synapse", "cpu_usage_ratio": 0.8, "cpu_usage": 40, "cpu_requests": 50} + ], + "mem_ratio_top": [ + {"namespace": "synapse", "mem_usage_ratio": 0.7, "mem_usage": 70, "mem_requests": 100} + ], + "cpu_headroom_low": [{"namespace": 
"synapse", "headroom": 0.2}], + "mem_headroom_low": [{"namespace": "synapse", "headroom": 0.3}], + "cpu_overcommitted": 1, + "mem_overcommitted": 0, + "cpu_overcommitted_names": ["synapse"], + "mem_overcommitted_names": [], + }, + "namespace_capacity": [{"namespace": "synapse", "cpu": 1, "mem": 2}], + "units": {"cpu_pct": "%", "ram_pct": "%", "net": "bytes/s"}, + "windows": {"rates": "5m", "restarts": "1h"}, + }, + "namespace_pods": [{"namespace": "synapse", "pods_total": 5, "pods_running": 4}], + "namespace_nodes": [{"namespace": "synapse", "pods_total": 5, "primary_node": "titan-01"}], + "node_pods": [{"node": "titan-01", "pods_total": 5, "namespaces": [{"name": "synapse", "count": 3}]}], + "pod_issues": { + "counts": {"Failed": 2, "Pending": 1, "Unknown": 0}, + "top": [{"namespace": "synapse", "pod": "matrix", "phase": "Pending", "age_hours": 2}], + "pending_oldest": [{"namespace": "synapse", "pod": "matrix", "age_hours": 2}], + "waiting_reasons_top": [{"reason": "ImagePullBackOff", "count": 3}], + "pending_over_15m": 1, + "waiting_reasons": {"ImagePullBackOff": 3}, + }, + "workloads_health": { + "deployments": {"ready": 2, "not_ready": 1, "desired": 3}, + "statefulsets": {"ready": 1, "not_ready": 0, "desired": 1}, + "daemonsets": {"ready": 1, "not_ready": 0, "desired": 1}, + }, + "events": { + "warnings_top_reason": {"ImagePullBackOff": 3}, + "warnings_latest": [{"reason": "FailedScheduling", "count": 2}], + "warnings_total": 5, + }, + "jobs": { + "totals": {"total": 4, "active": 1, "failed": 1, "succeeded": 2}, + "failing": [{"namespace": "synapse", "job_name": "backup", "failed": 1}], + "active_oldest": [{"namespace": "synapse", "job_name": "backup", "age_minutes": 30}], + }, + "postgres": { + "used": 5, + "max": 10, + "hottest_db": {"label": "synapse", "value": 3}, + "by_db": [{"label": "synapse", "value": 3}], + }, + "hottest": { + "cpu": {"node": "titan-01", "value": 90}, + "ram": {"node": "titan-02", "value": 70}, + "net": {"node": "titan-02", "value": 2}, + "io": {"node": "titan-01", "value": 0.5}, + "disk": {"node": "titan-01", "value": 80}, + }, + "pvc_usage_top": [{"namespace": "synapse", "pvc": "data", "value": 95}], + "root_disk_low_headroom": [{"node": "titan-01", "headroom_pct": 20, "used_pct": 80}], + "longhorn": { + "total": 2, + "attached_count": 1, + "detached_count": 1, + "degraded_count": 0, + "by_state": {"attached": 1, "detached": 1}, + "by_robustness": {"healthy": 1, "degraded": 1}, + "unhealthy": [{"name": "vol1", "state": "detached", "robustness": "degraded"}], + }, + "workloads": [{"namespace": "synapse", "name": "matrix", "pods_total": 3, "pods_running": 3}], + "flux": { + "ready": 1, + "not_ready": 1, + "items": [{"kind": "HelmRelease", "name": "matrix", "status": "Ready"}], + }, + } + + +def test_package_imports() -> None: + """Import package shims so their `__init__` modules stay covered.""" + + importlib.import_module("atlasbot") + importlib.import_module("atlasbot.api") + importlib.import_module("atlasbot.engine") + importlib.import_module("atlasbot.engine.answerer") + importlib.import_module("atlasbot.knowledge") + importlib.import_module("atlasbot.llm") + importlib.import_module("atlasbot.matrix") + importlib.import_module("atlasbot.queue") + importlib.import_module("atlasbot.snapshot") + assert atlasbot.snapshot.__name__ == "atlasbot.snapshot" + + +def test_config_helpers_and_load_settings(monkeypatch: pytest.MonkeyPatch) -> None: + """Exercise config parsing branches and matrix bot loading.""" + + monkeypatch.setenv("BOOL_ONE", "yes") + 
monkeypatch.setenv("INT_BAD", "nope") + monkeypatch.setenv("FLOAT_BAD", "nope") + assert _env_bool("BOOL_ONE") + assert _env_int("INT_BAD", "7") == 7 + assert _env_float("FLOAT_BAD", "2.5") == 2.5 + monkeypatch.setenv("BOT_USER_QUICK", "quick") + monkeypatch.setenv("BOT_PASS_QUICK", "pw") + monkeypatch.setenv("BOT_USER_SMART", "smart") + monkeypatch.setenv("BOT_PASS_SMART", "pw") + settings = load_settings() + assert settings.matrix_bots[0].mode == "quick" + assert settings.matrix_bots[1].mode == "smart" + monkeypatch.delenv("BOT_USER_QUICK", raising=False) + monkeypatch.delenv("BOT_PASS_QUICK", raising=False) + monkeypatch.delenv("BOT_USER_SMART", raising=False) + monkeypatch.delenv("BOT_PASS_SMART", raising=False) + monkeypatch.setenv("BOT_USER", "atlasbot") + monkeypatch.setenv("BOT_PASS", "legacy") + legacy = _load_matrix_bots(("atlasbot",)) + assert legacy and legacy[0].mode == "" + + +def test_knowledge_base_helpers(tmp_path: Path, caplog: pytest.LogCaptureFixture) -> None: + """Read KB data, titles, paths, and prompt chunks from a temp catalog.""" + + base = tmp_path / "kb" + catalog = base / "catalog" + catalog.mkdir(parents=True) + (catalog / "atlas.json").write_text( + json.dumps({"cluster": "titan", "sources": [{"name": "docs"}], "extra": True}), + encoding="utf-8", + ) + (catalog / "runbooks.json").write_text(json.dumps([{"title": "Fix", "path": "runbooks/fix.md"}]), encoding="utf-8") + (base / "notes.md").write_text("hello atlas", encoding="utf-8") + kb = KnowledgeBase(str(base)) + assert "Cluster: titan." in kb.summary() + assert "Relevant runbooks" in kb.runbook_titles(limit=1) + assert kb.runbook_paths() == ["runbooks/fix.md"] + assert kb.chunk_lines(max_files=1, max_chars=200) + bad = base / "bad" + bad.mkdir() + (bad / "catalog").mkdir() + (bad / "catalog" / "atlas.json").write_text("{broken", encoding="utf-8") + broken = KnowledgeBase(str(bad)) + with caplog.at_level(pylogging.WARNING): + assert broken.summary() == "" + + +def test_llm_client_helpers_and_fallback(monkeypatch: pytest.MonkeyPatch) -> None: + """Exercise message building, JSON parsing, and fallback model logic.""" + + settings = replace( + build_test_settings(), + ollama_url="http://example", + ollama_model="base", + ollama_fallback_model="fallback", + ollama_retries=1, + ) + client = LLMClient(settings) + assert client._endpoint().endswith("/api/chat") + assert build_messages("sys", "prompt", context="ctx")[1]["content"].startswith("Context") + assert parse_json("{\"ok\": true}", fallback={}) == {"ok": True} + + class FakeResponse: + def __init__(self, status_code: int, payload: dict[str, Any]): + self.status_code = status_code + self._payload = payload + + def raise_for_status(self) -> None: + if self.status_code >= 400: + raise httpx.HTTPStatusError("bad", request=httpx.Request("POST", "http://example"), response=httpx.Response(self.status_code)) + + def json(self) -> dict[str, Any]: + return self._payload + + class FakeAsyncClient: + def __init__(self, timeout: float | None = None): + self.timeout = timeout + + async def __aenter__(self) -> FakeAsyncClient: + return self + + async def __aexit__(self, *exc: object) -> None: + return None + + async def post( + self, + _url: str, + *, + json: dict[str, Any], + headers: dict[str, str], + ) -> FakeResponse: + model = json["model"] + assert headers["Content-Type"] == "application/json" + if model == "base": + return FakeResponse(404, {}) + return FakeResponse(200, {"message": {"content": "hello"}}) + + monkeypatch.setattr(httpx, "AsyncClient", 
FakeAsyncClient) + reply = asyncio.run(client.chat([{"role": "user", "content": "hi"}], model=None, timeout_sec=1.0)) + assert reply == "hello" + + +def test_logging_formatter_and_configure() -> None: + """Format a structured record and install JSON logging on the root logger.""" + + formatter = JsonFormatter() + record = pylogging.LogRecord("atlasbot", pylogging.INFO, __file__, 1, "hello %s", ("world",), None) + record.extra = {"mode": "quick"} + payload = json.loads(formatter.format(record)) + assert payload["message"] == "hello world" + assert payload["mode"] == "quick" + configure_logging("debug") + root = pylogging.getLogger() + assert root.handlers and isinstance(root.handlers[0].formatter, JsonFormatter) + + +def test_state_store_roundtrip_and_cleanup(tmp_path: Path, monkeypatch: pytest.MonkeyPatch) -> None: + """Persist, read, and expire a claim payload.""" + + path = tmp_path / "state.db" + store = ClaimStore(str(path), 60) + store.set( + "conv", + { + "snapshot_id": "snap-1", + "claims": [{"id": "c1"}], + "snapshot": {"nodes": 1}, + }, + ) + payload = store.get("conv") + assert payload and payload["snapshot_id"] == "snap-1" + assert payload["claims"] == [{"id": "c1"}] + assert _safe_json("{broken", []) == [] + monkeypatch.setattr("atlasbot.state.store.time.monotonic", lambda: 1_000_000.0) + store.cleanup() + assert store.get("conv") is None + + +@pytest.mark.parametrize( + ("question", "kind"), + [ + ("How many nodes are ready?", "nodes_ready"), + ("How many cluster nodes do we have?", "nodes_count"), + ("Which nodes are not rpi?", "nodes_non_rpi"), + ("What hardware mix do we have?", "hardware_mix"), + ("What is the hottest cpu?", "hottest_cpu"), + ("What is the hottest ram?", "hottest_ram"), + ("How many postgres connections?", "postgres_connections"), + ("Which postgres db is hottest?", "postgres_hottest"), + ("Which namespace has most pods?", "namespace_most_pods"), + ("Is there pressure on the nodes?", "pressure_summary"), + ], +) +def test_intent_router_patterns(question: str, kind: str) -> None: + """Route the main cluster intents into deterministic matches.""" + + match = route_intent(question) + assert match and match.kind == kind + + +def test_api_routes_and_auth() -> None: + """Exercise the HTTP wrapper, token check, and question extraction.""" + + settings = replace(build_test_settings(), internal_token="secret") + + async def handler( + question: str, + mode: str, + _history: list[dict[str, str]] | None, + _conversation_id: str | None, + _snapshot_pin: bool | None, + ) -> AnswerResult: + return AnswerResult( + reply=f"{question}:{mode}", + scores=AnswerScores(confidence=1, relevance=2, satisfaction=3, hallucination_risk="low"), + meta={"mode": mode}, + ) + + api = Api(settings, handler) + client = TestClient(api.app) + assert client.get("/healthz").json() == {"ok": True} + assert client.post("/v1/answer", json={"question": "hi"}).status_code == 401 + assert _extract_question(AnswerRequest(prompt=" hello ")).strip() == "hello" + response = client.post( + "/v1/answer", + headers={"X-Internal-Token": "secret"}, + json={"prompt": "hello", "mode": "SMART", "conversation_id": "conv-1", "snapshot_pin": True}, + ) + assert response.status_code == 200 + assert response.json()["reply"] == "hello:smart" + + +def test_main_and_queue_and_matrix(monkeypatch: pytest.MonkeyPatch) -> None: + """Run the bootstrap path and queueing branch without external services.""" + + from atlasbot import main as main_mod + + settings = replace( + build_test_settings(), + queue_enabled=True, + 
matrix_bots=(MatrixBotConfig("bot", "pw", ("bot",), "quick"),), + ) + + class FakeQueue: + def __init__(self, settings: Settings, handler): + self.settings = settings + self.handler = handler + self.started = False + + async def start(self) -> None: + self.started = True + + async def submit(self, _payload: dict[str, Any]) -> dict[str, Any]: + return { + "reply": "queued", + "scores": {"confidence": 7, "relevance": 8, "satisfaction": 9, "hallucination_risk": "low"}, + } + + class FakeMatrixBot: + def __init__(self, _settings: Settings, _bot: MatrixBotConfig, _engine: Any, answer_handler): + self.answer_handler = answer_handler + + async def run(self) -> None: + result = await self.answer_handler("what is atlas?", "quick", [], "room-1", None) + assert result.reply == "queued" + + class FakeServer: + def __init__(self, config: Any): + self.config = config + + async def serve(self) -> None: + return None + + monkeypatch.setattr(main_mod, "load_settings", lambda: settings) + monkeypatch.setattr(main_mod, "configure_logging", lambda _level: None) + monkeypatch.setattr(main_mod, "QueueManager", FakeQueue) + monkeypatch.setattr(main_mod, "MatrixBot", FakeMatrixBot) + monkeypatch.setattr(main_mod.uvicorn, "Server", FakeServer) + asyncio.run(main_mod.main()) + scores = result_scores({"scores": {"confidence": 10, "relevance": 20, "satisfaction": 30, "hallucination_risk": "low"}}) + assert scores.confidence == 10 + + +def test_matrix_and_queue_and_snapshot_helpers(monkeypatch: pytest.MonkeyPatch) -> None: + """Drive the Matrix client, queue manager, and snapshot renderers.""" + + settings = replace(build_test_settings(), matrix_bots=()) + bot_cfg = MatrixBotConfig("bot", "pw", ("bot",), "quick") + + class FakeResp: + def __init__(self, payload: dict[str, Any], status_code: int = 200): + self._payload = payload + self.status_code = status_code + + def raise_for_status(self) -> None: + if self.status_code >= 400: + raise httpx.HTTPError("bad") + + def json(self) -> dict[str, Any]: + return self._payload + + class FakeAsyncClient: + def __init__(self, timeout: float | None = None): + self.timeout = timeout + + async def __aenter__(self) -> "FakeAsyncClient": + return self + + async def __aexit__(self, *exc: object) -> None: + return None + + async def post(self, url: str, json: dict[str, Any] | None = None, headers: dict[str, str] | None = None) -> FakeResp: + if "login" in url: + return FakeResp({"access_token": "tok"}) + return FakeResp({}) + + async def get(self, url: str, headers: dict[str, str] | None = None, params: dict[str, Any] | None = None) -> FakeResp: + if "directory/room" in url: + return FakeResp({"room_id": "!room"}) + return FakeResp({"next_batch": "n1", "rooms": {"join": {}}}) + + monkeypatch.setattr("atlasbot.matrix.bot.httpx.AsyncClient", FakeAsyncClient) + client = MatrixClient(settings, bot_cfg) + token = asyncio.run(client.login()) + assert token == "tok" + assert asyncio.run(client.resolve_room(token)) == "!room" + asyncio.run(client.join_room(token, "!room")) + asyncio.run(client.send_message(token, "!room", "hello")) + assert asyncio.run(client.sync(token, None))["next_batch"] == "n1" + mode, cleaned = _extract_mode("atlas-smart hello", ("atlas",), "") + assert mode == "smart" + assert cleaned == "-smart hello" + assert _mode_timeout_sec(settings, "smart") == settings.smart_time_budget_sec + + class FakeSub: + async def next_msg(self, timeout: float) -> Any: + return SimpleNamespace(data=json.dumps({"reply": "ok"}).encode(), reply="reply") + + async def unsubscribe(self) -> 
None: + return None + + class FakeMsg: + def __init__(self) -> None: + self.data = json.dumps({"payload": {"question": "q"}}).encode() + self.reply = "reply" + self.acked = False + + async def ack(self) -> None: + self.acked = True + + class FakeJS: + def __init__(self) -> None: + self.streams = [] + + async def stream_info(self, stream: str) -> None: + raise NotFoundError + + async def add_stream(self, **kwargs: Any) -> None: + self.streams.append(kwargs) + + async def publish(self, subject: str, data: bytes) -> None: + self.streams.append({"subject": subject, "data": data}) + + async def pull_subscribe(self, subject: str, durable: str) -> Any: + class Pull: + async def fetch(self, count: int, timeout: float) -> list[FakeMsg]: + raise RuntimeError("stop") + + return Pull() + + class FakeNATS: + def __init__(self) -> None: + self.published = [] + + async def connect(self, url: str) -> None: + return None + + def jetstream(self) -> FakeJS: + return FakeJS() + + def new_inbox(self) -> str: + return "inbox" + + async def subscribe(self, reply: str) -> FakeSub: + return FakeSub() + + async def publish(self, reply: str, data: bytes) -> None: + self.published.append((reply, data)) + + async def drain(self) -> None: + return None + + monkeypatch.setattr("atlasbot.queue.nats.NATS", FakeNATS) + queue_settings = replace(settings, queue_enabled=True, nats_stream="atlasbot", nats_subject="atlasbot.requests") + qm = QueueManager(queue_settings, lambda payload: asyncio.sleep(0, result={"reply": "x"})) + asyncio.run(QueueManager(replace(queue_settings, queue_enabled=False), lambda payload: asyncio.sleep(0, result=payload)).start()) + asyncio.run(qm.start()) + assert asyncio.run(qm.submit({"mode": "quick"})) == {"reply": "ok"} + assert asyncio.run(qm.submit({"mode": "genius"})) == {"reply": "ok"} + + class LoopPull: + def __init__(self) -> None: + self.calls = 0 + + async def fetch(self, count: int, timeout: float) -> list[FakeMsg]: + del count, timeout + self.calls += 1 + if self.calls == 1: + raise RuntimeError("retry") + if self.calls == 2: + return [FakeMsg()] + raise asyncio.CancelledError + + class LoopJS: + async def pull_subscribe(self, subject: str, durable: str) -> LoopPull: + del subject, durable + return LoopPull() + + qm._js = LoopJS() + with pytest.raises(asyncio.CancelledError): + asyncio.run(qm._worker_loop()) + asyncio.run(qm.stop()) + + snapshot = _rich_snapshot() + summary = core_a.build_summary(snapshot) + assert summary["nodes"]["total"] == 2 + text = summary_text(snapshot) + assert "atlas_cluster:" in text + assert "hardware_usage_avg:" in text + assert "signals:" in text + assert "node_profiles:" in text + assert "flux:" in text or "flux" in text + + lines: list[str] = [] + format_a._append_nodes(lines, summary) + format_a._append_hardware(lines, summary) + format_a._append_hardware_groups(lines, summary) + format_a._append_node_ages(lines, summary) + format_a._append_node_taints(lines, summary) + format_a._append_node_facts(lines, summary) + format_a._append_pressure(lines, summary) + format_a._append_pods(lines, summary) + format_a._append_capacity(lines, summary) + format_a._append_namespace_pods(lines, summary) + format_a._append_namespace_nodes(lines, summary) + format_a._append_node_pods(lines, summary) + format_a._append_pod_issues(lines, summary) + format_a._append_workload_health(lines, summary) + format_a._append_node_usage_stats(lines, summary) + format_a._append_events(lines, summary) + format_a._append_pvc_usage(lines, summary) + 
format_a._append_root_disk_headroom(lines, summary) + format_b._append_longhorn(lines, summary) + format_b._append_namespace_usage(lines, summary) + format_b._append_namespace_requests(lines, summary) + format_b._append_namespace_io_net(lines, summary) + format_b._append_pod_usage(lines, summary) + format_b._append_restarts(lines, summary) + format_b._append_job_failures(lines, summary) + format_b._append_jobs(lines, summary) + format_b._append_postgres(lines, summary) + format_b._append_hottest(lines, summary) + format_b._append_workloads(lines, summary) + format_b._append_topology(lines, summary) + format_b._append_flux(lines, summary) + format_c._append_signals(lines, summary) + format_c._append_profiles(lines, summary) + format_c._append_units_windows(lines, summary) + format_c._append_node_load_summary(lines, summary) + format_c._append_hardware_usage(lines, summary) + format_c._append_cluster_watchlist(lines, summary) + format_c._append_baseline_deltas(lines, summary) + format_c._append_pod_issue_summary(lines, summary) + format_c._append_workloads_by_namespace(lines, summary) + format_c._append_lexicon(lines, summary) + format_c._append_cross_stats(lines, summary) + assert any(line.startswith("nodes:") for line in lines) + assert any(line.startswith("longhorn:") for line in lines) + assert any(line.startswith("signals:") for line in lines) + + core_b_summary = core_b._build_hottest(snapshot["metrics"]) + assert core_b_summary["hottest"]["cpu"]["node"] == "titan-01" + + +def test_matrix_bot_sync_and_heartbeat() -> None: + """Drive the Matrix bot heartbeat and sync handlers with a fake client.""" + + settings = replace(build_test_settings(), thinking_interval_sec=0.001) + bot_cfg = MatrixBotConfig("bot", "pw", ("atlas",), "quick") + + class FakeClient: + def __init__(self) -> None: + self.sent: list[str] = [] + + async def login(self) -> str: + return "tok" + + async def resolve_room(self, token: str) -> str: + return "!room" + + async def join_room(self, token: str, room_id: str) -> None: + return None + + async def send_message(self, token: str, room_id: str, text: str) -> None: + self.sent.append(text) + + async def sync(self, token: str, since: str | None) -> dict[str, Any]: + return { + "next_batch": "n1", + "rooms": { + "join": { + "!room": { + "timeline": { + "events": [ + {"type": "m.room.message", "sender": "user", "content": {"body": "atlas quick what is atlas?"}}, + {"type": "m.room.message", "sender": "bot", "content": {"body": "ignored"}}, + ] + } + } + } + }, + } + + async def answer_handler(question: str, mode: str, history, conversation_id, observer): + if observer: + observer("stage", "working") + return AnswerResult( + reply="Atlas has 22 nodes", + scores=AnswerScores(confidence=1, relevance=2, satisfaction=3, hallucination_risk="low"), + meta={"mode": mode}, + ) + + bot = MatrixBot(settings, bot_cfg, SimpleNamespace(answer=lambda *args, **kwargs: None), answer_handler) + bot._client = FakeClient() + asyncio.run(bot._answer_with_heartbeat("tok", "!room", "What is Atlas?", "quick")) + payload = { + "rooms": { + "join": { + "!room": { + "timeline": { + "events": [ + {"type": "m.room.message", "sender": "user", "content": {"body": "atlas smart hello"}} + ] + } + } + } + } + } + asyncio.run(bot._handle_sync("tok", payload)) + assert bot._client.sent + + +def test_answerer_helper_coverage_smoke() -> None: + """Exercise the split answerer helpers with representative inputs.""" + + settings = build_test_settings() + plan = answer_common._mode_plan(settings, "smart") + 
fast_plan = replace(plan, parallelism=2, score_retries=2, chunk_group=1, chunk_top=2, max_subquestions=2) + snapshot = _rich_snapshot() + summary = core_a.build_summary(snapshot) + summary_lines = answer_spine._summary_lines(snapshot) + rich_lines = [ + "nodes_total: 2", + "nodes_ready: 1", + "cluster_name: atlas", + "pods_total: 3", + "cpu: 90", + "ram: 80", + "runbooks/fix.md", + ] + + class ScriptedLLM: + async def __call__( + self, + _system: str, + _prompt: str, + *, + context: str | None = None, + model: str | None = None, + tag: str = "", + ) -> str: + responses = { + "chunk_score": '[{"id":"c1","score":1},{"id":"c2","score":2}]', + "chunk_select": '{"selected_index": 1}', + "metric_keys": '{"keys":["nodes_total","pods_total"]}', + "metric_keys_validate": '{"missing":["pods_total"]}', + "fact_types": '{"fact_types":["nodes_total","pods_total"]}', + "fact_types_select": '{"best": 1}', + "signals": '{"signals":["cpu","ram"]}', + "signals_select": '{"best": 1}', + "chunk_scan": '{"lines":["cpu: 90"]}', + "chunk_scan_select": '{"best": 1}', + "fact_prune": '{"lines":["cpu: 90"]}', + "fact_prune_select": '{"best": 1}', + "fact_select": '{"lines":["cpu: 90"]}', + "fact_select_best": '{"best": 1}', + "contradiction": '{"use_facts": false, "confidence": 99}', + "insight_guard": '{"ok": false}', + "insight_fix": "fixed insight", + } + return responses.get(tag, "{}") + + scripted_llm = ScriptedLLM() + chunks = [ + {"id": "c1", "text": "nodes_total: 2\npods_total: 3", "summary": "nodes"}, + {"id": "c2", "text": "cpu: 90\nram: 80", "summary": "cpu"}, + ] + groups = answer_common._build_chunk_groups(chunks, 1) + scores = asyncio.run(answer_common._score_chunks(scripted_llm, chunks, "How many nodes?", ["nodes"], fast_plan)) + serial_ctx = ScoreContext(question="How many nodes?", sub_questions=["nodes"], retries=2, parallelism=1, select_best=True, fast_model="fast") + serial_scores = asyncio.run(answer_common._score_groups_serial(scripted_llm, groups, serial_ctx)) + parallel_ctx = ScoreContext(question="How many nodes?", sub_questions=["nodes"], retries=2, parallelism=2, select_best=True, fast_model="fast") + parallel_scores = asyncio.run(answer_common._score_groups_parallel(scripted_llm, groups, parallel_ctx)) + best_run = asyncio.run(answer_common._select_best_score_run(scripted_llm, groups[0], [{"c1": 1.0}, {"c1": 2.0}], serial_ctx)) + selected = answer_common._select_chunks(chunks, {"c1": 0.2, "c2": 0.9}, replace(fast_plan, chunk_top=2), ["cpu"], ["c2"]) + assert scores and serial_scores and parallel_scores and best_run and selected + assert answer_common._strip_followup_meta("The draft is correct. Atlas is healthy.") == "Atlas is healthy." 
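+    # The remaining assertions sweep the split helper modules (common,
+    # factsheet, post, post_ext, retrieval, retrieval_ext, spine) with one
+    # representative input each; ScriptedLLM keeps the async paths deterministic.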
+ assert answer_common._llm_call_limit(settings, "smart") == settings.smart_llm_calls_max + assert answer_common._mode_time_budget(settings, "quick") == settings.quick_time_budget_sec + assert answer_common._select_subquestions([], "fallback", 2) == ["fallback"] + assert answer_common._chunk_lines(["a", "b", "c"], 2) + assert answer_common._raw_snapshot_chunks(snapshot) + assert answer_common._format_runbooks(["runbooks/fix.md"]) + assert answer_common._keyword_hits([{"text": "cpu usage"}], {"text": "cpu usage"}, ["cpu"]) + assert answer_factsheet._factsheet_kb_chars("quick", 10) + assert answer_factsheet._factsheet_line_limit("smart") >= 1 + assert answer_factsheet._factsheet_instruction("quick") + assert answer_factsheet._factsheet_model("genius", fast_plan) == fast_plan.model + assert answer_factsheet._is_plain_math_question("2+2") + assert answer_factsheet._quick_fact_sheet_lines("How many nodes?", rich_lines, ["kb"], limit=4) + assert answer_factsheet._quick_fact_sheet_text(["nodes_total: 2"]) + assert answer_factsheet._quick_fact_sheet_heuristic_answer("How many ready nodes?", ["nodes_total:2,ready:1,not_ready:0"]) + assert answer_factsheet._json_excerpt(summary) + assert answer_post._strip_unknown_entities("node titan-99 is hot. Atlas is healthy.", ["titan-99"], []) == "Atlas is healthy." + assert answer_post._needs_evidence_guard("node titan-99 is hot.", ["node titan-01"]) is True + contradiction = asyncio.run( + answer_post._contradiction_decision( + ContradictionContext(scripted_llm, "why", "draft", ["fact"], fast_plan), + attempts=2, + ) + ) + assert contradiction["confidence"] == 99 + assert answer_post._format_direct_metric_line("nodes_total: 2") + assert answer_post._global_facts(["nodes_total: 2", "other: 1"]) + assert answer_post._has_keyword_overlap(["cpu usage"], ["cpu"]) + assert answer_post._merge_tokens(["a"], ["b"], ["c"]) == ["a", "b", "c"] + assert answer_post._extract_question_tokens("How many nodes?") + assert answer_post._expand_tokens(["nodes_total"]) + assert answer_post._ensure_token_coverage(["nodes_total: 2"], ["pods"], ["pods_total: 3"], max_add=1) + assert answer_post._best_keyword_line(["cpu: 90"], ["cpu"]) == "cpu: 90" + assert answer_post._line_starting_with(["cpu: 90"], "cpu") + assert answer_post._non_rpi_nodes({"hardware_by_node": {"titan-01": "rpi5", "titan-02": "amd64"}}) == {"amd64": ["titan-02"]} + assert answer_post._format_hardware_groups({"amd64": ["titan-02"]}, "Nodes") + assert answer_post._lexicon_context({"lexicon": {"terms": [{"term": "atlas", "meaning": "cluster"}], "aliases": {"bot": "atlas"}}}) + assert answer_post._parse_json_block("{\"ok\": true}", fallback={}) == {"ok": True} + assert answer_post._parse_json_list("[{\"ok\": true}]") == [{"ok": True}] + assert answer_post._scores_from_json({"confidence": "1", "relevance": 2, "satisfaction": 3, "hallucination_risk": "low"}).confidence == 1 + assert answer_post._coerce_int("4", 1) == 4 + assert answer_post._default_scores().hallucination_risk == "medium" + assert answer_post._style_hint({"answer_style": "insightful"}) == "insightful" + assert answer_post._needs_evidence_fix("we don't know", {"needs_snapshot": True}) is True + assert answer_post._should_use_insight_guard({"answer_style": "insightful"}) + insight_inputs = InsightGuardInput( + question="why", + reply="Insightful reply", + classify={"answer_style": "insightful", "question_type": "open_ended"}, + context="", + plan=fast_plan, + call_llm=scripted_llm, + facts=["fact"], + ) + assert 
asyncio.run(answer_post._apply_insight_guard(insight_inputs)) + assert answer_post_ext._reply_matches_metric_facts("nodes_total: 2", ["nodes_total: 2"]) + assert answer_post_ext._needs_dedup("one. one. one.") + answer_post_ext._needs_focus_fix("how many nodes", "For more details. Additional context.", {"question_type": "metric"}) + assert answer_post_ext._extract_keywords("How many nodes?", "How many nodes?", ["pods"], ["nodes"]) + assert answer_post_ext._allowed_nodes(summary) + assert answer_post_ext._allowed_namespaces(summary) + assert answer_post_ext._find_unknown_nodes("node titan-99", ["titan-01"]) == ["titan-99"] + assert answer_post_ext._find_unknown_namespaces("namespace rogue", ["synapse"]) == ["rogue"] + assert answer_post_ext._needs_runbook_fix("see runbooks/bad.md", ["runbooks/fix.md"]) + assert answer_post_ext._needs_runbook_reference("where is the runbook", ["runbooks/fix.md"], "") + assert answer_post_ext._best_runbook_match("runbooks/fx.md", ["runbooks/fix.md"]) + assert answer_post_ext._resolve_path({"a": [{"b": 3}]}, "a[0].b") == 3 + assert answer_post_ext._snapshot_id({"snapshot_id": "snap-1"}) == "snap-1" + assert answer_post_ext._claims_to_payload([ClaimItem(id="c1", claim="atlas", evidence=[EvidenceItem(path="a.b", reason="r", value_at_claim=1)])]) + assert answer_post_ext._state_from_payload({"updated_at": 1.0, "claims": [{"id": "c1", "claim": "atlas", "evidence": [{"path": "a.b", "reason": "r"}]}]}) + assert answer_retrieval._metric_ctx_values({"summary_lines": summary_lines, "question": "cpu", "sub_questions": ["pods"], "keywords": ["cpu"], "keyword_tokens": ["cpu"]}) + assert answer_retrieval._extract_metric_keys(rich_lines) + assert answer_retrieval._token_variants({"nodes"}) + assert answer_retrieval._parse_key_list("{\"keys\":[\"nodes_total\"]}", ["nodes_total"], 1) == ["nodes_total"] + assert answer_retrieval._chunk_ids_for_keys([{"id": "c1", "text": "nodes_total: 2"}], ["nodes_total"]) == ["c1"] + assert answer_retrieval._filter_metric_keys(["nodes_total"], {"nodes"}) + assert answer_retrieval._metric_key_overlap(["nodes_total"], {"nodes"}) + assert answer_retrieval._lines_for_metric_keys(rich_lines, ["nodes_total"]) + assert answer_retrieval._merge_metric_keys(["nodes_total"], ["pods_total"], 3) + assert answer_retrieval._merge_fact_lines(["a"], ["b"]) + assert answer_retrieval._expand_hottest_line("hottest: cpu=titan-01 (90)") + answer_retrieval._has_token("hottest_cpu: titan-01=90", "cpu") + answer_retrieval._hotspot_evidence(snapshot) + assert asyncio.run(answer_retrieval._select_metric_chunks(scripted_llm, {"summary_lines": summary_lines, "question": "cpu", "sub_questions": ["pods"], "keywords": ["cpu"], "keyword_tokens": ["cpu"]}, chunks, fast_plan)) + asyncio.run(answer_retrieval._validate_metric_keys(scripted_llm, {"question": "cpu", "sub_questions": ["pods"], "selected": ["nodes_total"]}, ["nodes_total"], fast_plan)) + assert asyncio.run(answer_retrieval._gather_limited([asyncio.sleep(0, result=1), asyncio.sleep(0, result=2)], 1)) + assert answer_retrieval_ext._metric_key_tokens(summary_lines) + asyncio.run(answer_retrieval_ext._select_best_candidate(scripted_llm, "question", ["a", "b"], fast_plan, "chunk_select")) + assert answer_retrieval_ext._dedupe_lines(["x", "x", "y"]) + assert answer_retrieval_ext._collect_fact_candidates(chunks, 4) + assert asyncio.run(answer_retrieval_ext._select_best_list(scripted_llm, "question", [["a"], ["b"]], fast_plan, "chunk_select")) + assert asyncio.run(answer_retrieval_ext._extract_fact_types(scripted_llm, 
"question", ["cpu"], fast_plan)) + assert asyncio.run(answer_retrieval_ext._derive_signals(scripted_llm, "question", ["cpu"], fast_plan)) + assert asyncio.run(answer_retrieval_ext._scan_chunk_for_signals(scripted_llm, "question", ["cpu"], ["cpu: 90"], fast_plan)) + assert asyncio.run(answer_retrieval_ext._prune_metric_candidates(scripted_llm, "question", ["cpu: 90"], fast_plan, 1)) + assert asyncio.run(answer_retrieval_ext._select_fact_lines(scripted_llm, "question", ["cpu: 90"], fast_plan, 1)) + assert answer_spine._join_context(["a", "", "b"]) == "a\nb" + assert answer_spine._format_history([{"q": "q", "a": "a"}]) + assert answer_spine._summary_lines(snapshot) + assert answer_spine._line_starting_with(rich_lines, "nodes_total") + assert answer_spine._spine_lines(rich_lines) + spine_map: dict[str, str] = {} + answer_spine._spine_nodes(rich_lines, spine_map) + answer_spine._spine_hardware(rich_lines, spine_map) + answer_spine._spine_hottest(rich_lines, spine_map) + answer_spine._spine_postgres(rich_lines, spine_map) + answer_spine._spine_namespaces(rich_lines, spine_map) + answer_spine._spine_pressure(rich_lines, spine_map) + assert answer_spine._parse_group_line("hardware: rpi5=(titan-01)") + assert answer_spine._parse_hottest("hottest: cpu=titan-01 (90)", "cpu") + assert answer_spine._spine_answer(route_intent("How many nodes?"), "nodes_total: 2") + assert answer_spine._spine_nodes_answer("nodes_total: 2") + assert answer_spine._spine_non_rpi_answer("amd64 (titan-02)") + assert answer_spine._spine_hardware_answer("hardware: amd64=1") + assert answer_spine._spine_hottest_answer("hottest_cpu", "hottest: cpu=titan-01 (90)") + assert answer_spine._spine_postgres_answer("postgres_connections: used=5") + assert answer_spine._spine_namespace_answer("namespace_most_pods: synapse=5") + assert answer_spine._spine_pressure_answer("pressure_nodes: titan-02") + assert answer_spine._spine_from_summary(summary) + assert answer_spine._spine_from_counts(summary) + assert answer_spine._spine_from_hardware(summary) + assert answer_spine._spine_from_hottest(summary) + assert answer_spine._spine_from_postgres(summary) + assert answer_spine._spine_from_namespace_pods(summary) + assert answer_spine._spine_from_pressure(summary) + assert answer_spine._spine_fallback(route_intent("How many nodes?"), rich_lines) + + +def test_snapshot_builder_coverage_smoke() -> None: + """Exercise the split snapshot render helpers end to end.""" + + snapshot = _rich_snapshot() + summary = core_a.build_summary(snapshot) + text = summary_text(snapshot) + assert summary and text + lines: list[str] = [] + format_a._format_float(1.5) + format_a._format_rate_bytes(2048) + format_a._format_bytes(2048) + format_a._format_kv_map({"a": 1, "b": 2}) + format_a._format_names(["b", "a"]) + format_a._append_nodes(lines, summary) + format_a._append_hardware(lines, summary) + format_a._append_hardware_groups(lines, summary) + format_a._append_node_ages(lines, summary) + format_a._append_node_taints(lines, summary) + format_a._append_node_facts(lines, summary) + format_a._append_pressure(lines, summary) + format_a._append_pods(lines, summary) + format_a._append_capacity(lines, summary) + format_a._append_namespace_pods(lines, summary) + format_a._append_namespace_nodes(lines, summary) + format_a._append_node_pods(lines, summary) + format_a._append_pod_issues(lines, summary) + format_a._format_pod_issue_counts(summary["pod_issues"]) + format_a._format_pod_issue_top(summary["pod_issues"]) + format_a._format_pod_pending_oldest(summary["pod_issues"]) 
+ format_a._format_pod_waiting_reasons(summary["pod_issues"]) + format_a._format_pod_pending_over_15m(summary["pod_issues"]) + format_a._append_workload_health(lines, summary) + format_a._append_node_usage_stats(lines, summary) + format_a._append_events(lines, summary) + format_a._append_pvc_usage(lines, summary) + format_a._append_root_disk_headroom(lines, summary) + format_b._append_longhorn(lines, summary) + format_b._append_namespace_usage(lines, summary) + format_b._append_namespace_requests(lines, summary) + format_b._append_namespace_io_net(lines, summary) + format_b._append_pod_usage(lines, summary) + format_b._append_restarts(lines, summary) + format_b._append_job_failures(lines, summary) + format_b._append_jobs(lines, summary) + format_b._format_jobs_totals(summary["jobs"]) + format_b._format_jobs_failing(summary["jobs"]) + format_b._format_jobs_active_oldest(summary["jobs"]) + format_b._append_postgres(lines, summary) + format_b._append_hottest(lines, summary) + format_b._append_workloads(lines, summary) + format_b._append_topology(lines, summary) + format_b._append_flux(lines, summary) + format_c._append_signals(lines, summary) + format_c._append_profiles(lines, summary) + format_c._append_units_windows(lines, summary) + format_c._append_node_load_summary(lines, summary) + format_c._append_hardware_usage(lines, summary) + format_c._append_cluster_watchlist(lines, summary) + format_c._append_baseline_deltas(lines, summary) + format_c._append_pod_issue_summary(lines, summary) + format_c._reason_line(summary["pod_issue_summary"]["waiting_reasons_top"], "waiting") + format_c._append_namespace_issue_lines(lines, summary["pod_issue_summary"]["namespace_issue_top"]) + format_c._build_cluster_watchlist(summary) + format_c._capacity_ratio_parts(summary["namespace_capacity"], "cpu", "cpu", "mem") + format_c._capacity_headroom_parts(summary["namespace_capacity"]) + format_c._append_namespace_capacity_summary(lines, summary) + format_c._append_workloads_by_namespace(lines, summary) + format_c._append_lexicon(lines, summary) + format_c._append_cross_stats(lines, summary) + assert lines + + +def test_answerer_helper_edge_branches(monkeypatch: pytest.MonkeyPatch) -> None: + """Cover alternate branches in the split answerer helper modules.""" + + settings = replace(build_test_settings(), debug_pipeline=True) + logged: list[tuple[str, dict[str, Any]]] = [] + monkeypatch.setattr(answer_common, "log", SimpleNamespace(info=lambda message, extra: logged.append((message, extra)))) + meta = answer_common._build_meta("custom", 1, 2, True, False, 3.0, {"kind": "x"}, {"cmd": "echo"}, 10.0) + assert meta["llm_limit_hit"] is True + answer_common._debug_pipeline_log(settings, "edge", {"ok": True}) + assert logged and logged[0][0] == "atlasbot_debug" + assert answer_common._mode_plan(settings, "genius").drafts == 2 + assert answer_common._mode_plan(settings, "custom").use_tool is False + assert answer_common._select_subquestions([None, {"question": "", "priority": "x"}], "fallback", 2) == ["fallback"] + assert answer_common._chunk_lines([], 3) == [] + assert answer_common._raw_snapshot_chunks({"ok": 1, "bad": {1, 2}}) + assert answer_common._build_chunk_groups([{"id": "c1", "summary": "a"}], 2) == [[{"id": "c1", "summary": "a"}]] + + async def score_call(_system: str, _prompt: str, *, model: str | None = None, tag: str = "", **_: Any) -> str: + if tag == "chunk_score": + return '[{"id":"c1","score":"bad"},{"id":"","score":5},"bad"]' + if tag == "chunk_select": + return '{"selected_index": 99}' + raise 
AssertionError(tag) + + groups = [[{"id": "c1", "summary": "a"}]] + ctx = ScoreContext(question="q", sub_questions=[], retries=1, parallelism=1, select_best=True, fast_model="fast") + assert asyncio.run(answer_common._score_chunk_group(score_call, groups[0], "q", [])) == {"c1": 0.0} + assert asyncio.run(answer_common._score_chunk_group_run(score_call, 0, groups[0], "q", [])) == (0, {"c1": 0.0}) + assert answer_common._merge_score_runs([]) == {} + assert asyncio.run(answer_common._select_best_score_run(score_call, groups[0], [{"c1": 1.0}, {"c1": 2.0}], ctx)) == {"c1": 1.0} + assert answer_common._keyword_hits([{"text": "cpu"}, {"text": "ram"}], {"text": "cpu"}, None) == [] + assert answer_common._select_chunks([], {}, answer_common._mode_plan(settings, "custom")) == [] + selected = [{"id": "c0", "text": "a"}] + assert answer_common._append_must_chunks([{"id": "c0"}, {"id": "c1"}], selected, ["c1"], 3) is False + assert answer_common._append_keyword_chunks([{"id": "c0", "text": "cpu"}], selected, ["cpu"], 2) is False + answer_common._append_ranked_chunks([{"id": "c1"}], selected, 2) + assert answer_common._format_runbooks([]) == "" + + async def retrieval_call(_system: str, _prompt: str, *, model: str | None = None, tag: str = "", **_: Any) -> str: + responses = { + "fact_types": '{"fact_types":["cpu", 5, "cpu"]}', + "fact_types_select": '{"best": 99}', + "signals": '{"signals":["cpu", "", "ram"]}', + "signals_select": '{"best": 99}', + "chunk_scan": '{"lines":["cpu: 1", "missing: 2"]}', + "chunk_scan_select": '{"best": 99}', + "fact_prune": '{"lines":["cpu: 1", "ram: 2"]}', + "fact_prune_select": '{"best": 99}', + "fact_select": '{"lines":["cpu: 1"]}', + "fact_select_best": '{"best": 99}', + } + return responses[tag] + + fast_plan = replace(answer_common._mode_plan(settings, "smart"), metric_retries=2) + assert answer_retrieval_ext._parse_json_block("plain", fallback={"ok": True}) == {"ok": True} + assert "nodes" in answer_retrieval_ext._metric_key_tokens(["nodes_total: 2"]) + assert answer_retrieval_ext._metric_key_tokens([123, "invalid", ": empty"]) == set() + assert asyncio.run(answer_retrieval_ext._select_best_candidate(retrieval_call, "q", ["one"], fast_plan, "fact_types_select")) == 0 + assert answer_retrieval_ext._dedupe_lines(["lexicon_term: a", "units: x", "cpu", "cpu"], limit=1) == ["cpu"] + assert answer_retrieval_ext._collect_fact_candidates([{"text": "cpu: 1\nram: 2"}, {"bad": True}], 3) == ["cpu: 1", "ram: 2"] + assert asyncio.run(answer_retrieval_ext._select_best_list(retrieval_call, "q", [[], ["cpu"]], fast_plan, "fact_types_select")) == ["cpu"] + assert asyncio.run(answer_retrieval_ext._extract_fact_types(retrieval_call, "q", [], fast_plan)) == ["cpu", "5"] + async def retrieval_bad(_system: str, _prompt: str, *, model: str | None = None, tag: str = "", **_: Any) -> str: + del _system, _prompt, model, tag + return '{"signals":"bad","fact_types":"bad","lines":"bad"}' + + assert asyncio.run(answer_retrieval_ext._extract_fact_types(retrieval_bad, "q", [], fast_plan)) == [] + assert asyncio.run(answer_retrieval_ext._derive_signals(retrieval_call, "q", [], fast_plan)) == [] + assert asyncio.run(answer_retrieval_ext._derive_signals(retrieval_bad, "q", ["cpu"], fast_plan)) == [] + assert asyncio.run(answer_retrieval_ext._derive_signals(retrieval_call, "q", ["cpu"], fast_plan)) == ["cpu", "ram"] + assert asyncio.run(answer_retrieval_ext._scan_chunk_for_signals(retrieval_call, "q", [], ["cpu: 1"], fast_plan)) == [] + assert 
asyncio.run(answer_retrieval_ext._scan_chunk_for_signals(retrieval_bad, "q", ["cpu"], ["cpu: 1"], fast_plan)) == [] + assert asyncio.run(answer_retrieval_ext._scan_chunk_for_signals(retrieval_call, "q", ["cpu"], ["cpu: 1", "ram: 2"], fast_plan)) == ["cpu: 1"] + assert asyncio.run(answer_retrieval_ext._prune_metric_candidates(retrieval_call, "q", [], fast_plan, 2)) == [] + assert asyncio.run(answer_retrieval_ext._prune_metric_candidates(retrieval_bad, "q", ["cpu: 1"], fast_plan, 2)) == [] + assert asyncio.run(answer_retrieval_ext._prune_metric_candidates(retrieval_call, "q", ["cpu: 1", "ram: 2"], fast_plan, 2)) == ["cpu: 1", "ram: 2"] + assert asyncio.run(answer_retrieval_ext._select_fact_lines(retrieval_call, "q", [], fast_plan, 1)) == [] + assert asyncio.run(answer_retrieval_ext._select_fact_lines(retrieval_bad, "q", ["cpu: 1"], fast_plan, 1)) == [] + assert asyncio.run(answer_retrieval_ext._select_fact_lines(retrieval_call, "q", ["cpu: 1", "ram: 2"], fast_plan, 1)) == ["cpu: 1"] + + async def post_call(_system: str, _prompt: str, *, model: str | None = None, tag: str = "", **_: Any) -> str: + if tag == "contradiction": + return '{"use_facts": false, "confidence": 70}' + if tag == "insight_guard": + return '{"ok": true}' + if tag == "insight_fix": + return "fixed" + raise AssertionError(tag) + + assert answer_post._strip_unknown_entities("", ["titan-99"], []) == "" + assert answer_post._strip_unknown_entities("Atlas is healthy.", [], []) == "Atlas is healthy." + assert answer_post._needs_evidence_guard("", ["fact"]) is False + assert answer_post._needs_evidence_guard("pressure is high", ["pressure"]) is False + contradiction = asyncio.run( + answer_post._contradiction_decision( + ContradictionContext(post_call, "q", "draft", ["fact"], fast_plan), + attempts=2, + ) + ) + assert contradiction["confidence"] == 70 + assert answer_post._format_direct_metric_line("broken line") == "broken line" + assert answer_post._global_facts([]) == [] + assert answer_post._has_keyword_overlap([], ["cpu"]) is False + assert answer_post._extract_question_tokens("") == [] + assert answer_post._expand_tokens([]) == [] + assert answer_post._ensure_token_coverage([], ["cpu"], ["cpu: 1"]) == [] + assert answer_post._best_keyword_line(["ram: 1"], ["cpu"]) is None + assert answer_post._line_starting_with([], "cpu") is None + assert answer_post._non_rpi_nodes({"hardware_by_node": None}) == {} + assert answer_post._format_hardware_groups({}, "Nodes") == "" + assert answer_post._lexicon_context({"lexicon": []}) == "" + assert answer_post._parse_json_list("nope") == [] + assert answer_post._scores_from_json({}).confidence == 60 + assert answer_post._coerce_int("bad", 5) == 5 + assert answer_post._style_hint({"question_type": "planning"}) == "insightful" + assert answer_post._needs_evidence_fix("", {"needs_snapshot": True}) is False + assert answer_post._should_use_insight_guard({"question_type": "planning"}) is True + insight = InsightGuardInput( + question="q", + reply="reply", + classify={"question_type": "planning"}, + context="ctx", + plan=fast_plan, + call_llm=post_call, + facts=[], + ) + assert asyncio.run(answer_post._apply_insight_guard(insight)) == "reply" + + assert answer_post_ext._reply_matches_metric_facts("no numbers", ["cpu: 1"]) is False + assert answer_post_ext._needs_dedup("short.") is False + assert answer_post_ext._needs_focus_fix("why", "direct", {"question_type": "open_ended"}) is False + assert answer_post_ext._extract_keywords("Q", "Q", [], []) == [] + assert answer_post_ext._allowed_nodes({}) == 
[] + assert answer_post_ext._allowed_namespaces({}) == [] + assert answer_post_ext._find_unknown_nodes("titan-01", ["titan-01"]) == [] + assert answer_post_ext._find_unknown_namespaces("namespace synapse", ["synapse"]) == [] + assert answer_post_ext._needs_runbook_fix("runbooks/fix.md", ["runbooks/fix.md"]) is False + assert answer_post_ext._needs_runbook_reference("status", ["runbooks/fix.md"], "ok") is False + assert answer_post_ext._best_runbook_match("x", []) is None + assert answer_post_ext._resolve_path({"a": []}, "a[1].b") is None + assert answer_post_ext._snapshot_id({"snapshot": {"id": "x"}}) is None + assert answer_post_ext._claims_to_payload([]) == [] + assert answer_post_ext._state_from_payload({}) is None + + assert answer_factsheet._factsheet_instruction("smart") + assert answer_factsheet._factsheet_model("quick", fast_plan) == fast_plan.fast_model + assert answer_factsheet._is_plain_math_question("2 + 2") is True + assert answer_factsheet._quick_fact_sheet_lines("where is runbook", ["runbooks/fix.md", "cpu: 1"], [], limit=1) + assert answer_factsheet._quick_fact_sheet_text([]) == "Fact Sheet:\n- No snapshot facts available." + assert "prefer rpi5 workers first" in answer_factsheet._quick_fact_sheet_heuristic_answer( + "what is the node placement last resort", + ["runbooks/fix.md"], + ) + assert "1 ready nodes out of 2 total" in answer_factsheet._quick_fact_sheet_heuristic_answer( + "how many ready nodes are there", + ["nodes_total:2,ready:1,not_ready:1"], + ) + + assert answer_spine._join_context([]) == "" + assert answer_spine._format_history([]) == "" + assert answer_spine._line_starting_with([], "cpu") is None + assert answer_spine._spine_lines([]) == {} + extra_spine: dict[str, str] = {} + answer_spine._spine_nodes(["nodes: total=2 ready=1 not_ready=1"], extra_spine) + answer_spine._spine_hardware(["hardware: amd64=1 (titan-02)"], extra_spine) + answer_spine._spine_hottest(["hottest: cpu=titan-01 [rpi5] (90%)"], extra_spine) + answer_spine._spine_postgres(["postgres_connections_total: used=5, max=10"], extra_spine) + answer_spine._spine_namespaces(["namespace_pods_top: synapse=5"], extra_spine) + answer_spine._spine_pressure(["pressure: nodes=0"], extra_spine) + assert answer_spine._parse_group_line("invalid") == {} + assert answer_spine._parse_hottest("broken", "cpu") is None + assert answer_spine._spine_nodes_answer("nodes: total=2 ready=1 not_ready=1") + assert answer_spine._spine_pressure_answer("pressure: nodes=0") + + +def test_runtime_and_snapshot_edge_branches(monkeypatch: pytest.MonkeyPatch, tmp_path: Path) -> None: + """Cover runtime wrappers and sparse snapshot builder branches.""" + + sparse_summary = { + "node_pods": [ + {"node": "titan-01", "pods_total": "7", "namespaces_top": [("synapse", 3), ("vault", 2)]}, + {"node": "titan-02", "pods_total": "x"}, + ], + "pod_issues": { + "counts": {"Failed": 1}, + "items": [{"namespace": "synapse", "pod": "matrix", "phase": "Pending", "restarts": 1}], + "pending_oldest": [{"namespace": "synapse", "pod": "matrix", "age_hours": 2, "reason": "Waiting"}], + "waiting_reasons": {"ImagePullBackOff": 2}, + "pending_over_15m": "2", + }, + "workloads_health": { + "deployments": {"not_ready": 1}, + "statefulsets": {"not_ready": 0}, + "daemonsets": {"not_ready": 1}, + }, + "topology": { + "nodes": [{"node": "titan-01", "workloads_top": [("matrix", 3)]}], + "workloads": [{"namespace": "synapse", "workload": "matrix", "nodes_top": [("titan-01", 3)]}], + }, + "flux": { + "not_ready": 2, + "items": [{"namespace": "flux-system", 
"name": "kustomization", "reason": "waiting", "suspended": True}], + }, + "namespace_capacity_summary": { + "cpu_ratio_top": [{"namespace": "synapse", "cpu_usage_ratio": 0.8, "cpu_usage": 4, "cpu_requests": 5}], + "mem_ratio_top": [{"namespace": "synapse", "mem_usage_ratio": 0.7, "mem_usage": 7, "mem_requests": 10}], + "cpu_headroom_low": [{"namespace": "synapse", "headroom": 0.2}], + "mem_headroom_low": [{"namespace": "synapse", "headroom": 0.3}], + "cpu_overcommitted": 1, + "mem_overcommitted": 1, + "cpu_overcommitted_names": ["synapse"], + "mem_overcommitted_names": ["vault"], + }, + "workloads": [{"namespace": "synapse", "workload": "matrix", "pods_total": 3, "primary_node": "titan-01"}], + "lexicon": {"terms": [{"term": "atlas", "meaning": "cluster"}], "aliases": {"bot": "atlas"}}, + "cross_stats": { + "node_metric_top": [{"metric": "cpu", "node": "titan-01", "value": 90, "cpu": 90, "ram": 80, "net": 1.0, "io": 2.0, "pods_total": 3}], + "namespace_metric_top": [{"metric": "cpu", "namespace": "synapse", "value": 40, "cpu_ratio": 0.8, "mem_ratio": 0.7, "pods_total": 3}], + "pvc_top": [{"namespace": "synapse", "pvc": "data", "used_percent": 95}], + }, + "events": {"warnings_total": 2}, + } + lines: list[str] = [] + format_a._append_node_pods(lines, sparse_summary) + format_a._append_pod_issues(lines, sparse_summary) + format_a._append_workload_health(lines, sparse_summary) + format_b._append_topology(lines, sparse_summary) + format_b._append_flux(lines, sparse_summary) + format_c._append_namespace_capacity_summary(lines, sparse_summary) + format_c._append_workloads_by_namespace(lines, sparse_summary) + format_c._append_lexicon(lines, sparse_summary) + format_c._append_cross_stats(lines, sparse_summary) + assert any("node_pods_max" in line for line in lines) + assert any("flux_not_ready_items" in line for line in lines) + assert any("cross_pvc_usage" in line for line in lines) + + assert core_a._build_node_ages([{"name": "titan-01", "age_hours": 1}, "bad"]) + assert core_a._build_node_facts([{"name": "titan-01", "is_worker": True, "roles": ["worker"], "arch": "arm64"}]) + assert core_a._build_node_taints([{"name": "titan-01", "taints": [{"key": "dedicated", "effect": "NoSchedule"}]}]) + assert core_a._build_root_disk_headroom({"node_usage": {"disk": [{"node": "titan-01", "value": 80}]}}) + assert core_a._build_longhorn({"longhorn": {"total": 1}}) + assert core_a._build_node_load({"node_load": [{"node": "titan-01"}]}) + assert core_a._build_pods({"pods_running": 1}) + assert core_a._build_capacity({"capacity_cpu": 4}) + assert core_a._build_namespace_pods({"namespace_pods": [{"namespace": "synapse"}]}) + assert core_a._build_namespace_nodes({"namespace_nodes": [{"namespace": "synapse"}]}) + assert core_a._build_node_pods({"node_pods": [{"node": "titan-01"}]}) + assert core_a._build_node_pods_top({"node_pods_top": [{"node": "titan-01"}]}) + assert core_a._build_pod_issues({"pod_issues": {"counts": {}}}) + assert core_a._build_events({"events": {"warnings_total": 1}}) + assert core_a._build_event_summary({"events": {"warnings_top_reason": {"a": 1}, "warnings_latest": [{"reason": "x"}]}}) + assert core_a._build_postgres({"postgres_connections": {"used": 1}}) + + settings = replace(build_test_settings(), queue_enabled=False) + store = ClaimStore(":memory:", 60) + assert store.get("") is None + store.set("", {"claims": []}) + assert _safe_json(None, {}) == {} + + kb_dir = tmp_path / "kb" + (kb_dir / "catalog").mkdir(parents=True) + (kb_dir / "catalog" / 
"runbooks.json").write_text(json.dumps([{"path": "runbooks/fix.md"}, {"title": "Missing path"}]), encoding="utf-8") + kb = KnowledgeBase(str(kb_dir)) + assert kb.runbook_titles() == "" + assert kb.runbook_paths(limit=1) == ["runbooks/fix.md"] + + from atlasbot.snapshot.builder import SnapshotProvider + + provider = SnapshotProvider(replace(settings, ariadne_state_url="", snapshot_ttl_sec=1)) + provider._cache = {"cached": True} + provider._cache_ts = 1.0 + monkeypatch.setattr("atlasbot.snapshot.builder.time.monotonic", lambda: 100.0) + assert provider.get() == {"cached": True} + + from atlasbot import main as main_mod + + captured: dict[str, Any] = {} + + class QueueProbe: + def __init__(self, _settings: Settings, handler): + captured["handler"] = handler + + async def start(self) -> None: + return None + + async def submit(self, payload: dict[str, Any]) -> dict[str, Any]: + return {"reply": payload.get("question", ""), "scores": {}} + + class ApiProbe: + def __init__(self, _settings: Settings, answer_handler): + captured["answer_handler"] = answer_handler + self.app = SimpleNamespace() + + class ServerProbe: + def __init__(self, config: Any): + self.config = config + + async def serve(self) -> None: + return None + + class EngineProbe: + async def answer( + self, + question: str, + *, + mode: str, + history: list[dict[str, str]] | None = None, + observer: Any = None, + conversation_id: str | None = None, + snapshot_pin: bool | None = None, + ) -> AnswerResult: + return AnswerResult( + reply=f"{question}:{mode}:{bool(history)}:{conversation_id}:{snapshot_pin}:{observer is not None}", + scores=AnswerScores(confidence=91, relevance=92, satisfaction=93, hallucination_risk="low"), + meta={}, + ) + + monkeypatch.setattr(main_mod, "load_settings", lambda: replace(settings, matrix_bots=())) + monkeypatch.setattr(main_mod, "configure_logging", lambda _level: None) + monkeypatch.setattr(main_mod, "_build_engine", lambda _settings: EngineProbe()) + monkeypatch.setattr(main_mod, "QueueManager", QueueProbe) + monkeypatch.setattr(main_mod, "Api", ApiProbe) + monkeypatch.setattr(main_mod.uvicorn, "Server", ServerProbe) + asyncio.run(main_mod.main()) + handled = asyncio.run(captured["handler"]({"question": "hello", "mode": "smart", "history": "bad", "conversation_id": 7, "snapshot_pin": "bad"})) + assert handled["reply"] + answered = asyncio.run(captured["answer_handler"]("hello", "quick", None, None, None, None)) + assert answered.reply + + assert result_scores({"scores": {"confidence": "bad"}}).confidence == 60 + + qm = QueueManager(replace(settings, queue_enabled=True), lambda payload: asyncio.sleep(0, result=payload)) + with pytest.raises(RuntimeError, match="queue not initialized"): + asyncio.run(qm.submit({"question": "x"})) + assert _mode_timeout_sec(settings, "genius") == settings.genius_time_budget_sec + assert _extract_mode("atlas hello", ("atlas",), "quick") == ("quick", "hello")