quality(atlasbot): enforce strict gate split

This commit is contained in:
jenkins 2026-04-21 00:53:47 -03:00
parent 6ecf531bac
commit b7543d7e57
44 changed files with 9781 additions and 5716 deletions

View File

@ -6,11 +6,13 @@ ENV PYTHONDONTWRITEBYTECODE=1 \
WORKDIR /app WORKDIR /app
COPY requirements.txt /app/requirements.txt COPY requirements.txt /app/requirements.txt
COPY requirements-dev.txt /app/requirements-dev.txt COPY requirements-dev.txt /app/requirements-dev.txt
COPY pyproject.toml /app/pyproject.toml
RUN pip install --no-cache-dir -r /app/requirements.txt -r /app/requirements-dev.txt RUN pip install --no-cache-dir -r /app/requirements.txt -r /app/requirements-dev.txt
COPY atlasbot /app/atlasbot COPY atlasbot /app/atlasbot
FROM base AS test FROM base AS test
COPY testing /app/testing
COPY tests /app/tests COPY tests /app/tests
COPY scripts /app/scripts COPY scripts /app/scripts

4
Jenkinsfile vendored
View File

@ -75,6 +75,10 @@ spec:
QUALITY_GATE_SONARQUBE_REPORT = 'build/sonarqube-quality-gate.json' QUALITY_GATE_SONARQUBE_REPORT = 'build/sonarqube-quality-gate.json'
QUALITY_GATE_IRONBANK_REPORT = 'build/ironbank-compliance.json' QUALITY_GATE_IRONBANK_REPORT = 'build/ironbank-compliance.json'
} }
options {
disableConcurrentBuilds()
buildDiscarder(logRotator(daysToKeepStr: '30', numToKeepStr: '200', artifactDaysToKeepStr: '30', artifactNumToKeepStr: '120'))
}
stages { stages {
stage('Checkout') { stage('Checkout') {
steps { steps {

View File

@ -1,7 +1,6 @@
import logging import logging
from typing import Any
from collections.abc import Awaitable, Callable from collections.abc import Awaitable, Callable
from typing import Any
from fastapi import FastAPI, Header, HTTPException from fastapi import FastAPI, Header, HTTPException
from pydantic import BaseModel from pydantic import BaseModel
@ -29,6 +28,16 @@ class AnswerResponse(BaseModel):
class Api: class Api:
"""Expose the answer API and enforce the shared internal token.
Input:
- `settings`: runtime configuration, including the optional internal token;
- `answer_handler`: async adapter that answers a normalized question.
Output:
- registers the HTTP routes on `self.app`.
"""
def __init__( def __init__(
self, self,
settings: Settings, settings: Settings,

View File

@ -1,6 +1,7 @@
import os import os
from dataclasses import dataclass from dataclasses import dataclass
def _env_bool(name: str, default: str = "false") -> bool: def _env_bool(name: str, default: str = "false") -> bool:
value = os.getenv(name, default).strip().lower() value = os.getenv(name, default).strip().lower()
return value in {"1", "true", "yes", "y", "on"} return value in {"1", "true", "yes", "y", "on"}
@ -121,6 +122,12 @@ def _load_matrix_bots(bot_mentions: tuple[str, ...]) -> tuple[MatrixBotConfig, .
def load_settings() -> Settings: def load_settings() -> Settings:
"""Load process settings from environment variables.
Output:
- a fully populated `Settings` instance with defaults for missing values.
"""
bot_mentions = tuple( bot_mentions = tuple(
[ [
item.strip() item.strip()

File diff suppressed because it is too large Load Diff

View File

@ -0,0 +1,12 @@
"""Answer engine package."""
from ._base import *
from .common import *
from .engine import *
from .factsheet import *
from .post import *
from .post_ext import *
from .retrieval import *
from .retrieval_ext import *
from .spine import *
from .workflow import *

View File

@ -0,0 +1,116 @@
from __future__ import annotations
import logging
from collections.abc import Awaitable, Callable
from dataclasses import dataclass
from typing import Any
log = logging.getLogger(__name__)

# Tuning constants shared across the answer-engine modules. Their consumers
# live in sibling modules (pulled in via `from ._base import *`), so the
# precise use of each threshold is defined there.
FOLLOWUP_SHORT_WORDS = 6
TOKEN_MIN_LEN = 3
# Tokens too generic to discriminate between metric lines.
GENERIC_METRIC_TOKENS = {"atlas", "cluster", "kubernetes", "k8s", "titan", "lab"}
NS_ENTRY_MIN_LEN = 2
DEDUP_MIN_SENTENCES = 3
RUNBOOK_SIMILARITY_THRESHOLD = 0.4
# Byte-size multipliers for human-readable size formatting.
BYTES_KB = 1024
BYTES_MB = 1024 * 1024


class LLMLimitReached(RuntimeError):
    """Signal that the per-run LLM call cap has been reached."""

    pass


class LLMTimeBudgetExceeded(RuntimeError):
    """Signal that the per-run wall-clock budget has been exhausted."""

    pass


@dataclass
class AnswerScores:
    """Self-assessed quality scores attached to an answer."""

    confidence: int
    relevance: int
    satisfaction: int
    hallucination_risk: str


@dataclass
class AnswerResult:
    """Final answer payload: reply text, scores, and diagnostic metadata."""

    reply: str
    scores: AnswerScores
    meta: dict[str, Any]


@dataclass(frozen=True)
class InsightGuardInput:
    """Inputs for the insight-guard pass (consumed by a sibling module)."""

    question: str
    reply: str
    classify: dict[str, Any]
    context: str
    # Forward reference to ModePlan (declared below); resolved lazily thanks
    # to `from __future__ import annotations` at module top.
    plan: ModePlan
    call_llm: Callable[..., Awaitable[str]]
    facts: list[str]


@dataclass
class ContradictionContext:
    """Everything needed to run the contradiction check on a draft reply."""

    call_llm: Callable[..., Awaitable[str]]
    question: str
    reply: str
    facts: list[str]
    plan: ModePlan


@dataclass
class EvidenceItem:
    """One snapshot path backing a claim, with values then and now."""

    path: str
    reason: str
    value: Any | None = None
    value_at_claim: Any | None = None


@dataclass
class ClaimItem:
    """A single answer claim plus the evidence paths supporting it."""

    id: str
    claim: str
    evidence: list[EvidenceItem]


@dataclass
class ConversationState:
    """Per-conversation memory: claims plus an optional pinned snapshot."""

    updated_at: float
    claims: list[ClaimItem]
    snapshot_id: str | None = None
    snapshot: dict[str, Any] | None = None


@dataclass
class ModePlan:
    """Per-mode execution knobs: models, retrieval sizes, feature flags."""

    model: str
    fast_model: str
    max_subquestions: int
    chunk_lines: int
    chunk_top: int
    chunk_group: int
    kb_max_chars: int
    kb_max_files: int
    use_raw_snapshot: bool
    parallelism: int
    score_retries: int
    use_deep_retrieval: bool
    use_tool: bool
    use_critic: bool
    use_gap: bool
    use_scores: bool
    drafts: int
    metric_retries: int
    subanswer_retries: int


@dataclass
class ScoreContext:
    """Parameters for one chunk-scoring pass."""

    question: str
    sub_questions: list[str]
    retries: int
    parallelism: int
    select_best: bool
    fast_model: str

View File

@ -0,0 +1,395 @@
from __future__ import annotations
import json
import time
from collections.abc import Awaitable, Callable
from typing import Any
from atlasbot.config import Settings
from atlasbot.llm import prompts
from atlasbot.llm.client import parse_json
from ._base import *
from .factsheet import *
from .post import *
from .post_ext import *
from .retrieval import _gather_limited
from .retrieval_ext import *
from .spine import *
def _strip_followup_meta(reply: str) -> str:
cleaned = reply.strip()
if not cleaned:
return cleaned
prefixes = [
"The draft is correct based on the provided context.",
"The draft is correct based on the context.",
"The draft is correct based on the provided evidence.",
"The draft is correct.",
"Based on the provided context,",
"Based on the context,",
"Based on the provided evidence,",
]
for prefix in prefixes:
if cleaned.lower().startswith(prefix.lower()):
cleaned = cleaned[len(prefix) :].lstrip(" .")
break
return cleaned
def _build_meta(mode: str, call_count: int, call_cap: int, limit_hit: bool, time_budget_hit: bool, time_budget_sec: float, classify: dict[str, Any], tool_hint: dict[str, Any] | None, started: float) -> dict[str, Any]:
return {
"mode": mode,
"llm_calls": call_count,
"llm_limit": call_cap,
"llm_limit_hit": limit_hit,
"time_budget_sec": time_budget_sec,
"time_budget_hit": time_budget_hit,
"classify": classify,
"tool_hint": tool_hint,
"elapsed_sec": round(time.monotonic() - started, 2),
}
def _debug_pipeline_log(settings: Settings, name: str, payload: Any) -> None:
"""Write a structured debug event when pipeline tracing is enabled."""
if not settings.debug_pipeline:
return
log.info("atlasbot_debug", extra={"extra": {"name": name, "payload": payload}})
def _mode_plan(settings: Settings, mode: str) -> ModePlan:
    """Return the execution plan (models, budgets, feature flags) for `mode`.

    Unknown modes fall through to the cheapest "quick" plan.
    """
    if mode == "genius":
        # Most expensive plan: deep retrieval, raw snapshot, two drafts.
        return ModePlan(
            model=settings.ollama_model_genius,
            fast_model=settings.ollama_model_fast,
            max_subquestions=6,
            chunk_lines=6,
            chunk_top=10,
            chunk_group=4,
            kb_max_chars=200000,
            kb_max_files=200,
            use_raw_snapshot=True,
            parallelism=4,
            score_retries=3,
            use_deep_retrieval=True,
            use_tool=True,
            use_critic=True,
            use_gap=True,
            use_scores=True,
            drafts=2,
            metric_retries=3,
            subanswer_retries=3,
        )
    if mode == "smart":
        # Middle ground: deep retrieval but a trimmed KB and one draft.
        return ModePlan(
            model=settings.ollama_model_smart,
            fast_model=settings.ollama_model_fast,
            max_subquestions=4,
            chunk_lines=8,
            chunk_top=8,
            chunk_group=4,
            kb_max_chars=3000,
            kb_max_files=12,
            use_raw_snapshot=False,
            parallelism=2,
            score_retries=2,
            use_deep_retrieval=True,
            use_tool=True,
            use_critic=True,
            use_gap=True,
            use_scores=True,
            drafts=1,
            metric_retries=2,
            subanswer_retries=2,
        )
    # Default/quick plan: single pass, all optional stages disabled.
    return ModePlan(
        model=settings.ollama_model_fast,
        fast_model=settings.ollama_model_fast,
        max_subquestions=1,
        chunk_lines=16,
        chunk_top=3,
        chunk_group=5,
        kb_max_chars=800,
        kb_max_files=4,
        use_raw_snapshot=False,
        parallelism=1,
        score_retries=1,
        use_deep_retrieval=False,
        use_tool=False,
        use_critic=False,
        use_gap=False,
        use_scores=False,
        drafts=1,
        metric_retries=1,
        subanswer_retries=1,
    )
def _llm_call_limit(settings: Settings, mode: str) -> int:
if mode == "genius":
return settings.genius_llm_calls_max
if mode == "smart":
return settings.smart_llm_calls_max
return settings.fast_llm_calls_max
def _mode_time_budget(settings: Settings, mode: str) -> float:
if mode == "genius":
return max(0.0, settings.genius_time_budget_sec)
if mode == "smart":
return max(0.0, settings.smart_time_budget_sec)
return max(0.0, settings.quick_time_budget_sec)
def _select_subquestions(parts: list[dict[str, Any]], fallback: str, limit: int) -> list[str]:
if not parts:
return [fallback]
ranked = []
for entry in parts:
if not isinstance(entry, dict):
continue
question = str(entry.get("question") or "").strip()
if not question:
continue
priority = entry.get("priority")
try:
weight = float(priority)
except (TypeError, ValueError):
weight = 1.0
ranked.append((weight, question))
ranked.sort(key=lambda item: item[0], reverse=True)
questions = [item[1] for item in ranked][:limit]
return questions or [fallback]
def _chunk_lines(lines: list[str], lines_per_chunk: int) -> list[dict[str, Any]]:
chunks: list[dict[str, Any]] = []
if not lines:
return chunks
for idx in range(0, len(lines), lines_per_chunk):
chunk_lines = lines[idx : idx + lines_per_chunk]
text = "\n".join(chunk_lines)
summary = " | ".join(chunk_lines[:4])
chunks.append({"id": f"c{idx//lines_per_chunk}", "text": text, "summary": summary})
return chunks
def _raw_snapshot_chunks(snapshot: dict[str, Any] | None) -> list[dict[str, Any]]:
if not isinstance(snapshot, dict) or not snapshot:
return []
chunks: list[dict[str, Any]] = []
for key, value in snapshot.items():
try:
payload = json.dumps({key: value}, indent=2)
except Exception:
continue
summary = f"raw:{key}"
chunks.append({"id": f"r{key}", "text": payload, "summary": summary})
return chunks
def _build_chunk_groups(chunks: list[dict[str, Any]], group_size: int) -> list[list[dict[str, Any]]]:
groups: list[list[dict[str, Any]]] = []
group: list[dict[str, Any]] = []
for chunk in chunks:
group.append({"id": chunk["id"], "summary": chunk["summary"]})
if len(group) >= group_size:
groups.append(group)
group = []
if group:
groups.append(group)
return groups
async def _score_chunks(call_llm: Callable[..., Any], chunks: list[dict[str, Any]], question: str, sub_questions: list[str], plan: ModePlan) -> dict[str, float]:
    """Score every chunk for relevance via the LLM, batched per plan settings.

    Returns a chunk-id -> score map. NOTE(review): the zero-seeded `scores`
    map is only returned on the empty-input path; on the normal path the
    serial/parallel result is returned directly, so chunk ids the LLM omitted
    will be absent rather than 0.0 — confirm callers use `.get(..., 0.0)`.
    """
    scores: dict[str, float] = {chunk["id"]: 0.0 for chunk in chunks}
    if not chunks:
        return scores
    groups = _build_chunk_groups(chunks, plan.chunk_group)
    ctx = ScoreContext(
        question=question,
        sub_questions=sub_questions,
        retries=max(1, plan.score_retries),
        parallelism=plan.parallelism,
        select_best=plan.score_retries > 1,
        fast_model=plan.fast_model,
    )
    # Serial path when there is no parallelism budget or only one run total.
    if ctx.parallelism <= 1 or len(groups) * ctx.retries <= 1:
        return await _score_groups_serial(call_llm, groups, ctx)
    return await _score_groups_parallel(call_llm, groups, ctx)
async def _score_groups_serial(call_llm: Callable[..., Any], groups: list[list[dict[str, Any]]], ctx: ScoreContext) -> dict[str, float]:
    """Score groups one at a time; with retries, pick or merge the runs."""
    scores: dict[str, float] = {}
    for grp in groups:
        # Repeat each group `ctx.retries` times for stability.
        runs = [await _score_chunk_group(call_llm, grp, ctx.question, ctx.sub_questions) for _ in range(ctx.retries)]
        if ctx.select_best and len(runs) > 1:
            # Let the LLM choose the most trustworthy run.
            best = await _select_best_score_run(call_llm, grp, runs, ctx)
            scores.update(best)
        else:
            scores.update(_merge_score_runs(runs))
    return scores
async def _score_groups_parallel(call_llm: Callable[..., Any], groups: list[list[dict[str, Any]]], ctx: ScoreContext) -> dict[str, float]:
    """Fan out group scoring runs with bounded concurrency, then combine.

    Each (group, retry) pair becomes one coroutine; results are re-grouped by
    group index before merging/selection.
    """
    coros: list[Awaitable[tuple[int, dict[str, float]]]] = []
    for idx, grp in enumerate(groups):
        for _ in range(ctx.retries):
            coros.append(_score_chunk_group_run(call_llm, idx, grp, ctx.question, ctx.sub_questions))
    # _gather_limited caps in-flight coroutines at ctx.parallelism.
    results = await _gather_limited(coros, ctx.parallelism)
    grouped: dict[int, list[dict[str, float]]] = {}
    for idx, result in results:
        grouped.setdefault(idx, []).append(result)
    scores: dict[str, float] = {}
    for idx, runs in grouped.items():
        if ctx.select_best and len(runs) > 1:
            group = groups[idx]
            best = await _select_best_score_run(call_llm, group, runs, ctx)
            scores.update(best)
        else:
            scores.update(_merge_score_runs(runs))
    return scores
async def _score_chunk_group(call_llm: Callable[..., Any], group: list[dict[str, Any]], question: str, sub_questions: list[str]) -> dict[str, float]:
    """Ask the LLM to score one group of chunk summaries.

    Returns a chunk-id -> score map; malformed entries are skipped and an
    unparseable score coerces to 0.0.
    """
    prompt = (
        prompts.CHUNK_SCORE_PROMPT
        + "\nQuestion: "
        + question
        + "\nSubQuestions: "
        + json.dumps(sub_questions)
        + "\nChunks: "
        + json.dumps(group)
    )
    raw = await call_llm(prompts.RETRIEVER_SYSTEM, prompt, model=None, tag="chunk_score")
    data = _parse_json_list(raw)
    scored: dict[str, float] = {}
    for entry in data:
        if not isinstance(entry, dict):
            continue
        cid = str(entry.get("id") or "").strip()
        if not cid:
            continue
        try:
            score = float(entry.get("score") or 0)
        except (TypeError, ValueError):
            score = 0.0
        scored[cid] = score
    return scored
async def _score_chunk_group_run(call_llm: Callable[..., Any], idx: int, group: list[dict[str, Any]], question: str, sub_questions: list[str]) -> tuple[int, dict[str, float]]:
    """Tag a scoring run with its group index so parallel results can be regrouped."""
    return idx, await _score_chunk_group(call_llm, group, question, sub_questions)
def _merge_score_runs(runs: list[dict[str, float]]) -> dict[str, float]:
if not runs:
return {}
totals: dict[str, float] = {}
counts: dict[str, int] = {}
for run in runs:
for key, value in run.items():
totals[key] = totals.get(key, 0.0) + float(value)
counts[key] = counts.get(key, 0) + 1
return {key: totals[key] / counts[key] for key in totals}
async def _select_best_score_run(call_llm: Callable[..., Any], group: list[dict[str, Any]], runs: list[dict[str, float]], ctx: ScoreContext) -> dict[str, float]:
    """Ask the fast model which scoring run to trust; default to the first.

    Any malformed/out-of-range `selected_index` falls back to run 0.
    """
    if not runs:
        return {}
    prompt = (
        prompts.RETRIEVER_SELECT_PROMPT
        + "\nQuestion: "
        + ctx.question
        + "\nSubQuestions: "
        + json.dumps(ctx.sub_questions)
        + "\nChunks: "
        + json.dumps(group)
        + "\nRuns: "
        + json.dumps(runs)
    )
    raw = await call_llm(prompts.RETRIEVER_SELECT_SYSTEM, prompt, model=ctx.fast_model, tag="chunk_select")
    data = parse_json(raw)
    idx = 0
    if isinstance(data, dict):
        try:
            idx = int(data.get("selected_index") or 0)
        except (TypeError, ValueError):
            idx = 0
    if idx < 0 or idx >= len(runs):
        idx = 0
    return runs[idx]
def _keyword_hits(ranked: list[dict[str, Any]], head: dict[str, Any], keywords: list[str] | None) -> list[dict[str, Any]]:
if not keywords:
return []
lowered = [kw.lower() for kw in keywords if isinstance(kw, str) and kw.strip()]
if not lowered:
return []
hits: list[dict[str, Any]] = []
for item in ranked:
if item is head:
continue
text = str(item.get("text") or "").lower()
if any(kw in text for kw in lowered):
hits.append(item)
return hits
def _select_chunks(chunks: list[dict[str, Any]], scores: dict[str, float], plan: ModePlan, keywords: list[str] | None = None, must_ids: list[str] | None = None) -> list[dict[str, Any]]:
    """Pick up to plan.chunk_top chunks: must-have ids, then keyword hits, then rank."""
    if not chunks:
        return []
    ranked = sorted(chunks, key=lambda item: scores.get(item["id"], 0.0), reverse=True)
    # NOTE(review): the seed is chunks[0] (document order), not ranked[0] —
    # looks intentional (always keep the leading chunk), but confirm it is not
    # meant to be the top-scored chunk.
    selected: list[dict[str, Any]] = [chunks[0]]
    if _append_must_chunks(chunks, selected, must_ids, plan.chunk_top):
        return selected
    if _append_keyword_chunks(ranked, selected, keywords, plan.chunk_top):
        return selected
    _append_ranked_chunks(ranked, selected, plan.chunk_top)
    return selected
def _append_must_chunks(chunks: list[dict[str, Any]], selected: list[dict[str, Any]], must_ids: list[str] | None, limit: int) -> bool:
if not must_ids:
return False
id_map = {item["id"]: item for item in chunks}
for cid in must_ids:
item = id_map.get(cid)
if item and item not in selected:
selected.append(item)
if len(selected) >= limit:
return True
return False
def _append_keyword_chunks(ranked: list[dict[str, Any]], selected: list[dict[str, Any]], keywords: list[str] | None, limit: int) -> bool:
    """Append keyword-matching chunks; True when `limit` was reached."""
    if not ranked:
        return False
    top = ranked[0]
    for match in _keyword_hits(ranked, top, keywords):
        if match in selected:
            continue
        selected.append(match)
        if len(selected) >= limit:
            return True
    return False
def _append_ranked_chunks(ranked: list[dict[str, Any]], selected: list[dict[str, Any]], limit: int) -> None:
for item in ranked:
if len(selected) >= limit:
break
if item not in selected:
selected.append(item)
def _format_runbooks(runbooks: list[str]) -> str:
if not runbooks:
return ""
return "Relevant runbooks:\n" + "\n".join([f"- {item}" for item in runbooks])
# Re-export single-underscore helpers so sibling modules can `from .common
# import *` them (wildcard import skips them otherwise). Note this also
# re-exports underscore names imported from other modules.
__all__ = [name for name in globals() if name.startswith("_") and not name.startswith("__")]

View File

@ -0,0 +1,267 @@
from __future__ import annotations
from collections.abc import Callable
import json
import time
from typing import Any
from atlasbot.config import Settings
from atlasbot.knowledge.loader import KnowledgeBase
from atlasbot.llm import prompts
from atlasbot.llm.client import LLMClient, build_messages
from atlasbot.snapshot.builder import SnapshotProvider
from atlasbot.state.store import ClaimStore
from ._base import *
from .common import *
from .factsheet import *
from .post import *
from .post_ext import *
from .retrieval import *
from .retrieval_ext import *
from .spine import *
from .workflow import run_answer
class AnswerEngine:
    """Coordinate Atlas question answering across snapshots, KB, and LLMs.

    Why:
    - keep the public answer surface in one place while the retrieval and
      post-processing helpers stay split across smaller modules.
    """

    def __init__(self, settings: Settings, llm: LLMClient, kb: KnowledgeBase, snapshot: SnapshotProvider) -> None:
        # Wire collaborators; ClaimStore persists per-conversation claims with
        # a TTL so follow-up questions can reuse prior evidence.
        self._settings = settings
        self._llm = llm
        self._kb = kb
        self._snapshot = snapshot
        self._store = ClaimStore(settings.state_db_path, settings.conversation_ttl_sec)

    async def answer(
        self,
        question: str,
        *,
        mode: str,
        history: list[dict[str, str]] | None = None,
        observer: Callable[[str, str], None] | None = None,
        conversation_id: str | None = None,
        snapshot_pin: bool | None = None,
    ) -> AnswerResult:
        """Answer a question by delegating to the staged workflow."""
        return await run_answer(
            self,
            question,
            mode=mode,
            history=history,
            observer=observer,
            conversation_id=conversation_id,
            snapshot_pin=snapshot_pin,
        )

    async def _answer_stock(self, question: str) -> AnswerResult:
        """Answer without cluster context using the stock system prompt."""
        messages = build_messages(prompts.STOCK_SYSTEM, question)
        reply = await self._llm.chat(messages, model=self._settings.ollama_model)
        return AnswerResult(reply, _default_scores(), {"mode": "stock"})

    async def _synthesize_answer(self, question: str, subanswers: list[str], context: str, classify: dict[str, Any], plan: ModePlan, call_llm: Callable[..., Any]) -> str:
        """Compose the final reply from subanswers, optionally via multiple drafts.

        With no subanswers, synthesize directly from context. With several
        drafts, have the fast model pick the best one.
        """
        style_hint = _style_hint(classify)
        if not subanswers:
            prompt = (
                prompts.SYNTHESIZE_PROMPT
                + "\nQuestion: "
                + question
                + "\nStyle: "
                + style_hint
                + "\nQuestionType: "
                + (classify.get("question_type") or "unknown")
            )
            return await call_llm(prompts.SYNTHESIZE_SYSTEM, prompt, context=context, model=plan.model, tag="synth")
        draft_prompts = []
        for idx in range(plan.drafts):
            # DraftIndex varies the prompt so drafts diverge.
            draft_prompts.append(
                prompts.SYNTHESIZE_PROMPT
                + "\nQuestion: "
                + question
                + "\nStyle: "
                + style_hint
                + "\nQuestionType: "
                + (classify.get("question_type") or "unknown")
                + "\nSubanswers:\n"
                + "\n".join([f"- {item}" for item in subanswers])
                + f"\nDraftIndex: {idx + 1}"
            )
        drafts: list[str] = []
        if plan.parallelism > 1 and len(draft_prompts) > 1:
            drafts = await _gather_limited(
                [
                    call_llm(
                        prompts.SYNTHESIZE_SYSTEM,
                        prompt,
                        context=context,
                        model=plan.model,
                        tag="synth",
                    )
                    for prompt in draft_prompts
                ],
                plan.parallelism,
            )
        else:
            for prompt in draft_prompts:
                drafts.append(
                    await call_llm(
                        prompts.SYNTHESIZE_SYSTEM,
                        prompt,
                        context=context,
                        model=plan.model,
                        tag="synth",
                    )
                )
        if len(drafts) == 1:
            return drafts[0]
        select_prompt = (
            prompts.DRAFT_SELECT_PROMPT
            + "\nQuestion: "
            + question
            + "\nDrafts:\n"
            + "\n\n".join([f"Draft {idx + 1}: {text}" for idx, text in enumerate(drafts)])
        )
        select_raw = await call_llm(prompts.CRITIC_SYSTEM, select_prompt, context=context, model=plan.fast_model, tag="draft_select")
        selection = _parse_json_block(select_raw, fallback={})
        # NOTE(review): int(...) raises on a non-numeric "best"; the {} fallback
        # covers missing JSON but not a junk value — confirm this is acceptable.
        idx = int(selection.get("best", 1)) - 1
        if 0 <= idx < len(drafts):
            return drafts[idx]
        return drafts[0]

    async def _score_answer(self, question: str, reply: str, plan: ModePlan, call_llm: Callable[..., Any]) -> AnswerScores:
        """Self-score the reply; returns defaults when scoring is disabled."""
        if not plan.use_scores:
            return _default_scores()
        prompt = prompts.SCORE_PROMPT + "\nQuestion: " + question + "\nAnswer: " + reply
        raw = await call_llm(prompts.SCORE_SYSTEM, prompt, model=plan.fast_model, tag="score")
        data = _parse_json_block(raw, fallback={})
        return _scores_from_json(data)

    async def _extract_claims(self, question: str, reply: str, summary: dict[str, Any], facts_used: list[str], call_llm: Callable[..., Any]) -> list[ClaimItem]:
        """Map reply claims to snapshot paths so follow-ups can re-verify them.

        Claims without resolvable evidence are dropped.
        """
        if not reply or not summary:
            return []
        summary_json = _json_excerpt(summary)
        facts_used = [line.strip() for line in (facts_used or []) if line and line.strip()]
        facts_block = ""
        if facts_used:
            facts_block = "\nFactsUsed:\n" + "\n".join([f"- {line}" for line in facts_used[:12]])
        prompt = prompts.CLAIM_MAP_PROMPT + "\nQuestion: " + question + "\nAnswer: " + reply + facts_block
        raw = await call_llm(
            prompts.CLAIM_SYSTEM,
            prompt,
            context=f"SnapshotSummaryJson:{summary_json}",
            model=self._settings.ollama_model_fast,
            tag="claim_map",
        )
        data = _parse_json_block(raw, fallback={})
        claims_raw = data.get("claims") if isinstance(data, dict) else None
        claims: list[ClaimItem] = []
        if isinstance(claims_raw, list):
            for entry in claims_raw:
                if not isinstance(entry, dict):
                    continue
                claim_text = str(entry.get("claim") or "").strip()
                claim_id = str(entry.get("id") or "").strip() or f"c{len(claims)+1}"
                evidence_items: list[EvidenceItem] = []
                for ev in entry.get("evidence") or []:
                    if not isinstance(ev, dict):
                        continue
                    path = str(ev.get("path") or "").strip()
                    if not path:
                        continue
                    reason = str(ev.get("reason") or "").strip()
                    # Record the value at claim time so drift can be shown later.
                    value = _resolve_path(summary, path)
                    evidence_items.append(EvidenceItem(path=path, reason=reason, value=value, value_at_claim=value))
                if claim_text and evidence_items:
                    claims.append(ClaimItem(id=claim_id, claim=claim_text, evidence=evidence_items))
        return claims

    async def _dedup_reply(self, reply: str, plan: ModePlan, call_llm: Callable[..., Any], tag: str) -> str:
        """Run a dedup pass only when the reply looks repetitive."""
        if not _needs_dedup(reply):
            return reply
        dedup_prompt = prompts.DEDUP_PROMPT + "\nDraft: " + reply
        return await call_llm(prompts.DEDUP_SYSTEM, dedup_prompt, model=plan.fast_model, tag=tag)

    async def _answer_followup(self, question: str, state: ConversationState, summary: dict[str, Any], classify: dict[str, Any], plan: ModePlan, call_llm: Callable[..., Any]) -> str:  # noqa: C901, ARG002
        """Answer a follow-up from stored claims, re-checking evidence values.

        Unknown node/namespace mentions trigger an evidence-fix pass before
        dedup and boilerplate stripping.
        """
        claim_ids = await self._select_claims(question, state.claims, plan, call_llm)
        selected = [claim for claim in state.claims if claim.id in claim_ids] if claim_ids else state.claims[:2]
        evidence_lines = []
        lowered = question.lower()
        for claim in selected:
            evidence_lines.append(f"Claim: {claim.claim}")
            for ev in claim.evidence:
                # Refresh the live value; annotate when it drifted since claim time.
                current = _resolve_path(summary, ev.path)
                ev.value = current
                delta_note = ""
                if ev.value_at_claim is not None and current is not None and current != ev.value_at_claim:
                    delta_note = f" (now {current})"
                evidence_lines.append(f"- {ev.path}: {ev.value_at_claim}{delta_note}")
        if any(term in lowered for term in ("hotspot", "hot spot", "hottest", "jetson", "rpi", "amd64", "arm64", "hardware", "class")):
            hotspot_lines = _hotspot_evidence(summary)
            if hotspot_lines:
                evidence_lines.append("HotspotSummary:")
                evidence_lines.extend(hotspot_lines)
        evidence_ctx = "\n".join(evidence_lines)
        prompt = prompts.FOLLOWUP_PROMPT + "\nFollow-up: " + question + "\nEvidence:\n" + evidence_ctx
        reply = await call_llm(prompts.FOLLOWUP_SYSTEM, prompt, model=plan.model, tag="followup")
        allowed_nodes = _allowed_nodes(summary)
        allowed_namespaces = _allowed_namespaces(summary)
        unknown_nodes = _find_unknown_nodes(reply, allowed_nodes)
        unknown_namespaces = _find_unknown_namespaces(reply, allowed_namespaces)
        extra_bits = []
        if unknown_nodes:
            extra_bits.append("UnknownNodes: " + ", ".join(sorted(unknown_nodes)))
        if unknown_namespaces:
            extra_bits.append("UnknownNamespaces: " + ", ".join(sorted(unknown_namespaces)))
        if allowed_nodes:
            extra_bits.append("AllowedNodes: " + ", ".join(allowed_nodes))
        if allowed_namespaces:
            extra_bits.append("AllowedNamespaces: " + ", ".join(allowed_namespaces))
        if extra_bits:
            fix_prompt = (
                prompts.EVIDENCE_FIX_PROMPT
                + "\nQuestion: "
                + question
                + "\nDraft: "
                + reply
                + "\n"
                + "\n".join(extra_bits)
            )
            reply = await call_llm(
                prompts.EVIDENCE_FIX_SYSTEM,
                fix_prompt,
                context="Evidence:\n" + evidence_ctx,
                model=plan.model,
                tag="followup_fix",
            )
        reply = await self._dedup_reply(reply, plan, call_llm, tag="dedup_followup")
        reply = _strip_followup_meta(reply)
        return reply

    async def _select_claims(self, question: str, claims: list[ClaimItem], plan: ModePlan, call_llm: Callable[..., Any]) -> list[str]:
        """Ask the fast model which stored claims a follow-up refers to."""
        if not claims:
            return []
        claims_brief = [{"id": claim.id, "claim": claim.claim} for claim in claims]
        prompt = prompts.SELECT_CLAIMS_PROMPT + "\nFollow-up: " + question + "\nClaims: " + json.dumps(claims_brief)
        raw = await call_llm(prompts.FOLLOWUP_SYSTEM, prompt, model=plan.fast_model, tag="select_claims")
        data = _parse_json_block(raw, fallback={})
        ids = data.get("claim_ids") if isinstance(data, dict) else []
        if isinstance(ids, list):
            return [str(item) for item in ids if item]
        return []

    def _get_state(self, conversation_id: str | None) -> ConversationState | None:
        """Load stored conversation state, or None when absent/unidentified."""
        if not conversation_id:
            return None
        state_payload = self._store.get(conversation_id)
        return _state_from_payload(state_payload) if state_payload else None

    def _store_state(self, conversation_id: str, claims: list[ClaimItem], summary: dict[str, Any], snapshot: dict[str, Any] | None, pin_snapshot: bool) -> None:
        """Persist claims (and optionally the snapshot) for later follow-ups."""
        snapshot_id = _snapshot_id(summary)
        pinned_snapshot = snapshot if pin_snapshot else None
        # NOTE(review): time.monotonic() is process-local; if ClaimStore
        # persists across restarts, TTL math on this value may misbehave —
        # confirm, or consider time.time().
        payload = {
            "updated_at": time.monotonic(),
            "claims": _claims_to_payload(claims),
            "snapshot_id": snapshot_id,
            "snapshot": pinned_snapshot,
        }
        self._store.set(conversation_id, payload)

    def _cleanup_state(self) -> None:
        """Evict expired conversation entries from the store."""
        self._store.cleanup()

View File

@ -0,0 +1,189 @@
from __future__ import annotations
import json
import re
from typing import Any
from ._base import *
MAX_FACT_LINE_CHARS = 180
MAX_KB_LINE_CHARS = 220
def _factsheet_kb_chars(mode: str, default_chars: int) -> int:
if mode == "genius":
return min(max(default_chars, 4000), 6000)
if mode == "smart":
return min(max(default_chars, 3000), 4500)
return max(1200, default_chars)
def _factsheet_line_limit(mode: str) -> int:
if mode == "genius":
return 30
if mode == "smart":
return 22
return 14
def _factsheet_instruction(mode: str) -> str:
if mode == "genius":
return (
"Start with a direct conclusion, then include the strongest supporting facts and one caveat. "
"Keep it to 4-8 sentences. If data is missing, name the missing metric explicitly."
)
if mode == "smart":
return (
"Start with a direct conclusion and support it with key facts. Keep it to 2-5 sentences. "
"If data is missing, say exactly what is missing and suggest atlas-genius."
)
return "Keep it to 1-3 sentences. If key data is missing, say what is missing and suggest atlas-smart."
def _factsheet_model(mode: str, plan: ModePlan) -> str:
if mode in {"quick", "fast"}:
return plan.fast_model
return plan.model
def _is_plain_math_question(question: str) -> bool:
lowered = question.lower().strip()
if not lowered:
return False
cluster_markers = (
"titan",
"atlas",
"cluster",
"node",
"pod",
"namespace",
"workload",
"grafana",
"alert",
"k8s",
"kubernetes",
"rpi",
"longhorn",
"postgres",
"victoria",
"ollama",
)
if any(token in lowered for token in cluster_markers):
return False
return bool(
re.fullmatch(r"[0-9\s+\-*/().=]+", lowered)
or re.search(r"\bwhat(?:'s| is)\s+\d+\s*[-+*/]\s*\d+\b", lowered)
)
def _quick_fact_sheet_lines(question: str, summary_lines: list[str], kb_lines: list[str], *, limit: int) -> list[str]:  # noqa: C901
    """Select up to `limit` snapshot/KB lines most relevant to `question`.

    Summary lines are scored by priority markers and question-token overlap;
    a small quota of matching KB lines is appended, then the merged list is
    de-duplicated preserving order.
    """
    # Question tokens (3+ chars), minus tokens too generic to discriminate.
    tokens = {
        token
        for token in re.findall(r"[a-z0-9][a-z0-9_-]{2,}", question.lower())
        if token not in GENERIC_METRIC_TOKENS
    }
    priority_markers = (
        "snapshot:",
        "nodes_total",
        "nodes_ready",
        "nodes_not_ready",
        "workers_ready",
        "workers_not_ready",
        "control_plane",
        "worker_nodes",
        "hottest",
        "postgres",
        "pods",
        "longhorn",
        "titan-",
        "rpi5",
        "rpi4",
        "jetson",
        "amd64",
    )
    scored: list[tuple[int, str]] = []
    for raw in summary_lines:
        line = raw.strip()
        if not line:
            continue
        lowered = line.lower()
        score = 0
        if any(marker in lowered for marker in priority_markers):
            score += 4
        # Token overlap outweighs a single marker hit.
        overlap = sum(1 for token in tokens if token in lowered)
        score += overlap * 3
        if len(line) <= MAX_FACT_LINE_CHARS:
            score += 1
        if score > 0:
            scored.append((score, line))
    scored.sort(key=lambda item: item[0], reverse=True)
    selected = [line for _, line in scored[:limit]]
    if not selected:
        # Nothing scored: fall back to the first non-blank summary lines.
        selected = [line.strip() for line in summary_lines if line.strip()][:limit]
    kb_selected: list[str] = []
    for raw in kb_lines:
        line = raw.strip()
        if not line or len(line) > MAX_KB_LINE_CHARS:
            continue
        lowered = line.lower()
        # Skip KB bookkeeping lines.
        if "kb file:" in lowered or "kb: atlas.json" in lowered:
            continue
        overlap = sum(1 for token in tokens if token in lowered)
        if overlap > 0 or any(marker in lowered for marker in ("runbook", "titan-", "rpi5", "rpi4", "amd64", "jetson")):
            kb_selected.append(line)
        if len(kb_selected) >= max(4, limit // 3):
            break
    merged = []
    seen: set[str] = set()
    for line in selected + kb_selected:
        if line not in seen:
            seen.add(line)
            merged.append(line)
        if len(merged) >= limit:
            break
    return merged
def _quick_fact_sheet_text(lines: list[str]) -> str:
if not lines:
return "Fact Sheet:\n- No snapshot facts available."
body = "\n".join([f"- {line}" for line in lines])
return "Fact Sheet:\n" + body
def _quick_fact_sheet_heuristic_answer(question: str, fact_lines: list[str]) -> str:
lowered = question.lower()
if (
any(token in lowered for token in ("placement", "schedule", "last resort", "last-resort"))
and any(token in lowered for token in ("node", "workload", "worker", "titan"))
):
return (
"General workload placement is: prefer rpi5 workers first, then rpi4 workers. "
"titan-22 is the last-resort general compute node, and titan-24 is the absolute last resort "
"reserved for heavy one-offs."
)
for line in fact_lines:
compact = line.replace(" ", "")
match = re.search(r"nodes_total[:=](\d+),ready[:=](\d+),not_ready[:=](\d+)", compact)
if not match:
continue
total = match.group(1)
ready = match.group(2)
not_ready = match.group(3)
if "how many" in lowered and "ready" in lowered and "node" in lowered:
return f"The latest snapshot shows {ready} ready nodes out of {total} total ({not_ready} not ready)."
if ("not ready" in lowered or "unready" in lowered) and "node" in lowered:
return f"The latest snapshot shows {not_ready} not-ready nodes ({ready} ready out of {total} total)."
return ""
def _json_excerpt(summary: dict[str, Any], max_chars: int = 12000) -> str:
raw = json.dumps(summary, ensure_ascii=False)
return raw[:max_chars]
# Re-export single-underscore helpers so sibling modules can wildcard-import
# them (underscore names are skipped by `import *` otherwise).
__all__ = [name for name in globals() if name.startswith("_") and not name.startswith("__")]

View File

@ -0,0 +1,459 @@
from __future__ import annotations
import re
from typing import Any
from atlasbot.llm import prompts
from atlasbot.llm.client import parse_json
from ._base import *
from .retrieval_ext import _dedupe_lines
def _merge_fact_lines(primary: list[str], fallback: list[str]) -> list[str]:
merged: list[str] = []
for line in primary + fallback:
value = (line or "").strip()
if value and value not in merged:
merged.append(value)
return merged
def _strip_unknown_entities(reply: str, unknown_nodes: list[str], unknown_namespaces: list[str]) -> str:
if not reply:
return reply
if not unknown_nodes and not unknown_namespaces:
return reply
sentences = [s.strip() for s in re.split(r"(?<=[.!?])\s+", reply) if s.strip()]
if not sentences:
return reply
lowered_nodes = [node.lower() for node in unknown_nodes]
lowered_namespaces = [ns.lower() for ns in unknown_namespaces]
kept: list[str] = []
for sent in sentences:
lower = sent.lower()
if lowered_nodes and any(node in lower for node in lowered_nodes):
continue
if lowered_namespaces and any(f"namespace {ns}" in lower for ns in lowered_namespaces):
continue
kept.append(sent)
cleaned = " ".join(kept).strip()
return cleaned or reply
def _needs_evidence_guard(reply: str, facts: list[str]) -> bool:
if not reply or not facts:
return False
lower_reply = reply.lower()
fact_text = " ".join(facts).lower()
node_pattern = re.compile(r"\b(titan-[0-9a-z]+|node-?\d+)\b", re.IGNORECASE)
nodes = {m.group(1).lower() for m in node_pattern.finditer(reply)}
if nodes:
missing = [node for node in nodes if node not in fact_text]
if missing:
return True
pressure_terms = ("pressure", "diskpressure", "memorypressure", "pidpressure", "headroom")
if any(term in lower_reply for term in pressure_terms) and not any(term in fact_text for term in pressure_terms):
return True
arch_terms = ("amd64", "arm64", "rpi", "rpi4", "rpi5", "jetson")
return any(term in lower_reply for term in arch_terms) and not any(term in fact_text for term in arch_terms)
async def _contradiction_decision(ctx: ContradictionContext, attempts: int = 1) -> dict[str, Any]:
    """Ask the fast model whether the drafted reply should defer to the facts.

    Runs the contradiction prompt up to *attempts* times and keeps the verdict
    with the highest self-reported confidence.  Defaults to trusting the facts
    (``use_facts=True``, confidence 50) when the model output is unusable.
    """
    best = {"use_facts": True, "confidence": 50}
    # Cap the evidence block so the prompt stays small.
    facts_block = "\n".join(ctx.facts[:12])
    for idx in range(max(1, attempts)):
        # Label retries so repeated calls do not look identical to the model.
        variant = f"Variant: {idx + 1}" if attempts > 1 else ""
        prompt = (
            prompts.CONTRADICTION_PROMPT.format(question=ctx.question, draft=ctx.reply, facts=facts_block)
            + ("\n" + variant if variant else "")
        )
        raw = await ctx.call_llm(
            prompts.CONTRADICTION_SYSTEM,
            prompt,
            model=ctx.plan.fast_model,
            tag="contradiction",
        )
        data = _parse_json_block(raw, fallback={})
        try:
            confidence = int(data.get("confidence", 50))
        except Exception:
            # Non-numeric confidence from the model: fall back to neutral.
            confidence = 50
        use_facts = bool(data.get("use_facts", True))
        # ">=" so a later attempt at equal confidence wins (freshest verdict).
        if confidence >= best.get("confidence", 0):
            best = {"use_facts": use_facts, "confidence": confidence}
    return best
def _filter_lines_by_keywords(lines: list[str], keywords: list[str], max_lines: int) -> list[str]:
    """Keep lines containing any expanded keyword token; fall back to the originals."""
    if not lines:
        return []
    tokens = _expand_tokens(keywords)
    if not tokens:
        return lines[:max_lines]

    def _hits(candidate: str) -> bool:
        lowered = candidate.lower()
        return any(tok in lowered for tok in tokens)

    matching = [candidate for candidate in lines if _hits(candidate)]
    return (matching or lines)[:max_lines]
def _rank_metric_lines(lines: list[str], tokens: set[str], max_lines: int) -> list[str]:
if not lines or not tokens:
return []
ranked: list[tuple[int, int, str]] = []
for line in lines:
lower = line.lower()
hits = sum(1 for tok in tokens if tok in lower)
if not hits:
continue
has_number = 1 if re.search(r"\d", line) else 0
ranked.append((has_number, hits, line))
ranked.sort(key=lambda item: (-item[0], -item[1], item[2]))
return [item[2] for item in ranked[:max_lines]]
def _select_metric_line(lines: list[str], question: str, tokens: list[str] | set[str]) -> str | None:
    """Pick the single best metric line, preferring totals for counting questions."""
    if not lines or not tokens:
        return None
    normalized = {str(tok).lower() for tok in tokens if tok}
    ranked = _rank_metric_lines(lines, normalized, max_lines=6)
    if not ranked:
        return None
    question_lower = (question or "").lower()
    wants_count = any(term in question_lower for term in ("how many", "count", "total"))
    if wants_count:
        for candidate in ranked:
            lowered = candidate.lower()
            if "total" in lowered or "count" in lowered:
                return candidate
    return ranked[0]
def _format_direct_metric_line(line: str) -> str:
    """Render a raw summary line as a sentence, trying ':' then '=' formats."""
    if not line:
        return ""
    for marker, formatter in ((":", _format_colon_metric), ("=", _format_equals_metric)):
        if marker in line:
            rendered = formatter(line)
            if rendered:
                return rendered
    return line
def _format_colon_metric(line: str) -> str | None:
    """Turn a 'key: value' line into a sentence; special-cases node totals."""
    raw_key, _, raw_value = line.partition(":")
    key = raw_key.strip().replace("_", " ")
    value = raw_value.strip()
    if not value:
        return None
    if key == "nodes":
        rendered = _format_nodes_value(value)
        if rendered:
            return rendered
    if key in {"nodes total", "nodes_total"}:
        return f"Atlas has {value} total nodes."
    return f"{key} is {value}."
def _format_equals_metric(line: str) -> str | None:
pairs: list[str] = []
for part in line.split(","):
if "=" not in part:
continue
key, value = part.split("=", 1)
key = key.strip().replace("_", " ")
value = value.strip()
if not value:
continue
if key in {"nodes total", "nodes_total"}:
return f"Atlas has {value} total nodes."
pairs.append(f"{key} is {value}")
if not pairs:
return None
if len(pairs) == 1:
return f"{pairs[0]}."
return "; ".join(pairs) + "."
def _format_nodes_value(value: str) -> str | None:
parts = [p.strip() for p in value.split(",") if p.strip()]
total = None
rest: list[str] = []
for part in parts:
if part.startswith("total="):
total = part.split("=", 1)[1]
else:
rest.append(part.replace("_", " "))
if not total:
return None
if rest:
return f"Atlas has {total} total nodes ({'; '.join(rest)})."
return f"Atlas has {total} total nodes."
def _global_facts(lines: list[str]) -> list[str]:
    """Pull cluster-wide summary lines (node counts, cluster name), capped at six."""
    if not lines:
        return []
    wanted = ("nodes_total", "nodes_ready", "cluster_name", "cluster", "nodes_not_ready")
    matches = [entry for entry in lines if any(key in entry.lower() for key in wanted)]
    return _dedupe_lines(matches, limit=6)
def _has_keyword_overlap(lines: list[str], keywords: list[str]) -> bool:
    """Return True when any line contains any expanded keyword token."""
    if not lines or not keywords:
        return False
    tokens = _expand_tokens(keywords)
    if not tokens:
        return False
    return any(any(tok in entry.lower() for tok in tokens) for entry in lines)
def _merge_tokens(primary: list[str], secondary: list[str], third: list[str] | None = None) -> list[str]:
merged: list[str] = []
for token in primary + secondary + (third or []):
if not token:
continue
if token not in merged:
merged.append(token)
return merged
def _extract_question_tokens(question: str) -> list[str]:
    """Split a question into unique lowercase tokens above the minimum length."""
    if not question:
        return []
    collected: list[str] = []
    for fragment in re.split(r"[^a-zA-Z0-9_-]+", question.lower()):
        if len(fragment) >= TOKEN_MIN_LEN and fragment not in collected:
            collected.append(fragment)
    return collected
def _expand_tokens(tokens: list[str]) -> list[str]:
    """Split string tokens into unique lowercase word fragments above the minimum length."""
    if not tokens:
        return []
    expanded: list[str] = []
    for token in tokens:
        if not isinstance(token, str):
            continue
        for fragment in re.split(r"[^a-zA-Z0-9_-]+", token.lower()):
            if len(fragment) >= TOKEN_MIN_LEN and fragment not in expanded:
                expanded.append(fragment)
    return expanded
def _ensure_token_coverage(lines: list[str], tokens: list[str], summary_lines: list[str], max_add: int = 4) -> list[str]:
    """Prepend up to *max_add* summary lines covering tokens the facts miss."""
    if not lines or not tokens or not summary_lines:
        return lines
    haystack = " ".join(lines).lower()
    uncovered = [tok for tok in tokens if tok and tok.lower() not in haystack]
    if not uncovered:
        return lines
    additions: list[str] = []
    for token in uncovered:
        if len(additions) >= max_add:
            break
        needle = token.lower()
        extra = next(
            (entry for entry in summary_lines if needle in entry.lower() and entry not in lines and entry not in additions),
            None,
        )
        if extra is not None:
            additions.append(extra)
    if not additions:
        return lines
    return _merge_fact_lines(additions, lines)
def _best_keyword_line(lines: list[str], keywords: list[str]) -> str | None:
    """Return the line with the most expanded-keyword hits, or None when no hits."""
    if not lines or not keywords:
        return None
    tokens = _expand_tokens(keywords)
    if not tokens:
        return None
    winner: str | None = None
    top = 0
    for entry in lines:
        lowered = entry.lower()
        hits = sum(1 for tok in tokens if tok in lowered)
        if hits > top:
            top = hits
            winner = entry
    return winner
def _line_starting_with(lines: list[str], prefix: str) -> str | None:
if not lines or not prefix:
return None
lower_prefix = prefix.lower()
for line in lines:
if str(line).lower().startswith(lower_prefix):
return line
return None
def _non_rpi_nodes(summary: dict[str, Any]) -> dict[str, list[str]]:
hardware = summary.get("hardware_by_node") if isinstance(summary, dict) else None
if not isinstance(hardware, dict):
return {}
grouped: dict[str, list[str]] = {}
for node, hw in hardware.items():
if not isinstance(node, str) or not isinstance(hw, str):
continue
if hw.startswith("rpi"):
continue
grouped.setdefault(hw, []).append(node)
for nodes in grouped.values():
nodes.sort()
return grouped
def _format_hardware_groups(groups: dict[str, list[str]], label: str) -> str:
if not groups:
return ""
parts = []
for hw, nodes in sorted(groups.items()):
parts.append(f"{hw} ({', '.join(nodes)})")
return f"{label}: " + "; ".join(parts) + "."
def _lexicon_context(summary: dict[str, Any]) -> str: # noqa: C901
if not isinstance(summary, dict):
return ""
lexicon = summary.get("lexicon")
if not isinstance(lexicon, dict):
return ""
terms = lexicon.get("terms")
aliases = lexicon.get("aliases")
lines: list[str] = []
if isinstance(terms, list):
for entry in terms[:8]:
if not isinstance(entry, dict):
continue
term = entry.get("term")
meaning = entry.get("meaning")
if term and meaning:
lines.append(f"{term}: {meaning}")
if isinstance(aliases, dict):
for key, value in list(aliases.items())[:6]:
if key and value:
lines.append(f"alias {key} -> {value}")
if not lines:
return ""
return "Lexicon:\n" + "\n".join(lines)
def _parse_json_block(text: str, *, fallback: dict[str, Any]) -> dict[str, Any]:
    """Extract the first {...} span from *text* and parse it, else parse the whole text."""
    stripped = text.strip()
    found = re.search(r"\{.*\}", stripped, flags=re.S)
    candidate = found.group(0) if found else stripped
    return parse_json(candidate, fallback=fallback)
def _parse_json_list(text: str) -> list[dict[str, Any]]:
    """Parse the first [...] span (or the whole text) and keep only dict entries."""
    stripped = text.strip()
    found = re.search(r"\[.*\]", stripped, flags=re.S)
    payload = found.group(0) if found else stripped
    data = parse_json(payload, fallback={})
    if not isinstance(data, list):
        return []
    return [entry for entry in data if isinstance(entry, dict)]
def _scores_from_json(data: dict[str, Any]) -> AnswerScores:
    """Build AnswerScores from parsed grader JSON, defaulting to 60 / medium risk."""
    risk = str(data.get("hallucination_risk") or "medium")
    return AnswerScores(
        confidence=_coerce_int(data.get("confidence"), 60),
        relevance=_coerce_int(data.get("relevance"), 60),
        satisfaction=_coerce_int(data.get("satisfaction"), 60),
        hallucination_risk=risk,
    )
def _coerce_int(value: Any, default: int) -> int:
try:
return int(float(value))
except (TypeError, ValueError):
return default
def _default_scores() -> AnswerScores:
    """Neutral mid-scale scores used when the grading LLM output cannot be parsed."""
    return AnswerScores(confidence=60, relevance=60, satisfaction=60, hallucination_risk="medium")
def _style_hint(classify: dict[str, Any]) -> str:
style = (classify.get("answer_style") or "").strip().lower()
qtype = (classify.get("question_type") or "").strip().lower()
if style == "insightful" or qtype in {"open_ended", "planning"}:
return "insightful"
return "direct"
def _needs_evidence_fix(reply: str, classify: dict[str, Any]) -> bool:
if not reply:
return False
lowered = reply.lower()
missing_markers = (
"don't have",
"do not have",
"don't know",
"cannot",
"can't",
"need to",
"would need",
"does not provide",
"does not mention",
"not mention",
"not provided",
"not in context",
"not referenced",
"missing",
"no specific",
"no information",
)
if classify.get("needs_snapshot") and any(marker in lowered for marker in missing_markers):
return True
return classify.get("question_type") in {"metric", "diagnostic"} and not re.search(r"\d", reply)
def _should_use_insight_guard(classify: dict[str, Any]) -> bool:
style = (classify.get("answer_style") or "").strip().lower()
qtype = (classify.get("question_type") or "").strip().lower()
return style == "insightful" or qtype in {"open_ended", "planning"}
async def _apply_insight_guard(inputs: InsightGuardInput) -> str:
    """Run a quality guard over "insightful" answers and rewrite them if needed.

    Returns the original reply when the guard does not apply or approves;
    otherwise asks the main model for a fixed version, seeded with facts.
    """
    if not inputs.reply or not _should_use_insight_guard(inputs.classify):
        return inputs.reply
    guard_prompt = prompts.INSIGHT_GUARD_PROMPT.format(question=inputs.question, answer=inputs.reply)
    guard_raw = await inputs.call_llm(
        prompts.INSIGHT_GUARD_SYSTEM,
        guard_prompt,
        context=inputs.context,
        model=inputs.plan.fast_model,  # cheap model suffices for a yes/no check
        tag="insight_guard",
    )
    guard = _parse_json_block(guard_raw, fallback={})
    # Strict identity check: anything other than literal true triggers a rewrite.
    if guard.get("ok") is True:
        return inputs.reply
    fix_prompt = prompts.INSIGHT_FIX_PROMPT.format(question=inputs.question, answer=inputs.reply)
    if inputs.facts:
        fix_prompt = fix_prompt + "\nFacts:\n" + "\n".join(inputs.facts[:6])  # cap fact spam
    return await inputs.call_llm(
        prompts.INSIGHT_FIX_SYSTEM,
        fix_prompt,
        context=inputs.context,
        model=inputs.plan.model,  # full model writes the corrected answer
        tag="insight_fix",
    )
# Export every single-underscore helper defined above; dunder names stay private.
__all__ = [name for name in globals() if name.startswith("_") and not name.startswith("__")]

View File

@ -0,0 +1,276 @@
from __future__ import annotations
import difflib
import re
import time
from typing import Any
from ._base import *
def _reply_matches_metric_facts(reply: str, metric_facts: list[str], tokens: list[str] | set[str] | None = None) -> bool:
if not reply or not metric_facts:
return True
reply_numbers = set(re.findall(r"\d+(?:\\.\d+)?", reply))
if not reply_numbers:
return False
fact_numbers: set[str] = set()
value_pattern = re.compile(r"(?:>=|<=|=|:)\s*(\d+(?:\.\d+)?)")
filtered = metric_facts
if tokens:
token_set = {str(tok).lower() for tok in tokens if tok}
focused = []
for line in metric_facts:
key = line.split(":", 1)[0].lower()
if any(tok in key for tok in token_set):
focused.append(line)
if focused:
filtered = focused
for line in filtered:
for match in value_pattern.findall(line):
fact_numbers.add(match)
if not fact_numbers:
return False
return bool(reply_numbers & fact_numbers)
def _needs_dedup(reply: str) -> bool:
    """True when the reply repeats a normalized sentence (and is long enough to matter)."""
    if not reply:
        return False
    sentences = [part.strip() for part in re.split(r"(?<=[.!?])\s+", reply) if part.strip()]
    if len(sentences) < DEDUP_MIN_SENTENCES:
        return False
    normalized = [re.sub(r"\s+", " ", part.lower()) for part in sentences]
    return len(set(normalized)) != len(normalized)
def _needs_focus_fix(question: str, reply: str, classify: dict[str, Any]) -> bool:
if not reply:
return False
q_lower = (question or "").lower()
if classify.get("question_type") not in {"metric", "diagnostic"} and not re.search(r"\b(how many|list|count)\b", q_lower):
return False
missing_markers = (
"does not provide",
"does not specify",
"not available",
"not provided",
"cannot determine",
"don't have",
"do not have",
"insufficient",
"no data",
)
if any(marker in reply.lower() for marker in missing_markers):
return True
if reply.count(".") <= 1:
return False
extra_markers = ("for more", "if you need", "additional", "based on")
return any(marker in reply.lower() for marker in extra_markers)
def _extract_keywords(raw_question: str, normalized: str, sub_questions: list[str], keywords: list[Any] | None) -> list[str]:
    """Derive up to 12 unique lowercase keyword tokens from question text and hints."""
    stopwords = {
        "the", "and", "for", "with", "that", "this", "what", "which", "when",
        "where", "who", "why", "how", "tell", "show", "list", "give", "about",
        "right", "now",
    }
    collected: list[str] = []
    for source in (raw_question, normalized, *sub_questions):
        for fragment in re.split(r"[^a-zA-Z0-9_-]+", source.lower()):
            if len(fragment) >= TOKEN_MIN_LEN and fragment not in stopwords:
                collected.append(fragment)
    for hint in keywords or []:
        if not isinstance(hint, str):
            continue
        cleaned = hint.strip().lower()
        if cleaned and cleaned not in stopwords and cleaned not in collected:
            collected.append(cleaned)
    return list(dict.fromkeys(collected))[:12]
def _allowed_nodes(summary: dict[str, Any]) -> list[str]:
hardware = summary.get("hardware_by_node") if isinstance(summary.get("hardware_by_node"), dict) else {}
if hardware:
return sorted([node for node in hardware if isinstance(node, str)])
return []
def _allowed_namespaces(summary: dict[str, Any]) -> list[str]:
namespaces: list[str] = []
for entry in summary.get("namespace_pods") or []:
if isinstance(entry, dict):
name = entry.get("namespace")
if name:
namespaces.append(str(name))
return sorted(set(namespaces))
def _find_unknown_nodes(reply: str, allowed: list[str]) -> list[str]:
if not reply or not allowed:
return []
pattern = re.compile(r"\b(titan-[0-9a-z]+|node-?\d+)\b", re.IGNORECASE)
found = {m.group(1) for m in pattern.finditer(reply)}
if not found:
return []
allowed_set = {a.lower() for a in allowed}
return sorted({item for item in found if item.lower() not in allowed_set})
def _find_unknown_namespaces(reply: str, allowed: list[str]) -> list[str]:
if not reply or not allowed:
return []
pattern = re.compile(r"\bnamespace\s+([a-z0-9-]+)\b", re.IGNORECASE)
found = {m.group(1) for m in pattern.finditer(reply)}
if not found:
return []
allowed_set = {a.lower() for a in allowed}
return sorted({item for item in found if item.lower() not in allowed_set})
def _needs_runbook_fix(reply: str, allowed: list[str]) -> bool:
if not reply or not allowed:
return False
paths = set(re.findall(r"runbooks/[A-Za-z0-9._-]+", reply))
if not paths:
return False
allowed_set = {p.lower() for p in allowed}
return any(path.lower() not in allowed_set for path in paths)
def _needs_runbook_reference(question: str, allowed: list[str], reply: str) -> bool:
if not allowed or not question:
return False
lowered = question.lower()
cues = ("runbook", "checklist", "documented", "documentation", "where", "guide")
if not any(cue in lowered for cue in cues):
return False
if not reply:
return True
for token in re.findall(r"runbooks/[A-Za-z0-9._-]+", reply):
if token.lower() in {p.lower() for p in allowed}:
return False
return True
def _best_runbook_match(candidate: str, allowed: list[str]) -> str | None:
    """Closest allowed runbook path by SequenceMatcher ratio, above the threshold."""
    if not candidate or not allowed:
        return None
    needle = candidate.lower()
    best_path: str | None = None
    best_ratio = 0.0
    for path in allowed:
        ratio = difflib.SequenceMatcher(a=needle, b=path.lower()).ratio()
        if ratio > best_ratio:
            best_ratio = ratio
            best_path = path
    return best_path if best_ratio >= RUNBOOK_SIMILARITY_THRESHOLD else None
def _resolve_path(data: Any, path: str) -> Any | None:
if path.startswith("line:"):
return path.split("line:", 1)[1].strip()
cursor = data
for part in re.split(r"\.(?![^\[]*\])", path):
if not part:
continue
match = re.match(r"^(\w+)(?:\[(\d+)\])?$", part)
if not match:
return None
key = match.group(1)
index = match.group(2)
if isinstance(cursor, dict):
cursor = cursor.get(key)
else:
return None
if index is not None:
idx = int(index)
if isinstance(cursor, list) and 0 <= idx < len(cursor):
cursor = cursor[idx]
else:
return None
return cursor
def _snapshot_id(summary: dict[str, Any]) -> str | None:
if not summary:
return None
for key in ("generated_at", "snapshot_ts", "snapshot_id"):
value = summary.get(key)
if isinstance(value, str) and value:
return value
return None
def _claims_to_payload(claims: list[ClaimItem]) -> list[dict[str, Any]]:
    """Serialize claim objects into plain dicts suitable for persistence."""
    return [
        {
            "id": claim.id,
            "claim": claim.claim,
            "evidence": [
                {
                    "path": ev.path,
                    "reason": ev.reason,
                    "value_at_claim": ev.value_at_claim,
                }
                for ev in claim.evidence
            ],
        }
        for claim in claims
    ]
def _state_from_payload(payload: dict[str, Any] | None) -> ConversationState | None:
    """Rehydrate a ConversationState from a persisted payload dict.

    Claims missing an id, text, or any usable evidence entry are dropped
    rather than partially restored.  Returns None for an empty payload.
    """
    if not payload:
        return None
    claims_raw = payload.get("claims") if isinstance(payload, dict) else None
    claims: list[ClaimItem] = []
    if isinstance(claims_raw, list):
        for entry in claims_raw:
            if not isinstance(entry, dict):
                continue
            claim_text = str(entry.get("claim") or "").strip()
            claim_id = str(entry.get("id") or "").strip()
            if not claim_text or not claim_id:
                continue
            evidence_items: list[EvidenceItem] = []
            for ev in entry.get("evidence") or []:
                if not isinstance(ev, dict):
                    continue
                path = str(ev.get("path") or "").strip()
                if not path:
                    continue
                reason = str(ev.get("reason") or "").strip()
                value_at_claim = ev.get("value_at_claim")
                evidence_items.append(EvidenceItem(path=path, reason=reason, value_at_claim=value_at_claim))
            # A claim is only kept when at least one evidence item survived.
            if evidence_items:
                claims.append(ClaimItem(id=claim_id, claim=claim_text, evidence=evidence_items))
    return ConversationState(
        updated_at=float(payload.get("updated_at") or time.monotonic()),  # fall back to "now"
        claims=claims,
        snapshot_id=payload.get("snapshot_id"),
        snapshot=payload.get("snapshot"),
    )
# Export every single-underscore helper defined above; dunder names stay private.
__all__ = [name for name in globals() if name.startswith("_") and not name.startswith("__")]

View File

@ -0,0 +1,344 @@
from __future__ import annotations
import asyncio
import json
import re
from collections.abc import Awaitable
from collections.abc import Callable
from typing import Any
from atlasbot.llm import prompts
from atlasbot.llm.client import parse_json
from ._base import *
from .post_ext import _extract_keywords
def _parse_json_block(text: str, *, fallback: dict[str, Any]) -> dict[str, Any]:
    """Parse the first {...} object found in *text*, else parse the full text."""
    cleaned = text.strip()
    obj = re.search(r"\{.*\}", cleaned, flags=re.S)
    return parse_json(obj.group(0) if obj else cleaned, fallback=fallback)
async def _select_metric_chunks(
    call_llm: Callable[..., Awaitable[str]],
    ctx: dict[str, Any],
    chunks: list[dict[str, Any]],
    plan: ModePlan,
) -> tuple[list[str], list[str]]:
    """Choose which metric keys (and their chunk ids) are relevant to the question.

    Returns ``(selected_keys, chunk_ids)``; both empty when there is nothing to
    select from.  The LLM proposes keys, a keyword filter keeps the selection
    anchored to the question, and a validation pass can add missed keys.
    """
    summary_lines, question, sub_questions, keywords, token_set = _metric_ctx_values(ctx)
    if not summary_lines or not chunks:
        return [], []
    keys = _extract_metric_keys(summary_lines)
    if not keys:
        return [], []
    max_keys = max(4, plan.max_subquestions * 2)
    candidate_keys = _filter_metric_keys(keys, token_set)
    # Fall back to the full key list when no keyword-ranked candidates exist.
    available_keys = candidate_keys or keys
    prompt = prompts.METRIC_KEYS_PROMPT.format(available="\n".join(available_keys), max_keys=max_keys)
    raw = await call_llm(
        prompts.METRIC_KEYS_SYSTEM,
        prompt + "\nQuestion: " + str(question) + "\nSubQuestions:\n" + "\n".join([str(item) for item in sub_questions]),
        context="Keywords:\n" + ", ".join([str(item) for item in keywords if item]),
        model=plan.fast_model,
        tag="metric_keys",
    )
    selected = _parse_key_list(raw, available_keys, max_keys)
    if candidate_keys:
        # Merge keyword-derived candidates so the LLM cannot drop obvious matches.
        selected = _merge_metric_keys(selected, candidate_keys, max_keys)
    # If the LLM picked keys with no keyword overlap at all, trust the keywords.
    if selected and candidate_keys and not _metric_key_overlap(selected, token_set):
        selected = candidate_keys[:max_keys]
    if not selected and candidate_keys:
        selected = candidate_keys[:max_keys]
    if available_keys:
        # Second pass: ask the model which required keys are still missing.
        missing = await _validate_metric_keys(
            call_llm,
            {
                "question": question,
                "sub_questions": sub_questions,
                "selected": selected,
            },
            available_keys,
            plan,
        )
        if missing:
            selected = _merge_metric_keys(selected, missing, max_keys)
    if not selected:
        return [], []
    ids = _chunk_ids_for_keys(chunks, selected)
    return selected, ids
async def _validate_metric_keys(
    call_llm: Callable[..., Awaitable[str]],
    ctx: dict[str, Any],
    available: list[str],
    plan: ModePlan,
) -> list[str]:
    """Ask the fast model which *available* keys the current selection missed.

    Only keys present in the (capped) available list are returned; anything
    else the model invents is discarded.
    """
    if not available:
        return []
    question = str(ctx.get("question") or "")
    sub_questions = ctx.get("sub_questions") if isinstance(ctx.get("sub_questions"), list) else []
    selected = ctx.get("selected") if isinstance(ctx.get("selected"), list) else []
    # Cap how many keys we show the model so the prompt stays bounded.
    cap = max(12, plan.max_subquestions * 4)
    available_list = available[:cap]
    prompt = prompts.METRIC_KEYS_VALIDATE_PROMPT.format(
        question=question,
        sub_questions=json.dumps(sub_questions),
        selected=json.dumps(selected),
        available="\n".join(available_list),
    )
    raw = await call_llm(
        prompts.METRIC_KEYS_VALIDATE_SYSTEM,
        prompt,
        model=plan.fast_model,
        tag="metric_keys_validate",
    )
    parsed = _parse_json_block(raw, fallback={})
    items = parsed.get("missing") if isinstance(parsed, dict) else []
    if not isinstance(items, list):
        return []
    available_set = set(available_list)
    out: list[str] = []
    for item in items:
        # Keep only real, deduplicated keys from the offered list.
        if isinstance(item, str) and item in available_set and item not in out:
            out.append(item)
    return out
async def _gather_limited(coros: list[Awaitable[Any]], limit: int) -> list[Any]:
if not coros:
return []
semaphore = asyncio.Semaphore(max(1, limit))
async def runner(coro: Awaitable[Any]) -> Any:
async with semaphore:
return await coro
return await asyncio.gather(*(runner(coro) for coro in coros))
def _metric_ctx_values(ctx: dict[str, Any]) -> tuple[list[str], str, list[str], list[str], set[str]]:
    """Unpack (summary_lines, question, sub_questions, keywords, token_set) from ctx.

    Returns empty defaults when ctx lacks a summary_lines list.  The token set
    combines explicit keyword tokens with tokens mined from the question, then
    expands them with naive singular/plural variants.
    """
    summary_lines = ctx.get("summary_lines") if isinstance(ctx, dict) else None
    if not isinstance(summary_lines, list):
        return [], "", [], [], set()
    question = ctx.get("question") if isinstance(ctx, dict) else ""
    sub_questions = ctx.get("sub_questions") if isinstance(ctx.get("sub_questions"), list) else []
    keywords = ctx.get("keywords") if isinstance(ctx.get("keywords"), list) else []
    keyword_tokens = ctx.get("keyword_tokens") if isinstance(ctx.get("keyword_tokens"), list) else []
    token_set = {str(token).lower() for token in keyword_tokens if token}
    # NOTE(review): a missing "question" key reaches str(question) as "None" —
    # confirm callers always provide it.
    token_set |= {token.lower() for token in _extract_keywords(str(question), str(question), sub_questions=sub_questions, keywords=keywords)}
    token_set = _token_variants(token_set)
    return summary_lines, str(question), sub_questions, keywords, token_set
def _extract_metric_keys(lines: list[str]) -> list[str]:
keys: list[str] = []
for line in lines:
if ":" not in line:
continue
key = line.split(":", 1)[0].strip()
if not key or " " in key:
continue
if key not in keys:
keys.append(key)
return keys
def _token_variants(tokens: set[str]) -> set[str]:
    """Augment tokens with naive singular forms (ies→y, es→'', s→'')."""
    if not tokens:
        return set()
    variants = set(tokens)
    for token in tokens:
        if len(token) <= TOKEN_MIN_LEN:
            continue
        if token.endswith("ies"):
            variants.add(token[:-3] + "y")
        if token.endswith("es"):
            variants.add(token[:-2])
        if token.endswith("s"):
            variants.add(token[:-1])
    return variants
def _parse_key_list(raw: str, allowed: list[str], max_keys: int) -> list[str]:
    """Extract up to *max_keys* allowed key names from the model's JSON reply."""
    parsed = _parse_json_block(raw, fallback={})
    if isinstance(parsed, list):
        items = parsed
    else:
        items = parsed.get("keys") if isinstance(parsed, dict) else []
    if not isinstance(items, list):
        return []
    allowed_set = set(allowed)
    selected: list[str] = []
    for item in items:
        if not isinstance(item, str):
            continue
        if item in allowed_set and item not in selected:
            selected.append(item)
        if len(selected) >= max_keys:
            break
    return selected
def _chunk_ids_for_keys(chunks: list[dict[str, Any]], keys: list[str]) -> list[str]:
if not keys:
return []
ids: list[str] = []
key_set = {f"{key}:" for key in keys}
for chunk in chunks:
text = str(chunk.get("text") or "")
if not text:
continue
for line in text.splitlines():
for key in key_set:
if line.startswith(key):
cid = chunk.get("id")
if cid and cid not in ids:
ids.append(cid)
break
return ids
def _filter_metric_keys(keys: list[str], tokens: set[str]) -> list[str]:
    """Rank keys by how many question tokens appear among their word parts."""
    if not keys or not tokens:
        return []
    usable = {tok.lower() for tok in tokens if tok and len(tok) >= TOKEN_MIN_LEN}
    scored: list[tuple[int, str]] = []
    for key in keys:
        words = {part for part in re.split(r"[_\W]+", key.lower()) if part}
        if not words:
            continue
        overlap = len(words & usable)
        if overlap:
            scored.append((overlap, key))
    scored.sort(key=lambda entry: (-entry[0], entry[1]))
    return [key for _, key in scored]
def _metric_key_overlap(keys: list[str], tokens: set[str]) -> bool:
    """Return True when any key shares a word part with the question tokens."""
    if not keys or not tokens:
        return False
    usable = {tok.lower() for tok in tokens if tok and len(tok) >= TOKEN_MIN_LEN}
    return any(
        {part for part in re.split(r"[_\W]+", key.lower()) if part} & usable
        for key in keys
    )
def _lines_for_metric_keys(lines: list[str], keys: list[str], max_lines: int = 0) -> list[str]:
if not lines or not keys:
return []
prefixes = {f"{key}:" for key in keys}
selected: list[str] = []
for line in lines:
for prefix in prefixes:
if prefix in line:
selected.append(line)
break
if max_lines and len(selected) >= max_lines:
break
return selected
def _merge_metric_keys(current: list[str], candidates: list[str], max_keys: int) -> list[str]:
merged: list[str] = []
seen = set()
for key in current:
if key and key not in seen:
merged.append(key)
seen.add(key)
for key in candidates:
if key and key not in seen:
merged.append(key)
seen.add(key)
if len(merged) >= max_keys:
break
return merged[:max_keys]
def _merge_fact_lines(primary: list[str], fallback: list[str]) -> list[str]:
seen = set()
merged: list[str] = []
for line in primary + fallback:
if line in seen:
continue
seen.add(line)
merged.append(line)
return merged
def _expand_hottest_line(line: str) -> list[str]:
if not line:
return []
if not line.lower().startswith("hottest:"):
return []
expanded: list[str] = []
payload = line.split("hottest:", 1)[1]
for part in payload.split(";"):
part = part.strip()
if not part or "=" not in part:
continue
metric, rest = part.split("=", 1)
metric = metric.strip()
match = re.search(r"(?P<node>[^\s\[]+).*\((?P<value>[^)]+)\)", rest)
if not match:
continue
node = match.group("node").strip()
value = match.group("value").strip()
class_match = re.search(r"\[(?P<class>[^\]]+)\]", rest)
node_class = class_match.group("class").strip() if class_match else ""
if node_class:
expanded.append(f"hottest_{metric}_node: {node} [{node_class}] ({value})")
else:
expanded.append(f"hottest_{metric}_node: {node} ({value})")
return expanded
def _has_token(text: str, token: str) -> bool:
if not text or not token:
return False
if token == "io":
return "i/o" in text or re.search(r"\bio\b", text) is not None
return re.search(rf"\b{re.escape(token)}\b", text) is not None
def _hotspot_evidence(summary: dict[str, Any]) -> list[str]:
    """Render 'hottest' metric entries as evidence lines with class/namespace context.

    Produces lines like ``hotspot.cpu: node=titan-a class=rpi5 value=0.93
    namespaces_top=ns=3`` from the summary's ``hottest``, ``hardware_by_node``
    and ``node_pods_top`` sections.
    """
    hottest = summary.get("hottest") if isinstance(summary.get("hottest"), dict) else {}
    if not hottest:
        return []
    hardware_by_node = summary.get("hardware_by_node") if isinstance(summary.get("hardware_by_node"), dict) else {}
    node_pods_top = summary.get("node_pods_top") if isinstance(summary.get("node_pods_top"), list) else []
    ns_map = {}
    # Index top namespaces per node so hotspot lines can show what runs there.
    for item in node_pods_top:
        if not isinstance(item, dict):
            continue
        node = item.get("node")
        namespaces_top = item.get("namespaces_top") if isinstance(item.get("namespaces_top"), list) else []
        ns_map[node] = namespaces_top
    lines: list[str] = []
    for metric, info in hottest.items():
        if not isinstance(info, dict):
            continue
        node = info.get("node")
        value = info.get("value")
        if not node:
            continue
        node_class = hardware_by_node.get(node)
        ns_parts = []
        # Show at most three namespace=count pairs for the hot node.
        for entry in ns_map.get(node, [])[:3]:
            if isinstance(entry, (list, tuple)) and len(entry) >= NS_ENTRY_MIN_LEN:
                ns_parts.append(f"{entry[0]}={entry[1]}")
        ns_text = ", ".join(ns_parts)
        # Numbers get two decimals; anything else is stringified as-is.
        value_text = f"{value:.2f}" if isinstance(value, (int, float)) else str(value)
        line = f"hotspot.{metric}: node={node} class={node_class or 'unknown'} value={value_text}"
        if ns_text:
            line += f" namespaces_top={ns_text}"
        lines.append(line)
    return lines
# Export every single-underscore helper defined above; dunder names stay private.
__all__ = [name for name in globals() if name.startswith("_") and not name.startswith("__")]

View File

@ -0,0 +1,197 @@
from __future__ import annotations
import re
from collections.abc import Callable
from typing import Any
from atlasbot.llm import prompts
from atlasbot.llm.client import parse_json
from ._base import *
def _parse_json_block(text: str, *, fallback: dict[str, Any]) -> dict[str, Any]:
    """Find and parse the first JSON object embedded in *text*, else the whole text."""
    body = text.strip()
    embedded = re.search(r"\{.*\}", body, flags=re.S)
    if embedded is not None:
        return parse_json(embedded.group(0), fallback=fallback)
    return parse_json(body, fallback=fallback)
def _metric_key_tokens(summary_lines: list[str]) -> set[str]:
tokens: set[str] = set()
for line in summary_lines:
if not isinstance(line, str) or ":" not in line:
continue
key = line.split(":", 1)[0].strip().lower()
if not key:
continue
tokens.add(key)
for part in re.split(r"[_\s]+", key):
if part:
tokens.add(part)
return tokens
async def _select_best_candidate(call_llm: Callable[..., Any], question: str, candidates: list[str], plan: ModePlan, tag: str) -> int:
    """Return the index of the best candidate answer as judged by the LLM.

    Defaults to index 0 for a single candidate or any unusable model output.
    """
    if len(candidates) <= 1:
        return 0
    prompt = (
        prompts.CANDIDATE_SELECT_PROMPT
        + "\nQuestion: "
        + question
        + "\nCandidates:\n"
        + "\n".join([f"{idx+1}) {cand}" for idx, cand in enumerate(candidates)])
    )
    raw = await call_llm(prompts.CANDIDATE_SELECT_SYSTEM, prompt, model=plan.model, tag=tag)
    data = _parse_json_block(raw, fallback={})
    best = data.get("best") if isinstance(data, dict) else None
    # The model answers 1-based; convert and bounds-check before trusting it.
    if isinstance(best, int) and 1 <= best <= len(candidates):
        return best - 1
    return 0
def _dedupe_lines(lines: list[str], limit: int | None = None) -> list[str]:
seen: set[str] = set()
cleaned: list[str] = []
for line in lines:
value = (line or "").strip()
if not value or value in seen:
continue
if value.lower().startswith("lexicon_") or value.lower().startswith("units:"):
continue
cleaned.append(value)
seen.add(value)
if limit and len(cleaned) >= limit:
break
return cleaned
def _collect_fact_candidates(selected: list[dict[str, Any]], limit: int) -> list[str]:
    """Flatten the chosen chunks into unique, non-empty fact lines."""
    gathered: list[str] = []
    for chunk in selected:
        text = chunk.get("text") if isinstance(chunk, dict) else None
        if isinstance(text, str):
            gathered.extend(line for line in text.splitlines() if line.strip())
    return _dedupe_lines(gathered, limit=limit)
async def _select_best_list(call_llm: Callable[..., Any], question: str, candidates: list[list[str]], plan: ModePlan, tag: str) -> list[str]:
    """Pick the best candidate list via the LLM; merge everything when the pick is empty."""
    if not candidates:
        return []
    if len(candidates) == 1:
        return candidates[0]
    rendered = ["; ".join(entry) for entry in candidates]
    winner_idx = await _select_best_candidate(call_llm, question, rendered, plan, tag)
    winner = candidates[winner_idx] if 0 <= winner_idx < len(candidates) else candidates[0]
    if winner:
        return winner
    merged: list[str] = []
    for entry in candidates:
        for item in entry:
            if item not in merged:
                merged.append(item)
    return merged
async def _extract_fact_types(call_llm: Callable[..., Any], question: str, keywords: list[str], plan: ModePlan) -> list[str]:
    """Extract up to 10 fact types for *question*, retrying and voting on the result."""
    prompt = prompts.FACT_TYPES_PROMPT + "\nQuestion: " + question
    if keywords:
        prompt += "\nKeywords: " + ", ".join(keywords)
    proposals: list[list[str]] = []
    for _ in range(max(plan.metric_retries, 1)):
        raw = await call_llm(prompts.FACT_TYPES_SYSTEM, prompt, model=plan.fast_model, tag="fact_types")
        parsed = _parse_json_block(raw, fallback={})
        entries = parsed.get("fact_types") if isinstance(parsed, dict) else None
        if not isinstance(entries, list):
            continue
        lines = _dedupe_lines([str(entry) for entry in entries if isinstance(entry, (str, int, float))], limit=10)
        if lines:
            proposals.append(lines)
    best = await _select_best_list(call_llm, question, proposals, plan, "fact_types_select")
    return best[:10]
async def _derive_signals(call_llm: Callable[..., Any], question: str, fact_types: list[str], plan: ModePlan) -> list[str]:
    """Derive up to 12 retrieval signals from the fact types, retrying and voting."""
    if not fact_types:
        return []
    prompt = prompts.SIGNAL_PROMPT.format(question=question, fact_types="; ".join(fact_types))
    proposals: list[list[str]] = []
    for _ in range(max(plan.metric_retries, 1)):
        raw = await call_llm(prompts.SIGNAL_SYSTEM, prompt, model=plan.fast_model, tag="signals")
        parsed = _parse_json_block(raw, fallback={})
        entries = parsed.get("signals") if isinstance(parsed, dict) else None
        if not isinstance(entries, list):
            continue
        lines = _dedupe_lines([str(entry) for entry in entries if isinstance(entry, (str, int, float))], limit=12)
        if lines:
            proposals.append(lines)
    best = await _select_best_list(call_llm, question, proposals, plan, "signals_select")
    return best[:12]
async def _scan_chunk_for_signals(call_llm: Callable[..., Any], question: str, signals: list[str], chunk_lines: list[str], plan: ModePlan) -> list[str]:
    """Ask the LLM which chunk lines match the signals; keep at most 15 verbatim lines."""
    if not signals or not chunk_lines:
        return []
    prompt = prompts.CHUNK_SCAN_PROMPT.format(
        signals="; ".join(signals),
        lines="\n".join(chunk_lines),
    )
    proposals: list[list[str]] = []
    for _ in range(max(1, min(plan.metric_retries, 2))):
        raw = await call_llm(prompts.CHUNK_SCAN_SYSTEM, prompt, model=plan.fast_model, tag="chunk_scan")
        parsed = _parse_json_block(raw, fallback={})
        entries = parsed.get("lines") if isinstance(parsed, dict) else None
        if not isinstance(entries, list):
            continue
        # Only accept lines that exist verbatim in the chunk, so the LLM
        # cannot introduce fabricated facts.
        verbatim = _dedupe_lines([line for line in chunk_lines if line in entries], limit=15)
        if verbatim:
            proposals.append(verbatim)
    best = await _select_best_list(call_llm, question, proposals, plan, "chunk_scan_select")
    return best[:15]
async def _prune_metric_candidates(call_llm: Callable[..., Any], question: str, candidates: list[str], plan: ModePlan, attempts: int) -> list[str]:
    """Prune candidate metric lines down to at most 6 via repeated LLM votes."""
    if not candidates:
        return []
    prompt = prompts.FACT_PRUNE_PROMPT.format(question=question, candidates="\n".join(candidates), max_lines=6)
    proposals: list[list[str]] = []
    for _ in range(max(attempts, 1)):
        raw = await call_llm(prompts.FACT_PRUNE_SYSTEM, prompt, model=plan.fast_model, tag="fact_prune")
        parsed = _parse_json_block(raw, fallback={})
        entries = parsed.get("lines") if isinstance(parsed, dict) else None
        if not isinstance(entries, list):
            continue
        # Restrict the pick to lines present verbatim in the candidates.
        verbatim = _dedupe_lines([line for line in candidates if line in entries], limit=6)
        if verbatim:
            proposals.append(verbatim)
    best = await _select_best_list(call_llm, question, proposals, plan, "fact_prune_select")
    return best[:6]
async def _select_fact_lines(call_llm: Callable[..., Any], question: str, candidates: list[str], plan: ModePlan, max_lines: int) -> list[str]:
    """Select up to *max_lines* fact lines from candidates via repeated LLM votes."""
    if not candidates:
        return []
    prompt = prompts.FACT_PRUNE_PROMPT.format(question=question, candidates="\n".join(candidates), max_lines=max_lines)
    proposals: list[list[str]] = []
    for _ in range(max(plan.metric_retries, 1)):
        raw = await call_llm(prompts.FACT_PRUNE_SYSTEM, prompt, model=plan.fast_model, tag="fact_select")
        parsed = _parse_json_block(raw, fallback={})
        entries = parsed.get("lines") if isinstance(parsed, dict) else None
        if not isinstance(entries, list):
            continue
        # Restrict the pick to lines present verbatim in the candidates.
        verbatim = _dedupe_lines([line for line in candidates if line in entries], limit=max_lines)
        if verbatim:
            proposals.append(verbatim)
    best = await _select_best_list(call_llm, question, proposals, plan, "fact_select_best")
    return best[:max_lines]
# NOTE: __all__ deliberately exports the underscore-prefixed helpers; without
# it a star-import (presumably how the workflow package consumes this module)
# would skip every private name.
__all__ = [name for name in globals() if name.startswith("_") and not name.startswith("__")]

View File

@ -0,0 +1,404 @@
from __future__ import annotations
import re
from typing import Any
from atlasbot.engine.intent_router import IntentMatch
from atlasbot.snapshot.builder import summary_text
from ._base import *
def _join_context(parts: list[str]) -> str:
text = "\n".join([part for part in parts if part])
return text.strip()
def _format_metric_value(value: Any) -> str:
if isinstance(value, bool):
return str(value).lower()
if isinstance(value, int):
return str(value)
if isinstance(value, float):
return f"{value:.1f}".rstrip("0").rstrip(".")
return str(value)
def _format_history(history: list[dict[str, str]] | None) -> str:
if not history:
return ""
lines = ["Recent conversation (non-authoritative):"]
for entry in history[-4:]:
if not isinstance(entry, dict):
continue
question = entry.get("q")
answer = entry.get("a")
role = entry.get("role")
content = entry.get("content")
if question:
lines.append(f"Q: {question}")
if answer:
lines.append(f"A: {answer}")
if role and content:
prefix = "Q" if role == "user" else "A"
lines.append(f"{prefix}: {content}")
return "\n".join(lines)
def _summary_lines(snapshot: dict[str, Any] | None) -> list[str]:
    """Split the rendered snapshot summary into its non-blank lines."""
    rendered = summary_text(snapshot)
    if not rendered:
        return []
    return [row for row in rendered.splitlines() if row.strip()]
def _line_starting_with(lines: list[str], prefix: str) -> str | None:
if not lines:
return None
for line in lines:
if line.lower().startswith(prefix.lower()):
return line
return None
def _spine_lines(lines: list[str]) -> dict[str, str]:
    """Assemble the spine fact map by running every section extractor in order."""
    spine: dict[str, str] = {}
    extractors = (
        _spine_nodes,
        _spine_hardware,
        _spine_hottest,
        _spine_postgres,
        _spine_namespaces,
        _spine_pressure,
    )
    for extract in extractors:
        extract(lines, spine)
    return spine
def _spine_nodes(lines: list[str], spine: dict[str, str]) -> None:
    """Record node count/readiness lines, preferring the combined "nodes:" line."""
    combined = _line_starting_with(lines, "nodes:")
    if combined:
        spine["nodes_count"] = combined
        spine["nodes_ready"] = combined
        return
    total_line = _line_starting_with(lines, "nodes_total:")
    if total_line:
        spine["nodes_count"] = total_line
    ready_line = _line_starting_with(lines, "nodes_ready:")
    if ready_line:
        spine["nodes_ready"] = ready_line
def _spine_hardware(lines: list[str], spine: dict[str, str]) -> None:
    """Record the hardware grouping line, trying "hardware_nodes:" then "hardware:"."""
    found = _line_starting_with(lines, "hardware_nodes:") or _line_starting_with(lines, "hardware:")
    if found:
        spine["nodes_non_rpi"] = found
def _spine_hottest(lines: list[str], spine: dict[str, str]) -> None:
    """Map the single "hottest:" line onto every hottest_* spine key."""
    line = _line_starting_with(lines, "hottest:")
    if not line:
        return
    for slot in ("hottest_cpu", "hottest_ram", "hottest_net", "hottest_io", "hottest_disk"):
        spine[slot] = line
def _spine_postgres(lines: list[str], spine: dict[str, str]) -> None:
    """Record postgres connection totals and the per-database hottest line."""
    total = _line_starting_with(lines, "postgres_connections_total:")
    if total:
        spine["postgres_connections"] = total
    per_db = _line_starting_with(lines, "postgres:")
    if per_db:
        spine["postgres_hottest"] = per_db
def _spine_namespaces(lines: list[str], spine: dict[str, str]) -> None:
    """Record the "namespaces_top:" line as the most-pods namespace fact."""
    top_line = _line_starting_with(lines, "namespaces_top:")
    if top_line:
        spine["namespace_most_pods"] = top_line
def _spine_pressure(lines: list[str], spine: dict[str, str]) -> None:
    """Record the pressure summary, falling back to the node-load line."""
    for prefix in ("pressure_nodes:", "node_load_top:"):
        line = _line_starting_with(lines, prefix)
        if line:
            spine["pressure_summary"] = line
            return
def _parse_group_line(line: str) -> dict[str, list[str]]:
groups: dict[str, list[str]] = {}
if not line:
return groups
payload = line.split(":", 1)[1] if ":" in line else line
for part in payload.split(";"):
part = part.strip()
if not part or "=" not in part:
continue
key, value = part.split("=", 1)
value = value.strip()
nodes: list[str] = []
if "(" in value and ")" in value:
inner = value[value.find("(") + 1 : value.rfind(")")]
nodes = [item.strip() for item in inner.split(",") if item.strip()]
if not nodes:
cleaned = re.sub(r"^[0-9]+", "", value).strip()
nodes = [item.strip() for item in cleaned.split(",") if item.strip()]
groups[key.strip()] = nodes
return groups
def _parse_hottest(line: str, metric: str) -> str | None:
if not line:
return None
payload = line.split(":", 1)[1] if ":" in line else line
for part in payload.split(";"):
part = part.strip()
if part.startswith(f"{metric}="):
return part
return None
def _spine_answer(intent: IntentMatch, spine_line: str | None) -> str | None:
    """Turn a spine line into a direct answer for the routed intent kind.

    Returns None when there is no spine line, and echoes the raw line when
    the kind has no dedicated formatter.
    """
    if not spine_line:
        return None
    kind = intent.kind
    if kind.startswith("hottest_"):
        return _spine_hottest_answer(kind, spine_line)
    dispatch = {
        "nodes_count": _spine_nodes_answer,
        "nodes_ready": _spine_nodes_answer,
        "nodes_non_rpi": _spine_non_rpi_answer,
        "hardware_mix": _spine_hardware_answer,
        "postgres_connections": _spine_postgres_answer,
        "postgres_hottest": _spine_postgres_answer,
        "namespace_most_pods": _spine_namespace_answer,
        "pressure_summary": _spine_pressure_answer,
    }
    handler = dispatch.get(kind)
    return handler(spine_line) if handler else spine_line
def _spine_nodes_answer(line: str) -> str:
return line
def _spine_non_rpi_answer(line: str) -> str:
    """List nodes from every non-rpi hardware group, or echo the raw line."""
    grouped = _parse_group_line(line)
    others = [
        node
        for label, nodes in grouped.items()
        if not label.lower().startswith("rpi")
        for node in nodes
    ]
    if others:
        return "Non-Raspberry Pi nodes: " + ", ".join(others) + "."
    return line
def _spine_hardware_answer(line: str) -> str:
return line
def _spine_hottest_answer(kind: str, line: str) -> str:
    """Extract the segment for a hottest_<metric> intent; echo the line if absent."""
    metric = kind.split("_", 1)[1]
    segment = _parse_hottest(line, metric)
    return segment or line
def _spine_postgres_answer(line: str) -> str:
return line
def _spine_namespace_answer(line: str) -> str:
payload = line.split(":", 1)[1] if ":" in line else line
top = payload.split(";")[0].strip()
if top:
return f"Namespace with most pods: {top}."
return line
def _spine_pressure_answer(line: str) -> str:
return line
def _spine_from_summary(summary: dict[str, Any]) -> dict[str, str]:
    """Build the spine fact map straight from the structured summary dict."""
    if not isinstance(summary, dict) or not summary:
        return {}
    spine: dict[str, str] = {}
    builders = (
        _spine_from_counts,
        _spine_from_hardware,
        _spine_from_hottest,
        _spine_from_postgres,
        _spine_from_namespace_pods,
        _spine_from_pressure,
    )
    for build in builders:
        spine.update(build(summary))
    return spine
def _spine_from_counts(summary: dict[str, Any]) -> dict[str, str]:
counts = summary.get("counts") if isinstance(summary.get("counts"), dict) else {}
inventory = summary.get("inventory") if isinstance(summary.get("inventory"), dict) else {}
nodes = summary.get("nodes") if isinstance(summary.get("nodes"), dict) else {}
workers = inventory.get("workers") if isinstance(inventory.get("workers"), dict) else {}
total = nodes.get("total")
ready = nodes.get("ready")
not_ready = nodes.get("not_ready")
if total is None:
total = counts.get("nodes_total")
if ready is None:
ready = counts.get("nodes_ready")
if not_ready is None and isinstance(inventory.get("not_ready_names"), list):
not_ready = len(inventory.get("not_ready_names") or [])
workers_ready = workers.get("ready")
workers_total = workers.get("total")
if total is None and ready is None and not_ready is None:
return {}
parts = []
if total is not None:
parts.append(f"total={int(total)}")
if ready is not None:
parts.append(f"ready={int(ready)}")
if not_ready is not None:
parts.append(f"not_ready={int(not_ready)}")
if workers_total is not None and workers_ready is not None:
parts.append(f"workers_ready={int(workers_ready)}/{int(workers_total)}")
line = "nodes: " + ", ".join(parts)
return {"nodes_count": line, "nodes_ready": line}
def _spine_from_hardware(summary: dict[str, Any]) -> dict[str, str]:
hardware = summary.get("hardware") if isinstance(summary.get("hardware"), dict) else {}
if not hardware:
return {}
parts = []
for key, nodes in hardware.items():
if not isinstance(nodes, list):
continue
node_list = ", ".join(str(n) for n in nodes if n)
if node_list:
parts.append(f"{key}=({node_list})")
if not parts:
return {}
return {"nodes_non_rpi": "hardware: " + "; ".join(parts)}
def _spine_from_hottest(summary: dict[str, Any]) -> dict[str, str]:
    """Build hottest_<metric> spine entries, merging top.node_hottest as fallback."""
    hottest = summary.get("hottest") if isinstance(summary.get("hottest"), dict) else {}
    top = summary.get("top") if isinstance(summary.get("top"), dict) else {}
    top_hottest = top.get("node_hottest") if isinstance(top.get("node_hottest"), dict) else {}
    if not hottest and top_hottest:
        hottest = top_hottest
    elif top_hottest:
        # Fill only the metrics the primary section is missing.
        for metric, entry in top_hottest.items():
            if metric not in hottest and entry is not None:
                hottest[metric] = entry
    if not hottest:
        return {}
    facts: dict[str, str] = {}
    for metric in ("cpu", "ram", "net", "io", "disk"):
        entry = hottest.get(metric)
        if not isinstance(entry, dict):
            continue
        node = entry.get("node") or entry.get("label") or ""
        if node:
            facts[f"hottest_{metric}"] = f"{metric}={node} ({_format_metric_value(entry.get('value'))})"
    return facts
def _spine_from_postgres(summary: dict[str, Any]) -> dict[str, str]:
postgres = summary.get("postgres") if isinstance(summary.get("postgres"), dict) else {}
if not postgres:
top = summary.get("top") if isinstance(summary.get("top"), dict) else {}
postgres = top.get("postgres") if isinstance(top.get("postgres"), dict) else {}
if not postgres:
return {}
used = postgres.get("used")
max_conn = postgres.get("max")
hottest = postgres.get("hottest_db") if isinstance(postgres.get("hottest_db"), dict) else {}
hottest_label = hottest.get("label") or ""
facts: dict[str, str] = {}
if used is not None and max_conn is not None:
facts["postgres_connections"] = f"postgres_connections_total: used={int(used)}, max={int(max_conn)}"
if hottest_label:
facts["postgres_hottest"] = f"postgres_hottest_db: {hottest_label}"
return facts
def _spine_from_namespace_pods(summary: dict[str, Any]) -> dict[str, str]:
pods = summary.get("namespace_pods") if isinstance(summary.get("namespace_pods"), list) else []
if not pods:
top = summary.get("top") if isinstance(summary.get("top"), dict) else {}
pods = top.get("namespace_pods") if isinstance(top.get("namespace_pods"), list) else []
if not pods:
return {}
best_name = ""
best_value = None
for entry in pods:
if not isinstance(entry, dict):
continue
name = entry.get("namespace") or entry.get("name") or entry.get("label") or ""
value = entry.get("pods")
if value is None:
value = entry.get("pods_total")
if value is None:
value = entry.get("value")
try:
numeric = float(value)
except (TypeError, ValueError):
numeric = None
if name and numeric is not None and (best_value is None or numeric > best_value):
best_name = name
best_value = numeric
if best_name:
return {"namespace_most_pods": f"namespace_most_pods: {best_name} ({int(best_value or 0)} pods)"}
return {}
def _spine_from_pressure(summary: dict[str, Any]) -> dict[str, str]:
pressure = summary.get("pressure_summary") if isinstance(summary.get("pressure_summary"), dict) else {}
if not pressure:
pressure = summary.get("pressure_nodes") if isinstance(summary.get("pressure_nodes"), dict) else {}
if not pressure:
return {}
total = pressure.get("total")
unsched = pressure.get("unschedulable")
names = pressure.get("names") if isinstance(pressure.get("names"), list) else []
parts = []
if total is None and names:
total = len([name for name in names if name])
if total is not None:
parts.append(f"total={int(total)}")
if unsched is not None:
parts.append(f"unschedulable={int(unsched)}")
if parts:
return {"pressure_summary": "pressure_nodes: " + ", ".join(parts)}
return {}
def _spine_fallback(intent: IntentMatch, lines: list[str]) -> str | None:
if not lines:
return None
keywords = {
"nodes_count": ("nodes:", "nodes_total:"),
"nodes_ready": ("nodes:", "nodes_ready:"),
"postgres_hottest": ("postgres_hottest", "hottest_db", "postgres"),
"namespace_most_pods": ("namespace", "pods", "namespaces_top"),
"pressure_summary": ("pressure", "node_load_top"),
}
for token in keywords.get(intent.kind, ("",)):
if not token:
continue
for line in lines:
if token in line:
return line
return None
# NOTE: __all__ deliberately exports the underscore-prefixed helpers; without
# it a star-import (presumably how the workflow package consumes this module)
# would skip every private name.
__all__ = [name for name in globals() if name.startswith("_") and not name.startswith("__")]

View File

@ -0,0 +1,484 @@
from __future__ import annotations
import asyncio
import json
import math
import re
import time
from collections.abc import Callable
from typing import Any
from atlasbot.engine.intent_router import route_intent
from atlasbot.llm import prompts
from atlasbot.llm.client import build_messages
from atlasbot.snapshot.builder import build_summary
from ._base import *
from .common import *
from .factsheet import *
from .post import *
from .post_ext import *
from .retrieval import *
from .retrieval_ext import *
from .spine import *
from .workflow_post import finalize_answer
async def run_answer(engine: Any, question: str, *, mode: str, history: list[dict[str, str]] | None = None, observer: Callable[[str, str], None] | None = None, conversation_id: str | None = None, snapshot_pin: bool | None = None) -> AnswerResult: # noqa: C901
"""Answer a question using the staged reasoning pipeline."""
settings = engine._settings
question = (question or "").strip()
if not question:
return AnswerResult("I need a question to answer.", _default_scores(), {"mode": mode})
if mode == "stock":
return await engine._answer_stock(question)
limitless = "run limitless" in question.lower()
if limitless:
question = re.sub(r"(?i)run limitless", "", question).strip()
plan = _mode_plan(settings, mode)
call_limit = _llm_call_limit(settings, mode)
call_cap = math.ceil(call_limit * settings.llm_limit_multiplier)
call_count = 0
limit_hit = False
time_budget_hit = False
started = time.monotonic()
time_budget_sec = _mode_time_budget(settings, mode) if not limitless else 0.0
debug_tags = {
"route",
"decompose",
"chunk_score",
"chunk_select",
"fact_select",
"synth",
"subanswer",
"tool",
"followup",
"select_claims",
"evidence_fix",
}
async def call_llm(system: str, prompt: str, *, context: str | None = None, model: str | None = None, tag: str = "") -> str:
nonlocal call_count, limit_hit, time_budget_hit
if not limitless and call_count >= call_cap:
limit_hit = True
raise LLMLimitReached("llm_limit")
timeout_sec = None
if not limitless and time_budget_sec > 0:
time_left = time_budget_sec - (time.monotonic() - started)
if time_left <= 0:
time_budget_hit = True
raise LLMTimeBudgetExceeded("time_budget")
timeout_sec = min(settings.ollama_timeout_sec, time_left)
call_count += 1
messages = build_messages(system, prompt, context=context)
try:
llm_call = engine._llm.chat(messages, model=model or plan.model, timeout_sec=timeout_sec)
if timeout_sec is not None:
response = await asyncio.wait_for(llm_call, timeout=max(0.001, timeout_sec))
else:
response = await llm_call
except TimeoutError as exc:
time_budget_hit = True
raise LLMTimeBudgetExceeded("time_budget") from exc
log.info(
"atlasbot_llm_call",
extra={"extra": {"mode": mode, "tag": tag, "call": call_count, "limit": call_cap}},
)
if settings.debug_pipeline and tag in debug_tags:
_debug_pipeline_log(settings, f"llm_raw_{tag}", str(response)[:1200])
return response
state = engine._get_state(conversation_id)
pin_snapshot = bool(snapshot_pin) or settings.snapshot_pin_enabled
snapshot = engine._snapshot.get()
snapshot_used = state.snapshot if pin_snapshot and state and state.snapshot else snapshot
summary = build_summary(snapshot_used)
summary_lines = _summary_lines(snapshot_used)
allowed_nodes = _allowed_nodes(summary)
allowed_namespaces = _allowed_namespaces(summary)
spine = _spine_from_summary(summary) or _spine_lines(summary_lines)
metric_tokens = _metric_key_tokens(summary_lines)
global_facts = _global_facts(summary_lines)
kb_summary = engine._kb.summary()
runbooks = engine._kb.runbook_titles(limit=6)
runbook_paths = engine._kb.runbook_paths(limit=10)
history_ctx = _format_history(history)
lexicon_ctx = _lexicon_context(summary)
key_facts: list[str] = []
metric_facts: list[str] = []
facts_used: list[str] = []
reply = ""
scores = _default_scores()
claims: list[ClaimItem] = []
classify: dict[str, Any] = {}
tool_hint: dict[str, Any] | None = None
try:
if mode in {"quick", "fast", "smart", "genius"} and not limitless:
if observer:
observer("factsheet", "building fact sheet")
if _is_plain_math_question(question):
reply = (
"I focus on Titan cluster operations. Ask me about cluster health, nodes, workloads, "
"namespaces, storage, or alerts."
)
return AnswerResult(reply, _default_scores(), _build_meta(mode, call_count, call_cap, limit_hit, time_budget_hit, time_budget_sec, classify, tool_hint, started))
kb_lines = (
engine._kb.chunk_lines(max_files=plan.kb_max_files, max_chars=_factsheet_kb_chars(mode, plan.kb_max_chars))
if engine._kb
else []
)
fact_lines = _quick_fact_sheet_lines(question, summary_lines, kb_lines, limit=_factsheet_line_limit(mode))
classify = {
"needs_snapshot": True,
"needs_kb": bool(kb_lines),
"question_type": f"{mode}_factsheet",
"answer_style": "direct" if mode in {"quick", "fast"} else "concise",
"follow_up": False,
}
heuristic_reply = _quick_fact_sheet_heuristic_answer(question, fact_lines)
if heuristic_reply:
return AnswerResult(heuristic_reply, _default_scores(), _build_meta(mode, call_count, call_cap, limit_hit, time_budget_hit, time_budget_sec, classify, tool_hint, started))
if observer:
observer("quick", "answering from fact sheet")
quick_context = _quick_fact_sheet_text(fact_lines)
quick_prompt = "Question: " + question + "\nAnswer using only the Fact Sheet. " + _factsheet_instruction(mode)
reply = await call_llm(prompts.ANSWER_SYSTEM, quick_prompt, context=quick_context, model=_factsheet_model(mode, plan), tag=f"{mode}_factsheet")
reply = _strip_followup_meta(reply)
return AnswerResult(reply, _default_scores(), _build_meta(mode, call_count, call_cap, limit_hit, time_budget_hit, time_budget_sec, classify, tool_hint, started))
if observer:
observer("normalize", "normalizing")
normalize_prompt = prompts.NORMALIZE_PROMPT + "\nQuestion: " + question
normalize_raw = await call_llm(prompts.NORMALIZE_SYSTEM, normalize_prompt, context=lexicon_ctx, model=plan.fast_model, tag="normalize")
normalize = _parse_json_block(normalize_raw, fallback={"normalized": question, "keywords": []})
normalized = str(normalize.get("normalized") or question).strip() or question
keywords = normalize.get("keywords") or []
_debug_pipeline_log(settings, "normalize_parsed", {"normalized": normalized, "keywords": keywords})
keyword_tokens = _extract_keywords(question, normalized, sub_questions=[], keywords=keywords)
question_tokens = _extract_question_tokens(normalized)
if observer:
observer("route", "routing")
route_prompt = prompts.ROUTE_PROMPT + "\nQuestion: " + normalized + "\nKeywords: " + json.dumps(keywords)
route_raw = await call_llm(prompts.ROUTE_SYSTEM, route_prompt, context=_join_context([kb_summary, lexicon_ctx]), model=plan.fast_model, tag="route")
classify = _parse_json_block(route_raw, fallback={})
classify.setdefault("needs_snapshot", True)
classify.setdefault("answer_style", "direct")
classify.setdefault("follow_up", False)
classify.setdefault("focus_entity", "unknown")
classify.setdefault("focus_metric", "unknown")
if metric_tokens and keyword_tokens and any(token in metric_tokens for token in keyword_tokens):
classify["needs_snapshot"] = True
intent = route_intent(normalized)
if intent:
classify["needs_snapshot"] = True
classify["question_type"] = "metric"
_debug_pipeline_log(settings, "route_parsed", {"classify": classify, "normalized": normalized})
lowered_question = f"{question} {normalized}".lower()
force_metric = bool(re.search(r"\bhow many\b|\bcount\b|\btotal\b", lowered_question))
if any(term in lowered_question for term in ("postgres", "connections", "pvc", "ready")):
force_metric = True
if intent:
spine_line = spine.get(intent.kind) if isinstance(spine, dict) else None
if not spine_line:
spine_line = _spine_fallback(intent, summary_lines)
spine_answer = _spine_answer(intent, spine_line)
if spine_line:
key_facts = _merge_fact_lines([spine_line], key_facts)
metric_facts = _merge_fact_lines([spine_line], metric_facts)
if spine_answer and mode in {"fast", "quick"}:
return AnswerResult(spine_answer, _default_scores(), _build_meta(mode, call_count, call_cap, limit_hit, time_budget_hit, time_budget_sec, classify, tool_hint, started))
cluster_terms = (
"atlas",
"cluster",
"node",
"nodes",
"namespace",
"pod",
"workload",
"k8s",
"kubernetes",
"postgres",
"database",
"db",
"connections",
"cpu",
"ram",
"memory",
"network",
"io",
"disk",
"pvc",
"storage",
)
has_cluster_terms = any(term in lowered_question for term in cluster_terms)
if has_cluster_terms:
classify["needs_snapshot"] = True
lowered_norm = normalized.lower()
if ("namespace" in lowered_norm and ("pod" in lowered_norm or "pods" in lowered_norm)) or re.search(r"\bmost\s+pods\b", lowered_norm) or re.search(r"\bpods\s+running\b", lowered_norm):
classify["question_type"] = "metric"
classify["needs_snapshot"] = True
if re.search(r"\b(how many|count|number of|list)\b", lowered_question):
classify["question_type"] = "metric"
if any(term in lowered_question for term in ("postgres", "connections", "db")):
classify["question_type"] = "metric"
classify["needs_snapshot"] = True
if any(term in lowered_question for term in ("pvc", "persistentvolume", "persistent volume", "storage")):
if classify.get("question_type") not in {"metric", "diagnostic"}:
classify["question_type"] = "metric"
classify["needs_snapshot"] = True
if "ready" in lowered_question and classify.get("question_type") not in {"metric", "diagnostic"}:
classify["question_type"] = "diagnostic"
hottest_terms = ("hottest", "highest", "lowest", "most")
metric_terms = ("cpu", "ram", "memory", "net", "network", "io", "disk", "load", "usage", "pod", "pods", "namespace")
if any(term in lowered_question for term in hottest_terms) and any(term in lowered_question for term in metric_terms):
classify["question_type"] = "metric"
baseline_terms = ("baseline", "delta", "trend", "increase", "decrease", "drop", "spike", "regression", "change")
if any(term in lowered_question for term in baseline_terms) and any(term in lowered_question for term in metric_terms):
classify["question_type"] = "metric"
classify["needs_snapshot"] = True
if not classify.get("follow_up") and state and state.claims:
follow_terms = ("there", "that", "those", "these", "it", "them", "that one", "this", "former", "latter")
is_metric_query = force_metric or classify.get("question_type") in {"metric", "diagnostic"}
if not is_metric_query and (
any(term in lowered_question for term in follow_terms)
or (len(normalized.split()) <= FOLLOWUP_SHORT_WORDS and not has_cluster_terms)
):
classify["follow_up"] = True
if classify.get("follow_up") and state and state.claims:
if observer:
observer("followup", "answering follow-up")
reply = await engine._answer_followup(question, state, summary, classify, plan, call_llm)
scores = await engine._score_answer(question, reply, plan, call_llm)
return AnswerResult(reply, scores, _build_meta(mode, call_count, call_cap, limit_hit, time_budget_hit, time_budget_sec, classify, tool_hint, started))
if observer:
observer("decompose", "decomposing")
decompose_prompt = prompts.DECOMPOSE_PROMPT.format(max_parts=plan.max_subquestions * 2)
decompose_raw = await call_llm(prompts.DECOMPOSE_SYSTEM, decompose_prompt + "\nQuestion: " + normalized, context=lexicon_ctx, model=plan.fast_model if mode == "quick" else plan.model, tag="decompose")
parts = _parse_json_list(decompose_raw)
sub_questions = _select_subquestions(parts, normalized, plan.max_subquestions)
_debug_pipeline_log(settings, "decompose_parsed", {"sub_questions": sub_questions})
keyword_tokens = _extract_keywords(question, normalized, sub_questions=sub_questions, keywords=keywords)
snapshot_context = ""
signal_tokens: list[str] = []
if classify.get("needs_snapshot"):
if observer:
observer("retrieve", "scoring chunks")
chunks = _chunk_lines(summary_lines, plan.chunk_lines)
if plan.use_raw_snapshot:
raw_chunks = _raw_snapshot_chunks(snapshot_used)
if raw_chunks:
chunks.extend(raw_chunks)
kb_lines = engine._kb.chunk_lines(max_files=plan.kb_max_files, max_chars=plan.kb_max_chars) if engine._kb else []
if kb_lines:
kb_chunks = _chunk_lines(kb_lines, plan.chunk_lines)
for idx, chunk in enumerate(kb_chunks):
chunk["id"] = f"k{idx}"
chunks.extend(kb_chunks)
metric_keys: list[str] = []
must_chunk_ids: list[str] = []
metric_task = None
if (classify.get("question_type") in {"metric", "diagnostic"} or force_metric) and summary_lines:
metric_ctx = {"question": normalized, "sub_questions": sub_questions, "keywords": keywords, "keyword_tokens": keyword_tokens, "summary_lines": summary_lines}
metric_task = asyncio.create_task(_select_metric_chunks(call_llm, metric_ctx, chunks, plan))
scored_task = asyncio.create_task(_score_chunks(call_llm, chunks, normalized, sub_questions, plan))
if metric_task:
metric_keys, must_chunk_ids = await metric_task
scored = await scored_task
selected = _select_chunks(chunks, scored, plan, keyword_tokens, must_chunk_ids)
fact_candidates = _collect_fact_candidates(selected, limit=plan.max_subquestions * 12)
key_facts = await _select_fact_lines(call_llm, normalized, fact_candidates, plan, max_lines=max(4, plan.max_subquestions * 2))
metric_facts = []
if classify.get("question_type") in {"metric", "diagnostic"} or force_metric:
global_metric_facts: list[str] = []
if global_facts:
global_metric_facts = await _select_fact_lines(call_llm, normalized, global_facts, plan, max_lines=min(2, max(1, plan.max_subquestions)))
if not global_metric_facts and (keyword_tokens or question_tokens):
tokens = {tok for tok in (keyword_tokens or question_tokens) if tok and tok not in GENERIC_METRIC_TOKENS}
global_metric_facts = _rank_metric_lines(global_facts, tokens, max_lines=2)
if global_metric_facts:
key_facts = _merge_fact_lines(global_metric_facts, key_facts)
all_tokens = _merge_tokens(signal_tokens, keyword_tokens, question_tokens)
if plan.use_deep_retrieval:
if observer:
observer("retrieve", "extracting fact types")
fact_types = await _extract_fact_types(call_llm, normalized, keyword_tokens, plan)
if observer:
observer("retrieve", "deriving signals")
signals = await _derive_signals(call_llm, normalized, fact_types, plan)
if isinstance(signals, list):
signal_tokens = [str(item) for item in signals if item]
all_tokens = _merge_tokens(signal_tokens, keyword_tokens, question_tokens)
if observer:
observer("retrieve", "scanning chunks")
candidate_lines: list[str] = []
if signals:
for chunk in selected:
chunk_lines = chunk["text"].splitlines()
if not chunk_lines:
continue
hits = await _scan_chunk_for_signals(call_llm, normalized, signals, chunk_lines, plan)
if hits:
candidate_lines.extend(hits)
candidate_lines = list(dict.fromkeys(candidate_lines))
if candidate_lines:
if observer:
observer("retrieve", "pruning candidates")
metric_facts = await _prune_metric_candidates(call_llm, normalized, candidate_lines, plan, plan.metric_retries)
if metric_facts:
key_facts = _merge_fact_lines(metric_facts, key_facts)
if settings.debug_pipeline:
_debug_pipeline_log(settings, "metric_facts_selected", {"facts": metric_facts})
if not metric_facts:
if observer:
observer("retrieve", "fallback metric selection")
token_set = {tok for tok in all_tokens if tok and tok not in GENERIC_METRIC_TOKENS}
fallback_candidates = _rank_metric_lines(summary_lines, token_set, max_lines=200)
if fallback_candidates:
metric_facts = await _select_fact_lines(call_llm, normalized, fallback_candidates, plan, max_lines=max(2, plan.max_subquestions))
if not metric_facts and fallback_candidates:
metric_facts = fallback_candidates[: max(2, plan.max_subquestions)]
if metric_keys:
key_lines = _lines_for_metric_keys(summary_lines, metric_keys, max_lines=plan.max_subquestions * 3)
if key_lines:
metric_facts = _merge_fact_lines(key_lines, metric_facts)
if metric_facts:
metric_cover_tokens = [tok for tok in keyword_tokens if tok and tok not in GENERIC_METRIC_TOKENS]
if not metric_cover_tokens:
metric_cover_tokens = [tok for tok in question_tokens if tok and tok not in GENERIC_METRIC_TOKENS]
metric_facts = _ensure_token_coverage(metric_facts, metric_cover_tokens or all_tokens, summary_lines, max_add=plan.max_subquestions)
if metric_cover_tokens:
ranked_metric_lines = _rank_metric_lines(summary_lines, set(metric_cover_tokens), max_lines=max(1, plan.max_subquestions))
if ranked_metric_lines:
metric_facts = _merge_fact_lines(ranked_metric_lines, metric_facts)
if metric_facts and not _has_keyword_overlap(metric_facts, keyword_tokens):
best_line = _best_keyword_line(summary_lines, keyword_tokens)
if best_line:
metric_facts = _merge_fact_lines([best_line], metric_facts)
if metric_facts:
key_facts = _merge_fact_lines(metric_facts, key_facts)
if global_metric_facts:
metric_facts = _merge_fact_lines(global_metric_facts, metric_facts)
if (classify.get("question_type") in {"metric", "diagnostic"} or force_metric) and not metric_facts and key_facts:
metric_facts = key_facts
if key_facts:
key_facts = _ensure_token_coverage(key_facts, _merge_tokens(keyword_tokens, question_tokens), summary_lines, max_add=plan.max_subquestions)
facts_used = list(dict.fromkeys(key_facts)) if key_facts else list(dict.fromkeys(metric_facts))
snapshot_context = "ClusterSnapshot:\n" + "\n".join([chunk["text"] for chunk in selected])
combined_facts = _merge_fact_lines(global_facts, key_facts) if global_facts else key_facts
if combined_facts:
snapshot_context = "KeyFacts:\n" + "\n".join(combined_facts) + "\n\n" + snapshot_context
context = _join_context([kb_summary, _format_runbooks(runbooks), snapshot_context, history_ctx if classify.get("follow_up") else ""])
if plan.use_tool and classify.get("needs_tool"):
if observer:
observer("tool", "suggesting tools")
tool_prompt = prompts.TOOL_PROMPT + "\nQuestion: " + normalized
tool_raw = await call_llm(prompts.TOOL_SYSTEM, tool_prompt, context=context, model=plan.fast_model, tag="tool")
tool_hint = _parse_json_block(tool_raw, fallback={})
if observer:
observer("subanswers", "drafting subanswers")
async def _subanswer_for(subq: str) -> str:
sub_prompt = prompts.SUBANSWER_PROMPT + "\nQuestion: " + subq
if plan.subanswer_retries > 1:
candidates = await _gather_limited(
[call_llm(prompts.ANSWER_SYSTEM, sub_prompt, context=context, model=plan.model, tag="subanswer") for _ in range(plan.subanswer_retries)],
plan.parallelism,
)
best_idx = await _select_best_candidate(call_llm, subq, candidates, plan, "subanswer_select")
return candidates[best_idx]
return await call_llm(prompts.ANSWER_SYSTEM, sub_prompt, context=context, model=plan.model, tag="subanswer")
subanswers: list[str] = []
if plan.parallelism > 1 and len(sub_questions) > 1:
subanswers = await _gather_limited([_subanswer_for(subq) for subq in sub_questions], plan.parallelism)
else:
for subq in sub_questions:
subanswers.append(await _subanswer_for(subq))
if observer:
observer("synthesize", "synthesizing")
reply, scores, claims = await finalize_answer(
engine=engine,
call_llm=call_llm,
normalized=normalized,
subanswers=subanswers,
context=context,
classify=classify,
plan=plan,
summary=summary,
summary_lines=summary_lines,
metric_facts=metric_facts,
key_facts=key_facts,
facts_used=facts_used,
allowed_nodes=allowed_nodes,
allowed_namespaces=allowed_namespaces,
runbook_paths=runbook_paths,
lowered_question=lowered_question,
force_metric=force_metric,
keyword_tokens=keyword_tokens,
question_tokens=question_tokens,
snapshot_context=snapshot_context,
observer=observer,
mode=mode,
metric_keys=metric_keys if 'metric_keys' in locals() else None,
)
except LLMTimeBudgetExceeded:
time_budget_hit = True
if not reply:
budget = max(1, round(time_budget_sec)) if time_budget_sec > 0 else 0
budget_text = f"{budget}s" if budget else "its configured"
if mode in {"quick", "fast"}:
reply = f"Quick mode hit {budget_text} time budget before finishing. Try atlas-smart for a deeper answer."
elif mode == "smart":
reply = f"Smart mode hit {budget_text} time budget before finishing. Try atlas-genius or ask a narrower follow-up."
else:
reply = "I ran out of time before I could finish this answer."
scores = _default_scores()
except LLMLimitReached:
if not reply:
reply = "I started working on this but hit my reasoning limit. Ask again with 'Run limitless' for a deeper pass."
scores = _default_scores()
finally:
elapsed = round(time.monotonic() - started, 2)
log.info(
"atlasbot_answer",
extra={
"extra": {
"mode": mode,
"seconds": elapsed,
"llm_calls": call_count,
"limit": call_cap,
"limit_hit": limit_hit,
"time_budget_sec": time_budget_sec,
"time_budget_hit": time_budget_hit,
}
},
)
if limit_hit and "run limitless" not in reply.lower():
reply = reply.rstrip() + "\n\nNote: I hit my reasoning limit. Ask again with 'Run limitless' for a deeper pass."
if conversation_id and claims:
engine._store_state(conversation_id, claims, summary, snapshot_used, pin_snapshot)
return AnswerResult(
reply,
scores,
_build_meta(mode, call_count, call_cap, limit_hit, time_budget_hit, time_budget_sec, classify, tool_hint, started),
)

View File

@ -0,0 +1,170 @@
from __future__ import annotations
import json
import re
from collections.abc import Callable
from typing import Any
from atlasbot.llm import prompts
from ._base import *
from .common import *
from .post import *
from .post_ext import *
from .retrieval import *
from .spine import *
async def finalize_answer(  # noqa: C901
    *,
    engine: Any,
    call_llm: Callable[..., Any],
    normalized: str,
    subanswers: list[str],
    context: str,
    classify: dict[str, Any],
    plan: ModePlan,
    summary: dict[str, Any],
    summary_lines: list[str],
    metric_facts: list[str],
    key_facts: list[str],
    facts_used: list[str],
    allowed_nodes: list[str],
    allowed_namespaces: list[str],
    runbook_paths: list[str],
    lowered_question: str,
    force_metric: bool,
    keyword_tokens: list[str],
    question_tokens: list[str],
    snapshot_context: str,
    observer: Callable[[str, str], None] | None,
    mode: str,
    metric_keys: list[str] | None = None,
) -> tuple[str, AnswerScores, list[ClaimItem]]:
    """Synthesize and post-process the final answer.

    Runs the post-synthesis repair pipeline in order: evidence fix,
    metric-fact enforcement, hardware/raspberry overrides, unknown
    node/namespace stripping, runbook-path enforcement, evidence guard,
    focus fix, insight guard, critic revision, gap note, dedup, and
    finally scoring plus claim extraction.

    Input:
    - `engine`: answer engine providing synthesis/dedup/score/claims helpers;
    - `call_llm`: awaitable LLM call wrapper shared across passes;
    - remaining arguments carry the question, retrieval context, grounded
      facts, and allow-lists computed by the caller.

    Output:
    - `(reply, scores, claims)` tuple for the finished answer.
    """
    reply = await engine._synthesize_answer(normalized, subanswers, context, classify, plan, call_llm)
    # Hoisted: the merged token set is reused by every metric-fact check below.
    merged_tokens = _merge_tokens(keyword_tokens, question_tokens)
    unknown_nodes = _find_unknown_nodes(reply, allowed_nodes)
    unknown_namespaces = _find_unknown_namespaces(reply, allowed_namespaces)
    runbook_fix = _needs_runbook_fix(reply, runbook_paths)
    runbook_needed = _needs_runbook_reference(normalized, runbook_paths, reply)
    needs_evidence = _needs_evidence_fix(reply, classify)
    hardware_terms = ("rpi", "raspberry", "jetson", "amd64", "arm64", "hardware")
    hardware_line = _line_starting_with(summary_lines, "hardware_nodes:")
    if any(term in lowered_question for term in hardware_terms) and hardware_line:
        needs_evidence = True
    if metric_facts and (classify.get("question_type") in {"metric", "diagnostic"} or force_metric) and not _reply_matches_metric_facts(reply, metric_facts, merged_tokens):
        needs_evidence = True
    if classify.get("question_type") in {"open_ended", "planning"} and metric_facts:
        needs_evidence = True
    resolved_runbook = None
    if runbook_paths and (runbook_fix or runbook_needed):
        resolver_prompt = prompts.RUNBOOK_SELECT_PROMPT + "\nQuestion: " + normalized
        resolver_raw = await call_llm(prompts.RUNBOOK_SELECT_SYSTEM, resolver_prompt, context="AllowedRunbooks:\n" + "\n".join(runbook_paths), model=plan.fast_model, tag="runbook_select")
        resolver = _parse_json_block(resolver_raw, fallback={})
        candidate = resolver.get("path") if isinstance(resolver.get("path"), str) else None
        if candidate and candidate in runbook_paths:
            resolved_runbook = candidate
    if (snapshot_context and needs_evidence) or unknown_nodes or unknown_namespaces or runbook_fix or runbook_needed:
        if observer:
            observer("evidence_fix", "repairing missing evidence")
        # Feed the fixer every allow-list and must-use fact as extra prompt lines.
        extra_bits = []
        if unknown_nodes:
            extra_bits.append("UnknownNodes: " + ", ".join(sorted(unknown_nodes)))
        if unknown_namespaces:
            extra_bits.append("UnknownNamespaces: " + ", ".join(sorted(unknown_namespaces)))
        if runbook_paths:
            extra_bits.append("AllowedRunbooks: " + ", ".join(runbook_paths))
        if resolved_runbook:
            extra_bits.append("ResolvedRunbook: " + resolved_runbook)
        if metric_facts:
            extra_bits.append("MustUseFacts: " + "; ".join(metric_facts[:4]))
        if hardware_line:
            extra_bits.append("HardwareNodes: " + hardware_line)
        if allowed_nodes:
            extra_bits.append("AllowedNodes: " + ", ".join(allowed_nodes))
        if allowed_namespaces:
            extra_bits.append("AllowedNamespaces: " + ", ".join(allowed_namespaces))
        fix_prompt = prompts.EVIDENCE_FIX_PROMPT + "\nQuestion: " + normalized + "\nDraft: " + reply + ("\n" + "\n".join(extra_bits) if extra_bits else "")
        reply = await call_llm(prompts.EVIDENCE_FIX_SYSTEM, fix_prompt, context=context, model=plan.model, tag="evidence_fix")
    if metric_facts and not _reply_matches_metric_facts(reply, metric_facts, merged_tokens):
        # Second pass: explicitly instruct the model to include every fact.
        enforce_prompt = prompts.EVIDENCE_FIX_PROMPT + "\nQuestion: " + normalized + "\nDraft: " + reply + "\nMustIncludeFacts: " + "; ".join(metric_facts[:6]) + "\nInstruction: The answer must include all MustIncludeFacts items."
        reply = await call_llm(prompts.EVIDENCE_FIX_SYSTEM, enforce_prompt, context=context, model=plan.model, tag="evidence_fix_enforce")
    if metric_facts and not _reply_matches_metric_facts(reply, metric_facts, merged_tokens):
        # Last resort: answer directly from a single metric line.
        # Fix: `metric_keys` is a parameter, so the previous
        # `'metric_keys' in locals()` guard was always true — dropped.
        direct_candidates = _lines_for_metric_keys(summary_lines, metric_keys, max_lines=plan.max_subquestions * 3) if metric_keys else summary_lines
        direct_line = _select_metric_line(direct_candidates, normalized, merged_tokens)
        if direct_line:
            direct_prompt = f"Question: {normalized}\nFact: {direct_line}\nAnswer using the fact."
            reply = await call_llm(prompts.ANSWER_SYSTEM, direct_prompt, context="", model=plan.fast_model, tag="metric_direct")
            if (mode == "quick" and any(term in normalized.lower() for term in ("how many", "count", "total"))) or not _reply_matches_metric_facts(reply, [direct_line], merged_tokens):
                reply = _format_direct_metric_line(direct_line)
    if "raspberry" in lowered_question and "not" in lowered_question:
        # Deterministic override for "non-Raspberry-Pi nodes" questions.
        non_rpi = _non_rpi_nodes(summary)
        if non_rpi:
            reply = _format_hardware_groups(non_rpi, "Non-Raspberry Pi nodes")
    if unknown_nodes or unknown_namespaces:
        # Re-check after repairs: the fixer may have removed (or reintroduced) them.
        refreshed_nodes = _find_unknown_nodes(reply, allowed_nodes)
        refreshed_namespaces = _find_unknown_namespaces(reply, allowed_namespaces)
        if refreshed_nodes or refreshed_namespaces:
            reply = _strip_unknown_entities(reply, refreshed_nodes, refreshed_namespaces)
    if runbook_paths and resolved_runbook and _needs_runbook_reference(normalized, runbook_paths, reply):
        if observer:
            observer("runbook_enforce", "enforcing runbook path")
        enforce_prompt = prompts.RUNBOOK_ENFORCE_PROMPT.format(path=resolved_runbook)
        reply = await call_llm(prompts.RUNBOOK_ENFORCE_SYSTEM, enforce_prompt + "\nAnswer: " + reply, context=context, model=plan.model, tag="runbook_enforce")
    if runbook_paths:
        # Replace any hallucinated runbook path with a resolved/best-match one.
        invalid = [token for token in re.findall(r"runbooks/[A-Za-z0-9._-]+", reply) if token.lower() not in {p.lower() for p in runbook_paths}]
        if invalid:
            if observer:
                observer("runbook_enforce", "replacing invalid runbook path")
            resolver_prompt = prompts.RUNBOOK_SELECT_PROMPT + "\nQuestion: " + normalized
            resolver_raw = await call_llm(prompts.RUNBOOK_SELECT_SYSTEM, resolver_prompt, context="AllowedRunbooks:\n" + "\n".join(runbook_paths), model=plan.fast_model, tag="runbook_select")
            resolver = _parse_json_block(resolver_raw, fallback={})
            candidate = resolver.get("path") if isinstance(resolver.get("path"), str) else None
            if not (candidate and candidate in runbook_paths):
                candidate = _best_runbook_match(invalid[0], runbook_paths)
            if candidate and candidate in runbook_paths:
                enforce_prompt = prompts.RUNBOOK_ENFORCE_PROMPT.format(path=candidate)
                reply = await call_llm(prompts.RUNBOOK_ENFORCE_SYSTEM, enforce_prompt + "\nAnswer: " + reply, context=context, model=plan.model, tag="runbook_enforce")
    reply = _strip_unknown_entities(reply, unknown_nodes, unknown_namespaces)
    if facts_used and _needs_evidence_guard(reply, facts_used):
        if observer:
            observer("evidence_guard", "tightening unsupported claims")
        use_guard = True
        if mode in {"smart", "genius"}:
            # Smart/genius modes may skip the guard when the draft is judged consistent.
            decision = await _contradiction_decision(ContradictionContext(call_llm, normalized, reply, facts_used, plan), attempts=3 if mode == "genius" else 1)
            use_guard = decision.get("use_facts", True)
        if use_guard:
            guard_prompt = prompts.EVIDENCE_GUARD_PROMPT + "\nQuestion: " + normalized + "\nDraft: " + reply + "\nFactsUsed:\n" + "\n".join(facts_used)
            reply = await call_llm(prompts.EVIDENCE_GUARD_SYSTEM, guard_prompt, context=context, model=plan.model, tag="evidence_guard")
    if _needs_focus_fix(normalized, reply, classify):
        if observer:
            observer("focus_fix", "tightening answer")
        reply = await call_llm(prompts.EVIDENCE_FIX_SYSTEM, prompts.FOCUS_FIX_PROMPT + "\nQuestion: " + normalized + "\nDraft: " + reply, context=context, model=plan.model, tag="focus_fix")
        # NOTE(review): fallback assumed to apply only after a focus fix — confirm nesting.
        if not metric_facts or not _has_keyword_overlap(metric_facts, keyword_tokens):
            best_line = _best_keyword_line(summary_lines, keyword_tokens)
            if best_line:
                reply = f"Latest metrics: {best_line}."
    if (classify.get("question_type") in {"metric", "diagnostic"} or force_metric) and metric_facts:
        # If the reply carries no number from the facts, fall back to the best fact line.
        best_line = None
        lowered_keywords = [kw.lower() for kw in keyword_tokens if kw]
        for line in metric_facts:
            if any(kw in line.lower() for kw in lowered_keywords):
                best_line = line
                break
        best_line = best_line or metric_facts[0]
        reply_numbers = set(re.findall(r"\d+(?:\.\d+)?", reply))
        fact_numbers = set(re.findall(r"\d+(?:\.\d+)?", " ".join(metric_facts)))
        if not reply_numbers or (fact_numbers and not (reply_numbers & fact_numbers)):
            reply = f"Latest metrics: {best_line}."
    if _should_use_insight_guard(classify):
        if observer:
            observer("insight_guard", "checking for concrete signals")
        reply = await _apply_insight_guard(InsightGuardInput(question=normalized, reply=reply, classify=classify, context=context, plan=plan, call_llm=call_llm, facts=metric_facts or key_facts))
    if plan.use_critic:
        if observer:
            observer("critic", "reviewing")
        critic_prompt = prompts.CRITIC_PROMPT + "\nQuestion: " + normalized + "\nAnswer: " + reply
        critic_raw = await call_llm(prompts.CRITIC_SYSTEM, critic_prompt, context=context, model=plan.model, tag="critic")
        critic = _parse_json_block(critic_raw, fallback={})
        if critic.get("issues"):
            revise_prompt = prompts.REVISION_PROMPT + "\nQuestion: " + normalized + "\nDraft: " + reply + "\nCritique: " + json.dumps(critic)
            reply = await call_llm(prompts.REVISION_SYSTEM, revise_prompt, context=context, model=plan.model, tag="revise")
    if plan.use_gap:
        if observer:
            observer("gap", "checking gaps")
        gap_prompt = prompts.EVIDENCE_GAP_PROMPT + "\nQuestion: " + normalized + "\nAnswer: " + reply
        gap_raw = await call_llm(prompts.GAP_SYSTEM, gap_prompt, context=context, model=plan.fast_model, tag="gap")
        gap = _parse_json_block(gap_raw, fallback={})
        note = str(gap.get("note") or "").strip()
        if note:
            reply = f"{reply}\n\n{note}"
    reply = await engine._dedup_reply(reply, plan, call_llm, tag="dedup")
    scores = await engine._score_answer(normalized, reply, plan, call_llm)
    claims = await engine._extract_claims(normalized, reply, summary, facts_used, call_llm)
    return reply, scores, claims

View File

@ -1,35 +1,46 @@
from __future__ import annotations from __future__ import annotations
from dataclasses import dataclass
import re import re
from dataclasses import dataclass
@dataclass(frozen=True) @dataclass(frozen=True)
class IntentMatch: class IntentMatch:
"""Describe the best cluster intent match for a user question."""
kind: str kind: str
score: int score: int
_COUNT_TERMS = r"(how\\s+many|count|number\\s+of|total|totals|tally|amount\\s+of|quantity|sum\\s+of|overall|in\\s+total|all\\s+up)" _COUNT_TERMS = r"(how\s+many|count|number\s+of|total|totals|tally|amount\s+of|quantity|sum\s+of|overall|in\s+total|all\s+up)"
_NODE_TERMS = r"(nodes?|workers?|worker\\s+nodes?|cluster\\s+nodes?|machines?|hosts?|members?|instances?|servers?|agents?|control[-\\s]?plane|control\\s+plane)" _NODE_TERMS = r"(nodes?|workers?|worker\s+nodes?|cluster\s+nodes?|machines?|hosts?|members?|instances?|servers?|agents?|control[-\s]?plane|control\s+plane)"
_READY_TERMS = r"(ready|unready|not\\s+ready|down|offline|not\\s+responding|missing|lost|gone|drain(?:ed|ing)?|cordon(?:ed|ing)?)" _READY_TERMS = r"(ready|unready|not\s+ready|down|offline|not\s+responding|missing|lost|gone|drain(?:ed|ing)?|cordon(?:ed|ing)?)"
_HOTTEST_TERMS = r"(hottest|hot|highest|max(?:imum)?|peak|top|most|worst|spikiest|heaviest|largest|biggest|noisiest|loudest)" _HOTTEST_TERMS = r"(hottest|hot|highest|max(?:imum)?|peak|top|most|worst|spikiest|heaviest|largest|biggest|noisiest|loudest)"
_CPU_TERMS = r"(cpu|processor|processors|compute|core|cores|load|load\\s+avg|load\\s+average|util(?:ization)?|usage)" _CPU_TERMS = r"(cpu|processor|processors|compute|core|cores|load|load\s+avg|load\s+average|util(?:ization)?|usage)"
_RAM_TERMS = r"(ram|memory|mem|heap|rss|resident|swap)" _RAM_TERMS = r"(ram|memory|mem|heap|rss|resident|swap)"
_NET_TERMS = r"(net|network|bandwidth|throughput|traffic|rx|tx|ingress|egress|bits|bytes|packets|pps|bps)" _NET_TERMS = r"(net|network|bandwidth|throughput|traffic|rx|tx|ingress|egress|bits|bytes|packets|pps|bps)"
_IO_TERMS = r"(\\bio\\b|i/o|disk\\s+io|disk\\s+activity|read/?write|storage\\s+io|iops|latency)" _IO_TERMS = r"(\bio\b|i/o|disk\s+io|disk\s+activity|read/?write|storage\s+io|iops|latency)"
_DISK_TERMS = r"(disk|storage|volume|pvc|filesystem|fs|capacity|\\bspace\\b|full|usage)" _DISK_TERMS = r"(disk|storage|volume|pvc|filesystem|fs|capacity|\bspace\b|full|usage)"
_PG_TERMS = r"(postgres|postgresql|pg\\b|database|db|sql|psql)" _PG_TERMS = r"(postgres|postgresql|pg\b|database|db|sql|psql)"
_CONN_TERMS = r"(connections?|conn|pool|sessions?|clients?|active\\s+connections?|open\\s+connections?)" _CONN_TERMS = r"(connections?|conn|pool|sessions?|clients?|active\s+connections?|open\s+connections?)"
_DB_HOT_TERMS = r"(hottest|busiest|most|largest|top|heaviest|noisiest|highest\\s+load)" _DB_HOT_TERMS = r"(hottest|busiest|most|largest|top|heaviest|noisiest|highest\s+load)"
_NAMESPACE_TERMS = r"(namespace|namespaces|ns\\b|tenant|workload\\s+namespace)" _NAMESPACE_TERMS = r"(namespace|namespaces|ns\b|tenant|workload\s+namespace)"
_PODS_TERMS = r"(pods?|workloads?|tasks?|containers?|deployments?|jobs?|cronjobs?|daemonsets?|statefulsets?)" _PODS_TERMS = r"(pods?|workloads?|tasks?|containers?|deployments?|jobs?|cronjobs?|daemonsets?|statefulsets?)"
_NON_RPI_TERMS = r"(non[-\\s]?raspberry|not\\s+raspberry|non[-\\s]?rpi|not\\s+rpi|amd64|x86|x86_64|intel|ryzen|jetson|arm64\\b(?!.*rpi))" _NON_RPI_TERMS = r"(non[-\s]?raspberry|not\s+raspberry|non[-\s]?rpi|not\s+rpi|amd64|x86|x86_64|intel|ryzen|jetson|arm64\b(?!.*rpi))"
_PRESSURE_TERMS = r"(pressure|overload|hotspot|bottleneck|saturation|headroom|strain|stress|critical|warning|at\\s+capacity|near\\s+limit)" _PRESSURE_TERMS = r"(pressure|overload|hotspot|bottleneck|saturation|headroom|strain|stress|critical|warning|at\s+capacity|near\s+limit)"
_HARDWARE_TERMS = r"(hardware|arch(?:itecture)?|platform|mix|profile|node\\s+types?)" _HARDWARE_TERMS = r"(hardware|arch(?:itecture)?|platform|mix|profile|node\s+types?)"
def route_intent(question: str) -> IntentMatch | None: def route_intent(question: str) -> IntentMatch | None:
"""Classify a question into a deterministic cluster intent.
Input:
- `question`: user text to inspect.
Output:
- the highest-confidence `IntentMatch`, or `None` when no intent fits.
"""
text = (question or "").lower() text = (question or "").lower()
if not text: if not text:
return None return None
@ -44,13 +55,13 @@ def route_intent(question: str) -> IntentMatch | None:
return any(_has(pat) for pat in patterns) return any(_has(pat) for pat in patterns)
intents = [ intents = [
(lambda: _all(_COUNT_TERMS) and (_has(_NODE_TERMS) or "cluster" in text), IntentMatch("nodes_count", 90)),
( (
lambda: _all(_READY_TERMS) and (_any(_NODE_TERMS) or "cluster" in text or "workers" in text), lambda: _all(_READY_TERMS) and (_any(_NODE_TERMS) or "cluster" in text or "workers" in text),
IntentMatch("nodes_ready", 85), IntentMatch("nodes_ready", 85),
), ),
(lambda: _all(_COUNT_TERMS) and (_has(_NODE_TERMS) or "cluster" in text), IntentMatch("nodes_count", 90)),
(lambda: _all(_NON_RPI_TERMS) and (_any(_NODE_TERMS) or "cluster" in text), IntentMatch("nodes_non_rpi", 80)), (lambda: _all(_NON_RPI_TERMS) and (_any(_NODE_TERMS) or "cluster" in text), IntentMatch("nodes_non_rpi", 80)),
(lambda: _all(_HARDWARE_TERMS) and (_has(_NODE_TERMS) or "cluster" in text), IntentMatch("hardware_mix", 75)), (lambda: _all(_HARDWARE_TERMS) and (_has(_NODE_TERMS) or "cluster" in text or "mix" in text), IntentMatch("hardware_mix", 75)),
(lambda: _all(_HOTTEST_TERMS, _CPU_TERMS), IntentMatch("hottest_cpu", 80)), (lambda: _all(_HOTTEST_TERMS, _CPU_TERMS), IntentMatch("hottest_cpu", 80)),
(lambda: _all(_HOTTEST_TERMS, _RAM_TERMS), IntentMatch("hottest_ram", 80)), (lambda: _all(_HOTTEST_TERMS, _RAM_TERMS), IntentMatch("hottest_ram", 80)),
(lambda: _all(_HOTTEST_TERMS, _NET_TERMS), IntentMatch("hottest_net", 80)), (lambda: _all(_HOTTEST_TERMS, _NET_TERMS), IntentMatch("hottest_net", 80)),

View File

@ -7,6 +7,8 @@ log = logging.getLogger(__name__)
class KnowledgeBase: class KnowledgeBase:
"""Load Atlas knowledge-base files and expose summary snippets."""
def __init__(self, base_dir: str) -> None: def __init__(self, base_dir: str) -> None:
self._base = Path(base_dir) if base_dir else None self._base = Path(base_dir) if base_dir else None
self._atlas: dict[str, Any] = {} self._atlas: dict[str, Any] = {}
@ -14,6 +16,8 @@ class KnowledgeBase:
self._loaded = False self._loaded = False
def load(self) -> None: def load(self) -> None:
"""Load catalog files once so subsequent reads stay cheap."""
if self._loaded or not self._base: if self._loaded or not self._base:
return return
self._atlas = self._read_json(self._base / "catalog" / "atlas.json") self._atlas = self._read_json(self._base / "catalog" / "atlas.json")
@ -30,6 +34,8 @@ class KnowledgeBase:
return {} return {}
def summary(self) -> str: def summary(self) -> str:
"""Return a short human-readable KB summary for prompt context."""
self.load() self.load()
if not self._atlas: if not self._atlas:
return "" return ""
@ -42,12 +48,14 @@ class KnowledgeBase:
if services: if services:
parts.append(f"Services indexed: {len(services)}.") parts.append(f"Services indexed: {len(services)}.")
if isinstance(self._atlas, dict): if isinstance(self._atlas, dict):
keys = [key for key in self._atlas.keys() if key not in {"sources"}] keys = [key for key in self._atlas if key not in {"sources"}]
if keys: if keys:
parts.append(f"Atlas keys: {', '.join(sorted(keys)[:8])}.") parts.append(f"Atlas keys: {', '.join(sorted(keys)[:8])}.")
return " ".join(parts) return " ".join(parts)
def runbook_titles(self, *, limit: int = 5) -> str: def runbook_titles(self, *, limit: int = 5) -> str:
"""Render the top runbook titles for prompt context."""
self.load() self.load()
if not self._runbooks: if not self._runbooks:
return "" return ""
@ -64,6 +72,8 @@ class KnowledgeBase:
return "Relevant runbooks:\n" + "\n".join(titles[:limit]) return "Relevant runbooks:\n" + "\n".join(titles[:limit])
def runbook_paths(self, *, limit: int = 10) -> list[str]: def runbook_paths(self, *, limit: int = 10) -> list[str]:
"""Return the runbook paths used for exact-path enforcement."""
self.load() self.load()
if not self._runbooks: if not self._runbooks:
return [] return []
@ -77,6 +87,8 @@ class KnowledgeBase:
return paths[:limit] return paths[:limit]
def chunk_lines(self, *, max_files: int = 20, max_chars: int = 6000) -> list[str]: def chunk_lines(self, *, max_files: int = 20, max_chars: int = 6000) -> list[str]:
"""Collect KB excerpts into prompt-sized chunks."""
self.load() self.load()
if not self._base: if not self._base:
return [] return []

View File

@ -17,6 +17,8 @@ class LLMError(RuntimeError):
class LLMClient: class LLMClient:
"""Wrap the Ollama chat endpoint with retries and fallback-model support."""
def __init__(self, settings: Settings) -> None: def __init__(self, settings: Settings) -> None:
self._settings = settings self._settings = settings
self._timeout = settings.ollama_timeout_sec self._timeout = settings.ollama_timeout_sec
@ -37,6 +39,8 @@ class LLMClient:
model: str | None = None, model: str | None = None,
timeout_sec: float | None = None, timeout_sec: float | None = None,
) -> str: ) -> str:
"""Send a chat request and return the model content text."""
payload = { payload = {
"model": model or self._settings.ollama_model, "model": model or self._settings.ollama_model,
"messages": messages, "messages": messages,
@ -77,6 +81,8 @@ class LLMClient:
def build_messages(system: str, prompt: str, *, context: str | None = None) -> list[dict[str, str]]: def build_messages(system: str, prompt: str, *, context: str | None = None) -> list[dict[str, str]]:
"""Assemble the minimal chat message list used by the answer pipeline."""
messages: list[dict[str, str]] = [{"role": "system", "content": system}] messages: list[dict[str, str]] = [{"role": "system", "content": system}]
if context: if context:
messages.append({"role": "user", "content": "Context (grounded facts):\n" + context}) messages.append({"role": "user", "content": "Context (grounded facts):\n" + context})
@ -85,6 +91,8 @@ def build_messages(system: str, prompt: str, *, context: str | None = None) -> l
def parse_json(text: str, *, fallback: dict[str, Any] | None = None) -> dict[str, Any]: def parse_json(text: str, *, fallback: dict[str, Any] | None = None) -> dict[str, Any]:
"""Parse a JSON blob from model output and fall back to a safe default."""
try: try:
raw = text.strip() raw = text.strip()
if raw.startswith("`"): if raw.startswith("`"):

View File

@ -253,7 +253,7 @@ CONTRADICTION_PROMPT = (
"Question: {question}\n" "Question: {question}\n"
"Draft: {draft}\n" "Draft: {draft}\n"
"FactsUsed:\n{facts}\n\n" "FactsUsed:\n{facts}\n\n"
"Return JSON: {\"use_facts\": true|false, \"confidence\": 0-100, \"reason\": \"...\"}" "Return JSON: {{\"use_facts\": true|false, \"confidence\": 0-100, \"reason\": \"...\"}}"
) )
CANDIDATE_SELECT_SYSTEM = ( CANDIDATE_SELECT_SYSTEM = (

View File

@ -1,13 +1,17 @@
import json import json
import logging import logging
import sys import sys
from datetime import datetime, timezone from datetime import UTC, datetime
class JsonFormatter(logging.Formatter): class JsonFormatter(logging.Formatter):
"""Emit structured log records for the atlasbot services."""
def format(self, record: logging.LogRecord) -> str: def format(self, record: logging.LogRecord) -> str:
"""Render a log record as JSON for downstream ingestion."""
payload = { payload = {
"timestamp": datetime.now(timezone.utc).isoformat(), "timestamp": datetime.now(UTC).isoformat(),
"level": record.levelname.lower(), "level": record.levelname.lower(),
"logger": record.name, "logger": record.name,
"message": record.getMessage(), "message": record.getMessage(),
@ -21,6 +25,8 @@ class JsonFormatter(logging.Formatter):
def configure_logging(level: str = "INFO") -> None: def configure_logging(level: str = "INFO") -> None:
"""Install JSON logging on the process root logger."""
root = logging.getLogger() root = logging.getLogger()
root.setLevel(level.upper()) root.setLevel(level.upper())
handler = logging.StreamHandler(sys.stdout) handler = logging.StreamHandler(sys.stdout)

View File

@ -17,6 +17,8 @@ log = logging.getLogger(__name__)
def _build_engine(settings) -> AnswerEngine: def _build_engine(settings) -> AnswerEngine:
"""Construct the answer engine from the configured backends."""
kb = KnowledgeBase(settings.kb_dir) kb = KnowledgeBase(settings.kb_dir)
snapshot = SnapshotProvider(settings) snapshot = SnapshotProvider(settings)
llm = LLMClient(settings) llm = LLMClient(settings)
@ -24,6 +26,8 @@ def _build_engine(settings) -> AnswerEngine:
async def main() -> None: async def main() -> None:
"""Start the HTTP API, Matrix bots, and queue worker."""
settings = load_settings() settings = load_settings()
configure_logging("INFO") configure_logging("INFO")
@ -45,14 +49,7 @@ async def main() -> None:
queue = QueueManager(settings, handler) queue = QueueManager(settings, handler)
await queue.start() await queue.start()
async def answer_handler( # noqa: PLR0913 async def answer_handler(question: str, mode: str, history=None, conversation_id=None, snapshot_pin: bool | None = None, observer=None) -> AnswerResult:
question: str,
mode: str,
history=None,
conversation_id=None,
snapshot_pin: bool | None = None,
observer=None,
) -> AnswerResult:
if settings.queue_enabled: if settings.queue_enabled:
payload = await queue.submit( payload = await queue.submit(
{ {
@ -86,6 +83,8 @@ async def main() -> None:
def result_scores(payload: dict[str, object]) -> AnswerScores: def result_scores(payload: dict[str, object]) -> AnswerScores:
"""Coerce a queue payload into the public `AnswerScores` shape."""
scores = payload.get("scores") if isinstance(payload, dict) else None scores = payload.get("scores") if isinstance(payload, dict) else None
if isinstance(scores, dict): if isinstance(scores, dict):
try: try:

View File

@ -15,11 +15,15 @@ log = logging.getLogger(__name__)
class MatrixClient: class MatrixClient:
"""Wrap the Matrix client endpoints used by the bot runtime."""
def __init__(self, settings: Settings, bot: MatrixBotConfig) -> None: def __init__(self, settings: Settings, bot: MatrixBotConfig) -> None:
self._settings = settings self._settings = settings
self._bot = bot self._bot = bot
async def login(self) -> str: async def login(self) -> str:
"""Exchange bot credentials for a Matrix access token."""
payload = { payload = {
"type": "m.login.password", "type": "m.login.password",
"identifier": {"type": "m.id.user", "user": self._bot.username}, "identifier": {"type": "m.id.user", "user": self._bot.username},
@ -33,6 +37,8 @@ class MatrixClient:
return data.get("access_token", "") return data.get("access_token", "")
async def resolve_room(self, token: str) -> str: async def resolve_room(self, token: str) -> str:
"""Resolve the configured room alias into a room id."""
alias = quote(self._settings.room_alias, safe="") alias = quote(self._settings.room_alias, safe="")
url = f"{self._settings.matrix_base}/_matrix/client/v3/directory/room/{alias}" url = f"{self._settings.matrix_base}/_matrix/client/v3/directory/room/{alias}"
headers = {"Authorization": f"Bearer {token}"} headers = {"Authorization": f"Bearer {token}"}
@ -50,12 +56,16 @@ class MatrixClient:
return data.get("room_id", "") return data.get("room_id", "")
async def join_room(self, token: str, room_id: str) -> None: async def join_room(self, token: str, room_id: str) -> None:
"""Join the target room if the bot is not already present."""
url = f"{self._settings.matrix_base}/_matrix/client/v3/rooms/{room_id}/join" url = f"{self._settings.matrix_base}/_matrix/client/v3/rooms/{room_id}/join"
headers = {"Authorization": f"Bearer {token}"} headers = {"Authorization": f"Bearer {token}"}
async with httpx.AsyncClient(timeout=15.0) as client: async with httpx.AsyncClient(timeout=15.0) as client:
await client.post(url, headers=headers) await client.post(url, headers=headers)
async def send_message(self, token: str, room_id: str, text: str) -> None: async def send_message(self, token: str, room_id: str, text: str) -> None:
"""Send a plain text message to the Matrix room."""
url = f"{self._settings.matrix_base}/_matrix/client/v3/rooms/{room_id}/send/m.room.message" url = f"{self._settings.matrix_base}/_matrix/client/v3/rooms/{room_id}/send/m.room.message"
headers = {"Authorization": f"Bearer {token}"} headers = {"Authorization": f"Bearer {token}"}
payload = {"msgtype": "m.text", "body": text} payload = {"msgtype": "m.text", "body": text}
@ -63,6 +73,8 @@ class MatrixClient:
await client.post(url, json=payload, headers=headers) await client.post(url, json=payload, headers=headers)
async def sync(self, token: str, since: str | None) -> dict[str, Any]: async def sync(self, token: str, since: str | None) -> dict[str, Any]:
"""Fetch the incremental Matrix sync payload."""
base = f"{self._settings.matrix_base}/_matrix/client/v3/sync" base = f"{self._settings.matrix_base}/_matrix/client/v3/sync"
params = {"timeout": 30000} params = {"timeout": 30000}
if since: if since:
@ -75,17 +87,9 @@ class MatrixClient:
class MatrixBot: class MatrixBot:
def __init__( """Drive Matrix conversation handling and heartbeat replies."""
self,
settings: Settings, def __init__(self, settings: Settings, bot: MatrixBotConfig, engine: AnswerEngine, answer_handler: Callable[[str, str, list[dict[str, str]] | None, str | None, Callable[[str, str], None] | None], Awaitable[AnswerResult]] | None = None) -> None:
bot: MatrixBotConfig,
engine: AnswerEngine,
answer_handler: Callable[
[str, str, list[dict[str, str]] | None, str | None, Callable[[str, str], None] | None],
Awaitable[AnswerResult],
]
| None = None,
) -> None:
self._settings = settings self._settings = settings
self._bot = bot self._bot = bot
self._engine = engine self._engine = engine
@ -94,6 +98,8 @@ class MatrixBot:
self._history: dict[str, list[dict[str, str]]] = {} self._history: dict[str, list[dict[str, str]]] = {}
async def run(self) -> None: async def run(self) -> None:
"""Continuously bootstrap, sync, and answer Matrix events."""
while True: while True:
try: try:
token = await self._client.login() token = await self._client.login()

View File

@ -1,7 +1,8 @@
import asyncio import asyncio
import json import json
import logging import logging
from typing import Any, Awaitable, Callable from collections.abc import Awaitable, Callable
from typing import Any
from nats.aio.client import Client as NATS from nats.aio.client import Client as NATS
from nats.js.errors import NotFoundError from nats.js.errors import NotFoundError
@ -12,6 +13,8 @@ log = logging.getLogger(__name__)
class QueueManager: class QueueManager:
"""Manage optional NATS-backed work queue processing."""
def __init__(self, settings: Settings, handler: Callable[[dict[str, Any]], Awaitable[dict[str, Any]]]) -> None: def __init__(self, settings: Settings, handler: Callable[[dict[str, Any]], Awaitable[dict[str, Any]]]) -> None:
self._settings = settings self._settings = settings
self._handler = handler self._handler = handler
@ -20,6 +23,8 @@ class QueueManager:
self._worker_task: asyncio.Task | None = None self._worker_task: asyncio.Task | None = None
async def start(self) -> None: async def start(self) -> None:
"""Connect to NATS and start the worker loop when queueing is enabled."""
if not self._settings.queue_enabled: if not self._settings.queue_enabled:
return return
self._nc = NATS() self._nc = NATS()
@ -29,12 +34,16 @@ class QueueManager:
self._worker_task = asyncio.create_task(self._worker_loop()) self._worker_task = asyncio.create_task(self._worker_loop())
async def stop(self) -> None: async def stop(self) -> None:
"""Drain the NATS connection and cancel background work."""
if self._worker_task: if self._worker_task:
self._worker_task.cancel() self._worker_task.cancel()
if self._nc: if self._nc:
await self._nc.drain() await self._nc.drain()
async def submit(self, payload: dict[str, Any]) -> dict[str, Any]: async def submit(self, payload: dict[str, Any]) -> dict[str, Any]:
"""Submit work to NATS or fall back to direct handling."""
if not self._settings.queue_enabled: if not self._settings.queue_enabled:
return await self._handler(payload) return await self._handler(payload)
if not self._nc or not self._js: if not self._nc or not self._js:

File diff suppressed because it is too large Load Diff

View File

@ -0,0 +1,8 @@
"""Snapshot summary builder and text render helpers."""
from .core_a import *
from .core_b import *
from .format_a import *
from .format_b import *
from .format_c import *
from .summary_text import *

View File

@ -0,0 +1,492 @@
from __future__ import annotations

import logging
import time
from collections import Counter
from typing import Any

import httpx

from atlasbot.config import Settings
log = logging.getLogger(__name__)
PVC_USAGE_CRITICAL = 90
_BYTES_KB = 1024
_BYTES_MB = 1024 * 1024
_BYTES_GB = 1024 * 1024 * 1024
_VALUE_PAIR_LEN = 2
class SnapshotProvider:
    """Fetch and cache the Ariadne snapshot used by the answer engine.

    The provider keeps the last good payload and serves it whenever the
    cache is fresh or a refresh fails, so callers degrade gracefully.
    """

    def __init__(self, settings: Settings) -> None:
        self._settings = settings
        # Last successfully fetched snapshot and its fetch timestamp.
        self._cache: dict[str, Any] = {}
        self._cache_ts = 0.0

    def _cache_valid(self) -> bool:
        # TTL is clamped to a five-second floor to avoid hammering Ariadne.
        ttl = max(5, self._settings.snapshot_ttl_sec)
        age = time.monotonic() - self._cache_ts
        return age < ttl

    def get(self) -> dict[str, Any] | None:
        """Return the cached snapshot or refresh it from Ariadne."""
        if self._cache and self._cache_valid():
            return self._cache
        url = self._settings.ariadne_state_url
        if not url:
            return self._cache or None
        token = self._settings.ariadne_state_token
        headers = {"x-internal-token": token} if token else {}
        try:
            response = httpx.get(url, headers=headers, timeout=10.0)
            response.raise_for_status()
            payload = response.json()
        except Exception as exc:  # network boundary: fall back to stale cache
            log.warning("snapshot fetch failed", extra={"extra": {"error": str(exc)}})
            return self._cache or None
        if isinstance(payload, dict):
            self._cache = payload
            self._cache_ts = time.monotonic()
            return payload
        return self._cache or None
def _node_usage_top(series: list[dict[str, Any]]) -> dict[str, Any] | None:
best = None
for entry in series or []:
if not isinstance(entry, dict):
continue
node = entry.get("node")
value = entry.get("value")
try:
numeric = float(value)
except (TypeError, ValueError):
continue
if best is None or numeric > best["value"]:
best = {"node": node, "value": numeric}
return best
def build_summary(snapshot: dict[str, Any] | None) -> dict[str, Any]:
    """Condense a raw snapshot into the summary shape used for prompts.

    Args:
        snapshot: raw Ariadne snapshot payload, or ``None``/empty.

    Returns:
        Flat dict of summary sections; ``{}`` when there is no snapshot.
    """
    if not snapshot:
        return {}
    # Imported lazily: these builders live in sibling modules of the same
    # package, so a module-level import would risk a circular dependency.
    from .core_b import (
        _build_flux,
        _build_hottest,
        _build_namespace_capacity,
        _build_namespace_capacity_summary,
        _build_node_load_summary,
        _build_pvc,
        _build_workloads,
    )
    from .format_c import _build_cluster_watchlist
    nodes_detail = _nodes_detail(snapshot)
    metrics = _metrics(snapshot)
    summary: dict[str, Any] = {}
    # Pass selected raw sections straight through when well-formed.
    if isinstance(snapshot.get("nodes_summary"), dict):
        summary["nodes_summary"] = snapshot.get("nodes_summary")
    if metrics:
        summary["metrics"] = metrics
    if isinstance(snapshot.get("jobs"), dict):
        summary["jobs"] = snapshot.get("jobs")
    # NOTE: update order matters — later builders read keys written earlier
    # (e.g. _build_hardware_usage consumes "hardware_by_node", and
    # _build_cluster_watchlist reads the partially built summary).
    summary.update(_build_nodes(snapshot))
    summary.update(_build_pressure(snapshot))
    summary.update(_build_hardware(nodes_detail))
    summary.update(_build_hardware_by_node(nodes_detail))
    summary.update(_build_hardware_usage(metrics, summary.get("hardware_by_node")))
    summary.update(_build_node_facts(nodes_detail))
    summary.update(_build_node_ages(nodes_detail))
    summary.update(_build_node_taints(nodes_detail))
    summary.update(_build_capacity(metrics))
    summary.update(_build_pods(metrics))
    summary.update(_build_namespace_pods(snapshot))
    summary.update(_build_namespace_nodes(snapshot))
    summary.update(_build_node_pods(snapshot))
    summary.update(_build_node_pods_top(metrics))
    summary.update(_build_pod_issues(snapshot))
    summary.update(_build_workload_health(snapshot))
    summary.update(_build_events(snapshot))
    summary.update(_build_event_summary(snapshot))
    summary.update(_build_postgres(metrics))
    summary.update(_build_hottest(metrics))
    summary.update(_build_pvc(metrics))
    summary.update(_build_namespace_capacity(metrics))
    summary.update(_build_namespace_capacity_summary(metrics))
    summary.update(_build_longhorn(snapshot))
    summary.update(_build_root_disk_headroom(metrics))
    summary.update(_build_node_load(metrics))
    summary.update(_build_node_load_summary(metrics))
    summary.update(_build_cluster_watchlist(summary))
    summary.update(_build_workloads(snapshot))
    summary.update(_build_flux(snapshot))
    # Cluster-level summary fields and lexicon entries are merged last so
    # they can override/augment the derived sections above.
    _merge_cluster_summary(snapshot, summary)
    _augment_lexicon(summary)
    return summary
def _merge_cluster_summary(snapshot: dict[str, Any], summary: dict[str, Any]) -> None:
    """Copy typed fields from the snapshot-level summary into *summary*."""
    raw = snapshot.get("summary")
    cluster_summary = raw if isinstance(raw, dict) else {}
    if not cluster_summary:
        return
    expected_types: dict[str, type] = {
        "signals": list,
        "profiles": dict,
        "inventory": dict,
        "topology": dict,
        "lexicon": dict,
        "cross_stats": dict,
        "baseline_deltas": dict,
        "pod_issue_summary": dict,
        "trend_requests": dict,
        "pod_waiting_trends": dict,
        "pod_terminated_trends": dict,
    }
    _merge_cluster_fields(summary, cluster_summary, expected_types)
def _merge_cluster_fields(summary: dict[str, Any], cluster_summary: dict[str, Any], field_types: dict[str, type]) -> None:
for key, expected in field_types.items():
value = cluster_summary.get(key)
if isinstance(value, expected):
summary[key] = value
def _augment_lexicon(summary: dict[str, Any]) -> None:
lexicon = summary.get("lexicon")
if not isinstance(lexicon, dict):
lexicon = {"terms": [], "aliases": {}}
terms = list(lexicon.get("terms") or [])
aliases = dict(lexicon.get("aliases") or {})
hardware = summary.get("hardware") if isinstance(summary.get("hardware"), dict) else {}
hardware_map = {
"rpi5": "Raspberry Pi 5 nodes",
"rpi4": "Raspberry Pi 4 nodes",
"rpi": "Raspberry Pi nodes",
"jetson": "NVIDIA Jetson nodes",
"amd64": "AMD64 nodes",
}
existing_terms = {entry.get("term") for entry in terms if isinstance(entry, dict)}
for key, meaning in hardware_map.items():
if key not in hardware:
continue
if key not in existing_terms:
terms.append({"term": key, "meaning": meaning})
if key not in aliases:
aliases[key] = meaning
if "raspberry pi 5" not in aliases and "rpi5" in hardware:
aliases["raspberry pi 5"] = "rpi5"
if "raspberry pi 4" not in aliases and "rpi4" in hardware:
aliases["raspberry pi 4"] = "rpi4"
lexicon["terms"] = terms
lexicon["aliases"] = aliases
summary["lexicon"] = lexicon
def _nodes_detail(snapshot: dict[str, Any]) -> list[dict[str, Any]]:
items = snapshot.get("nodes_detail")
return items if isinstance(items, list) else []
def _metrics(snapshot: dict[str, Any]) -> dict[str, Any]:
metrics = snapshot.get("metrics")
return metrics if isinstance(metrics, dict) else {}
def _build_nodes(snapshot: dict[str, Any]) -> dict[str, Any]:
nodes_summary = snapshot.get("nodes_summary") if isinstance(snapshot.get("nodes_summary"), dict) else {}
if not nodes_summary:
return {}
return {
"nodes": {
"total": nodes_summary.get("total"),
"ready": nodes_summary.get("ready"),
"not_ready": nodes_summary.get("not_ready"),
}
}
def _build_pressure(snapshot: dict[str, Any]) -> dict[str, Any]:
nodes_summary = snapshot.get("nodes_summary") if isinstance(snapshot.get("nodes_summary"), dict) else {}
pressure = nodes_summary.get("pressure_nodes") if isinstance(nodes_summary.get("pressure_nodes"), dict) else {}
if not pressure:
return {}
return {"pressure_nodes": pressure}
def _build_hardware(nodes_detail: list[dict[str, Any]]) -> dict[str, Any]:
hardware: dict[str, list[str]] = {}
for node in nodes_detail or []:
if not isinstance(node, dict):
continue
name = node.get("name")
hardware_class = node.get("hardware") or "unknown"
if name:
hardware.setdefault(hardware_class, []).append(name)
if not hardware:
return {}
return {"hardware": {key: sorted(value) for key, value in hardware.items()}}
def _build_hardware_by_node(nodes_detail: list[dict[str, Any]]) -> dict[str, Any]:
mapping: dict[str, str] = {}
for node in nodes_detail or []:
if not isinstance(node, dict):
continue
name = node.get("name")
if isinstance(name, str) and name:
hardware = node.get("hardware") or "unknown"
mapping[name] = str(hardware)
return {"hardware_by_node": mapping} if mapping else {}
def _build_hardware_usage(metrics: dict[str, Any], hardware_by_node: dict[str, Any] | None) -> dict[str, Any]: # noqa: C901
if not isinstance(hardware_by_node, dict) or not hardware_by_node:
return {}
node_load = metrics.get("node_load") if isinstance(metrics.get("node_load"), list) else []
if not node_load:
return {}
buckets: dict[str, dict[str, list[float]]] = {}
for entry in node_load:
if not isinstance(entry, dict):
continue
node = entry.get("node")
if not isinstance(node, str) or not node:
continue
hardware = hardware_by_node.get(node, "unknown")
bucket = buckets.setdefault(str(hardware), {"load_index": [], "cpu": [], "ram": [], "net": [], "io": []})
for key in ("load_index", "cpu", "ram", "net", "io"):
value = entry.get(key)
if isinstance(value, (int, float)):
bucket[key].append(float(value))
output: list[dict[str, Any]] = []
for hardware, metrics_bucket in buckets.items():
row: dict[str, Any] = {"hardware": hardware}
for key, values in metrics_bucket.items():
if values:
row[key] = sum(values) / len(values)
output.append(row)
output.sort(key=lambda item: (-(item.get("load_index") or 0), item.get("hardware") or ""))
return {"hardware_usage_avg": output}
def _build_node_ages(nodes_detail: list[dict[str, Any]]) -> dict[str, Any]:
ages: list[dict[str, Any]] = []
for node in nodes_detail or []:
if not isinstance(node, dict):
continue
name = node.get("name")
age = node.get("age_hours")
if name and isinstance(age, (int, float)):
ages.append({"name": name, "age_hours": age})
ages.sort(key=lambda item: -(item.get("age_hours") or 0))
return {"node_ages": ages[:5]} if ages else {}
def _count_values(nodes_detail: list[dict[str, Any]], key: str) -> dict[str, int]:
counts: dict[str, int] = {}
for node in nodes_detail or []:
if not isinstance(node, dict):
continue
value = node.get(key)
if isinstance(value, str) and value:
counts[value] = counts.get(value, 0) + 1
return counts
def _build_node_facts(nodes_detail: list[dict[str, Any]]) -> dict[str, Any]:
    """Summarize per-node platform facts (arch, OS, versions, roles)."""
    if not nodes_detail:
        return {}
    role_counts: dict[str, int] = {}
    for entry in nodes_detail:
        if not isinstance(entry, dict):
            continue
        # Worker nodes are counted both via the flag and any explicit roles.
        if entry.get("is_worker"):
            role_counts["worker"] = role_counts.get("worker", 0) + 1
        raw_roles = entry.get("roles")
        for role in raw_roles if isinstance(raw_roles, list) else []:
            if isinstance(role, str) and role:
                role_counts[role] = role_counts.get(role, 0) + 1
    return {
        "node_arch_counts": _count_values(nodes_detail, "arch"),
        "node_os_counts": _count_values(nodes_detail, "os"),
        "node_kubelet_versions": _count_values(nodes_detail, "kubelet"),
        "node_kernel_versions": _count_values(nodes_detail, "kernel"),
        "node_runtime_versions": _count_values(nodes_detail, "container_runtime"),
        "node_role_counts": role_counts,
    }
def _build_node_taints(nodes_detail: list[dict[str, Any]]) -> dict[str, Any]:
taints: dict[str, list[str]] = {}
for node in nodes_detail or []:
if not isinstance(node, dict):
continue
name = node.get("name")
if not name:
continue
entries = node.get("taints") if isinstance(node.get("taints"), list) else []
for entry in entries:
if not isinstance(entry, dict):
continue
key = entry.get("key")
effect = entry.get("effect")
if isinstance(key, str) and isinstance(effect, str):
label = f"{key}:{effect}"
taints.setdefault(label, []).append(name)
if not taints:
return {}
return {"node_taints": {key: sorted(names) for key, names in taints.items()}}
def _build_root_disk_headroom(metrics: dict[str, Any]) -> dict[str, Any]:
node_usage = metrics.get("node_usage") if isinstance(metrics.get("node_usage"), dict) else {}
disk = node_usage.get("disk") if isinstance(node_usage.get("disk"), list) else []
if not disk:
return {}
entries = []
for entry in disk:
if not isinstance(entry, dict):
continue
node = entry.get("node")
try:
used_pct = float(entry.get("value"))
except (TypeError, ValueError):
continue
headroom = max(0.0, 100.0 - used_pct)
if node:
entries.append({"node": node, "headroom_pct": headroom, "used_pct": used_pct})
entries.sort(key=lambda item: (item.get("headroom_pct") or 0.0, item.get("node") or ""))
return {"root_disk_low_headroom": entries[:5]} if entries else {}
def _build_longhorn(snapshot: dict[str, Any]) -> dict[str, Any]:
longhorn = snapshot.get("longhorn")
return {"longhorn": longhorn} if isinstance(longhorn, dict) and longhorn else {}
def _build_node_load(metrics: dict[str, Any]) -> dict[str, Any]:
node_load = metrics.get("node_load")
if not isinstance(node_load, list) or not node_load:
return {}
return {"node_load": node_load}
def _build_pods(metrics: dict[str, Any]) -> dict[str, Any]:
pods = {
"running": metrics.get("pods_running"),
"pending": metrics.get("pods_pending"),
"failed": metrics.get("pods_failed"),
"succeeded": metrics.get("pods_succeeded"),
}
if not any(value is not None for value in pods.values()):
return {}
return {"pods": pods}
def _build_capacity(metrics: dict[str, Any]) -> dict[str, Any]:
if not metrics:
return {}
capacity = {
"cpu": metrics.get("capacity_cpu"),
"allocatable_cpu": metrics.get("allocatable_cpu"),
"mem_bytes": metrics.get("capacity_mem_bytes"),
"allocatable_mem_bytes": metrics.get("allocatable_mem_bytes"),
"pods": metrics.get("capacity_pods"),
"allocatable_pods": metrics.get("allocatable_pods"),
}
if not any(value is not None for value in capacity.values()):
return {}
return {"capacity": capacity}
def _build_namespace_pods(snapshot: dict[str, Any]) -> dict[str, Any]:
namespaces = snapshot.get("namespace_pods")
if not isinstance(namespaces, list) or not namespaces:
return {}
return {"namespace_pods": namespaces}
def _build_namespace_nodes(snapshot: dict[str, Any]) -> dict[str, Any]:
namespace_nodes = snapshot.get("namespace_nodes")
if not isinstance(namespace_nodes, list) or not namespace_nodes:
return {}
return {"namespace_nodes": namespace_nodes}
def _build_node_pods(snapshot: dict[str, Any]) -> dict[str, Any]:
node_pods = snapshot.get("node_pods")
if not isinstance(node_pods, list) or not node_pods:
return {}
return {"node_pods": node_pods}
def _build_node_pods_top(metrics: dict[str, Any]) -> dict[str, Any]:
top = metrics.get("node_pods_top")
if not isinstance(top, list) or not top:
return {}
return {"node_pods_top": top}
def _build_pod_issues(snapshot: dict[str, Any]) -> dict[str, Any]:
pod_issues = snapshot.get("pod_issues")
if not isinstance(pod_issues, dict) or not pod_issues:
return {}
return {"pod_issues": pod_issues}
def _build_workload_health(snapshot: dict[str, Any]) -> dict[str, Any]:
health = snapshot.get("workloads_health")
if not isinstance(health, dict) or not health:
return {}
deployments = health.get("deployments")
statefulsets = health.get("statefulsets")
daemonsets = health.get("daemonsets")
if not isinstance(deployments, dict) or not isinstance(statefulsets, dict) or not isinstance(daemonsets, dict):
return {}
return {
"workloads_health": {
"deployments": deployments,
"statefulsets": statefulsets,
"daemonsets": daemonsets,
}
}
def _build_events(snapshot: dict[str, Any]) -> dict[str, Any]:
events = snapshot.get("events")
if not isinstance(events, dict) or not events:
return {}
return {"events": events}
def _build_event_summary(snapshot: dict[str, Any]) -> dict[str, Any]:
events = snapshot.get("events")
if not isinstance(events, dict) or not events:
return {}
summary = {}
if isinstance(events.get("warnings_top_reason"), dict):
summary["warnings_top_reason"] = events.get("warnings_top_reason")
if events.get("warnings_latest"):
summary["warnings_latest"] = events.get("warnings_latest")
return {"event_summary": summary} if summary else {}
def _build_postgres(metrics: dict[str, Any]) -> dict[str, Any]:
postgres = metrics.get("postgres_connections") if isinstance(metrics.get("postgres_connections"), dict) else {}
if not postgres:
return {}
return {
"postgres": {
"used": postgres.get("used"),
"max": postgres.get("max"),
"hottest_db": postgres.get("hottest_db"),
"by_db": postgres.get("by_db"),
}
}

View File

@ -0,0 +1,57 @@
from __future__ import annotations
from typing import Any
from .core_a import _node_usage_top
def _build_hottest(metrics: dict[str, Any]) -> dict[str, Any]:
    """Pick the busiest node per resource dimension."""
    raw = metrics.get("node_usage")
    node_usage = raw if isinstance(raw, dict) else {}
    hottest = {
        dim: top
        for dim in ("cpu", "ram", "net", "io", "disk")
        if (top := _node_usage_top(node_usage.get(dim, [])))
    }
    return {"hottest": hottest} if hottest else {}
def _build_pvc(metrics: dict[str, Any]) -> dict[str, Any]:
pvc_usage = metrics.get("pvc_usage_top") if isinstance(metrics.get("pvc_usage_top"), list) else []
if not pvc_usage:
return {}
return {"pvc_usage_top": pvc_usage}
def _build_namespace_capacity(metrics: dict[str, Any]) -> dict[str, Any]:
capacity = metrics.get("namespace_capacity")
if not isinstance(capacity, list) or not capacity:
return {}
return {"namespace_capacity": capacity}
def _build_namespace_capacity_summary(metrics: dict[str, Any]) -> dict[str, Any]:
summary = metrics.get("namespace_capacity_summary")
if not isinstance(summary, dict) or not summary:
return {}
return {"namespace_capacity_summary": summary}
def _build_node_load_summary(metrics: dict[str, Any]) -> dict[str, Any]:
summary = metrics.get("node_load_summary")
if not isinstance(summary, dict) or not summary:
return {}
return {"node_load_summary": summary}
def _build_workloads(snapshot: dict[str, Any]) -> dict[str, Any]:
workloads = snapshot.get("workloads") if isinstance(snapshot.get("workloads"), list) else []
return {"workloads": workloads}
def _build_flux(snapshot: dict[str, Any]) -> dict[str, Any]:
flux = snapshot.get("flux") if isinstance(snapshot.get("flux"), dict) else {}
return {"flux": flux}
# Export every non-dunder module-level name so the package __init__ can pull
# these underscore-prefixed builders in via ``from .core_b import *`` (star
# import normally skips ``_``-prefixed names unless __all__ lists them).
# NOTE(review): this also exports imported names (e.g. ``Any``,
# ``_node_usage_top``) — presumably harmless; confirm intended.
__all__ = [name for name in globals() if not name.startswith("__")]

View File

@ -0,0 +1,497 @@
from __future__ import annotations
from typing import Any
from .core_a import _BYTES_GB, _BYTES_KB, _BYTES_MB
from .core_b import *
def _format_float(value: Any) -> str:
try:
numeric = float(value)
except (TypeError, ValueError):
return str(value)
return f"{numeric:.2f}".rstrip("0").rstrip(".")
def _format_rate_bytes(value: Any) -> str:
    """Render a bytes-per-second rate in B/s, KB/s, or MB/s."""
    try:
        rate = float(value)
    except (TypeError, ValueError):
        return str(value)
    for threshold, unit in ((_BYTES_MB, "MB/s"), (_BYTES_KB, "KB/s")):
        if rate >= threshold:
            return f"{rate / threshold:.2f} {unit}"
    return f"{rate:.2f} B/s"
def _format_bytes(value: Any) -> str:
    """Render a byte count in B, KB, MB, or GB."""
    try:
        size = float(value)
    except (TypeError, ValueError):
        return str(value)
    for threshold, unit in ((_BYTES_GB, "GB"), (_BYTES_MB, "MB"), (_BYTES_KB, "KB")):
        if size >= threshold:
            return f"{size / threshold:.2f} {unit}"
    return f"{size:.2f} B"
def _format_kv_map(values: dict[str, Any]) -> str:
parts = []
for key, value in values.items():
parts.append(f"{key}={value}")
return ", ".join(parts)
def _format_names(names: list[str]) -> str:
if not names:
return ""
return ", ".join(sorted(names))
def _append_nodes(lines: list[str], summary: dict[str, Any]) -> None:  # noqa: C901
    """Append node-count lines (totals, readiness, arch/role breakdowns)."""
    nodes = summary.get("nodes") if isinstance(summary.get("nodes"), dict) else {}
    if not nodes:
        return
    # Worker readiness is appended only when both counters exist.
    workers = {}
    if isinstance(summary.get("nodes_summary"), dict):
        workers = summary["nodes_summary"].get("workers") or {}
    workers_total = workers.get("total")
    workers_ready = workers.get("ready")
    workers_str = ""
    if workers_total is not None and workers_ready is not None:
        workers_str = f", workers_ready={workers_ready}/{workers_total}"
    total = nodes.get("total")
    ready = nodes.get("ready")
    not_ready = nodes.get("not_ready")
    if not_ready is None:
        not_ready = 0
    lines.append(f"nodes: total={total}, ready={ready}, not_ready={not_ready}{workers_str}")
    # Duplicate the individual counters as standalone lookup lines.
    if total is not None:
        lines.append(f"nodes_total: {total}")
    if ready is not None:
        lines.append(f"nodes_ready: {ready}")
    # not_ready was defaulted to 0 above, so this branch always fires.
    if not_ready is not None:
        lines.append(f"nodes_not_ready_count: {not_ready}")
    # Remaining breakdowns require the richer nodes_summary section.
    if not isinstance(summary.get("nodes_summary"), dict):
        return
    not_ready_names = summary["nodes_summary"].get("not_ready_names") or []
    if not_ready_names:
        lines.append("nodes_not_ready: " + _format_names(not_ready_names))
    by_arch = summary["nodes_summary"].get("by_arch") or {}
    if isinstance(by_arch, dict) and by_arch:
        lines.append("archs: " + _format_kv_map(by_arch))
    by_role = summary["nodes_summary"].get("by_role") or {}
    if isinstance(by_role, dict) and by_role:
        lines.append("roles: " + _format_kv_map(by_role))
def _append_hardware(lines: list[str], summary: dict[str, Any]) -> None:
    """Append one line counting nodes per hardware class, with names."""
    hardware = summary.get("hardware")
    if not isinstance(hardware, dict) or not hardware:
        return
    rendered = []
    for hw_class, members in hardware.items():
        if not isinstance(members, list):
            continue
        entry = f"{hw_class}={len(members)}"
        names = _format_names([str(member) for member in members if member])
        if names:
            entry = f"{entry} ({names})"
        rendered.append(entry)
    if rendered:
        lines.append("hardware: " + "; ".join(sorted(rendered)))
def _append_hardware_groups(lines: list[str], summary: dict[str, Any]) -> None:
    """Append a line listing node names under each hardware class."""
    hardware = summary.get("hardware")
    if not isinstance(hardware, dict) or not hardware:
        return
    rendered = []
    for hw_class, members in hardware.items():
        if not isinstance(members, list):
            continue
        names = _format_names([str(member) for member in members if member])
        if names:
            rendered.append(f"{hw_class}={names}")
    if rendered:
        lines.append("hardware_nodes: " + "; ".join(sorted(rendered)))
def _append_node_ages(lines: list[str], summary: dict[str, Any]) -> None:
    """Append the top-three node ages as ``name=XX.Xh`` entries."""
    raw = summary.get("node_ages")
    ages = raw if isinstance(raw, list) else []
    rendered = []
    for entry in ages[:3]:
        if not isinstance(entry, dict):
            continue
        name = entry.get("name")
        age = entry.get("age_hours")
        if name and isinstance(age, (int, float)):
            rendered.append(f"{name}={_format_float(age)}h")
    if rendered:
        lines.append("node_age_top: " + "; ".join(rendered))
def _append_node_taints(lines: list[str], summary: dict[str, Any]) -> None:
    """Append taint labels with their node counts and member names."""
    taints = summary.get("node_taints")
    if not isinstance(taints, dict) or not taints:
        return
    rendered = []
    for label, members in taints.items():
        if not isinstance(members, list):
            continue
        names = _format_names([str(member) for member in members if member])
        entry = f"{label}={len(members)}"
        if names:
            entry = f"{entry} ({names})"
        rendered.append(entry)
    if rendered:
        lines.append("node_taints: " + "; ".join(sorted(rendered)))
def _append_node_facts(lines: list[str], summary: dict[str, Any]) -> None:
def top_counts(label: str, counts: dict[str, int], limit: int = 4) -> None:
if not counts:
return
top = sorted(counts.items(), key=lambda item: (-item[1], item[0]))[:limit]
rendered = "; ".join([f"{name}={count}" for name, count in top])
if rendered:
lines.append(f"{label}: {rendered}")
top_counts("node_arch", summary.get("node_arch_counts") or {})
top_counts("node_os", summary.get("node_os_counts") or {})
top_counts("node_kubelet_versions", summary.get("node_kubelet_versions") or {})
top_counts("node_kernel_versions", summary.get("node_kernel_versions") or {})
top_counts("node_runtime_versions", summary.get("node_runtime_versions") or {})
top_counts("node_roles", summary.get("node_role_counts") or {})
def _append_pressure(lines: list[str], summary: dict[str, Any]) -> None:
    """Append pressure conditions with affected node counts and names."""
    pressure = summary.get("pressure_nodes")
    if not isinstance(pressure, dict) or not pressure:
        return
    rendered = []
    for condition, members in sorted(pressure.items()):
        if not members:
            continue
        names = _format_names([str(member) for member in members if member])
        entry = f"{condition}={len(members)}"
        if names:
            entry = f"{entry} ({names})"
        rendered.append(entry)
    if rendered:
        lines.append("node_pressure: " + "; ".join(rendered))
def _append_pods(lines: list[str], summary: dict[str, Any]) -> None:
pods = summary.get("pods") if isinstance(summary.get("pods"), dict) else {}
if not pods:
return
lines.append(
"pods: running={running}, pending={pending}, failed={failed}, succeeded={succeeded}".format(
running=pods.get("running"),
pending=pods.get("pending"),
failed=pods.get("failed"),
succeeded=pods.get("succeeded"),
)
)
def _append_capacity(lines: list[str], summary: dict[str, Any]) -> None:
    """Append a capacity line mixing count and byte formatting."""
    capacity = summary.get("capacity")
    if not isinstance(capacity, dict) or not capacity:
        return
    # (source key, output label, formatter) — order fixed for stable output.
    renderers = (
        ("cpu", "cpu", _format_float),
        ("allocatable_cpu", "alloc_cpu", _format_float),
        ("mem_bytes", "mem", _format_bytes),
        ("allocatable_mem_bytes", "alloc_mem", _format_bytes),
        ("pods", "pods", _format_float),
        ("allocatable_pods", "alloc_pods", _format_float),
    )
    rendered = [
        f"{label}={render(capacity.get(key))}"
        for key, label, render in renderers
        if capacity.get(key) is not None
    ]
    if rendered:
        lines.append("capacity: " + "; ".join(rendered))
def _append_namespace_pods(lines: list[str], summary: dict[str, Any]) -> None:
namespaces = summary.get("namespace_pods")
if not isinstance(namespaces, list) or not namespaces:
return
top = sorted(
(item for item in namespaces if isinstance(item, dict)),
key=lambda item: (-int(item.get("pods_total") or 0), item.get("namespace") or ""),
)[:8]
parts = []
for item in top:
name = item.get("namespace")
total = item.get("pods_total")
running = item.get("pods_running")
if not name:
continue
label = f"{name}={total}"
if running is not None:
label = f"{label} (running={running})"
parts.append(label)
if parts:
lines.append("namespaces_top: " + "; ".join(parts))
def _append_namespace_nodes(lines: list[str], summary: dict[str, Any]) -> None:
namespace_nodes = summary.get("namespace_nodes")
if not isinstance(namespace_nodes, list) or not namespace_nodes:
return
top = sorted(
(item for item in namespace_nodes if isinstance(item, dict)),
key=lambda item: (-int(item.get("pods_total") or 0), item.get("namespace") or ""),
)[:8]
parts = []
for item in top:
namespace = item.get("namespace")
pods_total = item.get("pods_total")
primary = item.get("primary_node")
if namespace:
label = f"{namespace}={pods_total}"
if primary:
label = f"{label} (primary={primary})"
parts.append(label)
if parts:
lines.append("namespace_nodes_top: " + "; ".join(parts))
def _append_node_pods(lines: list[str], summary: dict[str, Any]) -> None:  # noqa: C901
    """Append per-node pod leaderboards: top-8 list, single max, namespaces."""
    node_pods = summary.get("node_pods")
    if not isinstance(node_pods, list) or not node_pods:
        return
    # Normalize pods_total to int (entries with non-numeric totals are dropped
    # from the leaderboard) while keeping the remaining entry fields intact.
    sortable: list[dict[str, Any]] = []
    for item in node_pods:
        if not isinstance(item, dict):
            continue
        try:
            pods_value = int(item.get("pods_total") or 0)
        except (TypeError, ValueError):
            continue
        sortable.append({**item, "pods_total": pods_value})
    top = sorted(sortable, key=lambda item: (-int(item.get("pods_total") or 0), item.get("node") or ""))[:8]
    # Max entry is computed separately from the raw list; note int(None)
    # raises TypeError here, so entries without a pods_total are skipped
    # (unlike above, where a missing total defaults to 0).
    max_entry = None
    for entry in node_pods:
        if not isinstance(entry, dict):
            continue
        pods_total = entry.get("pods_total")
        try:
            pods_value = int(pods_total)
        except (TypeError, ValueError):
            continue
        if max_entry is None or pods_value > max_entry["pods_total"]:
            max_entry = {
                "node": entry.get("node"),
                "pods_total": pods_value,
                "namespaces_top": entry.get("namespaces_top") or [],
            }
    # Render the top-8 leaderboard line.
    parts = []
    for item in top:
        node = item.get("node")
        pods_total = item.get("pods_total")
        namespaces = item.get("namespaces_top") or []
        ns_label = ""
        if namespaces:
            # namespaces_top entries are (name, count) pairs.
            ns_label = ", ".join([f"{name}={count}" for name, count in namespaces])
        if node:
            label = f"{node}={pods_total}"
            if ns_label:
                label = f"{label} ({ns_label})"
            parts.append(label)
    if parts:
        lines.append("node_pods_top: " + "; ".join(parts))
    # Render the single busiest node, if it has a string node name.
    if max_entry and isinstance(max_entry.get("node"), str):
        ns_label = ""
        namespaces = max_entry.get("namespaces_top") or []
        if namespaces:
            ns_label = ", ".join([f"{name}={count}" for name, count in namespaces])
        label = f"{max_entry.get('node')}={max_entry.get('pods_total')}"
        if ns_label:
            label = f"{label} ({ns_label})"
        lines.append("node_pods_max: " + label)
    # One per-node namespace breakdown line for each top entry that has one.
    for item in top:
        node = item.get("node")
        namespaces = item.get("namespaces_top") or []
        if not node or not namespaces:
            continue
        ns_label = ", ".join([f"{name}={count}" for name, count in namespaces])
        lines.append(f"node_namespaces_top: {node} ({ns_label})")
def _append_pod_issues(lines: list[str], summary: dict[str, Any]) -> None:
    """Append every available pod-issue line in a fixed order."""
    pod_issues = summary.get("pod_issues")
    if not isinstance(pod_issues, dict) or not pod_issues:
        return
    renderers = (
        _format_pod_issue_counts,
        _format_pod_issue_top,
        _format_pod_pending_oldest,
        _format_pod_pending_over_15m,
        _format_pod_waiting_reasons,
    )
    for render in renderers:
        line = render(pod_issues)
        if line:
            lines.append(line)
def _format_pod_issue_counts(pod_issues: dict[str, Any]) -> str:
counts = pod_issues.get("counts") if isinstance(pod_issues.get("counts"), dict) else {}
if not counts:
return ""
parts = []
for key in ("Failed", "Pending", "Unknown"):
if key in counts:
parts.append(f"{key}={counts.get(key)}")
return "pod_issues: " + "; ".join(parts) if parts else ""
def _format_pod_issue_top(pod_issues: dict[str, Any]) -> str:
items = pod_issues.get("items") if isinstance(pod_issues.get("items"), list) else []
if not items:
return ""
top = []
for item in items[:5]:
if not isinstance(item, dict):
continue
namespace = item.get("namespace")
pod = item.get("pod")
if not namespace or not pod:
continue
phase = item.get("phase") or ""
restarts = item.get("restarts") or 0
top.append(f"{namespace}/{pod}({phase},r={restarts})")
return "pod_issues_top: " + "; ".join(top) if top else ""
def _format_pod_pending_oldest(pod_issues: dict[str, Any]) -> str:
    """Render the five oldest pending pods with age and optional reason."""
    raw = pod_issues.get("pending_oldest")
    pending = raw if isinstance(raw, list) else []
    rendered = []
    for item in pending[:5]:
        if not isinstance(item, dict):
            continue
        namespace = item.get("namespace")
        pod = item.get("pod")
        age = item.get("age_hours")
        if not (namespace and pod) or age is None:
            continue
        label = f"{namespace}/{pod}={_format_float(age)}h"
        reason = item.get("reason") or ""
        if reason:
            label = f"{label} ({reason})"
        rendered.append(label)
    return "pods_pending_oldest: " + "; ".join(rendered) if rendered else ""
def _format_pod_waiting_reasons(pod_issues: dict[str, Any]) -> str:
reasons = pod_issues.get("waiting_reasons") if isinstance(pod_issues.get("waiting_reasons"), dict) else {}
if not reasons:
return ""
pairs = sorted(reasons.items(), key=lambda item: (-item[1], item[0]))[:5]
return "pod_waiting_reasons: " + "; ".join([f"{key}={val}" for key, val in pairs])
def _format_pod_pending_over_15m(pod_issues: dict[str, Any]) -> str:
count = pod_issues.get("pending_over_15m")
if count is None:
return ""
try:
count_val = int(count)
except (TypeError, ValueError):
return ""
return f"pods_pending_over_15m: {count_val}"
def _append_workload_health(lines: list[str], summary: dict[str, Any]) -> None:
health = summary.get("workloads_health") if isinstance(summary.get("workloads_health"), dict) else {}
if not health:
return
deployments = health.get("deployments") if isinstance(health.get("deployments"), dict) else {}
statefulsets = health.get("statefulsets") if isinstance(health.get("statefulsets"), dict) else {}
daemonsets = health.get("daemonsets") if isinstance(health.get("daemonsets"), dict) else {}
total_not_ready = 0
for entry in (deployments, statefulsets, daemonsets):
total_not_ready += int(entry.get("not_ready") or 0)
lines.append(
"workloads_not_ready: "
f"deployments={deployments.get('not_ready', 0)}, "
f"statefulsets={statefulsets.get('not_ready', 0)}, "
f"daemonsets={daemonsets.get('not_ready', 0)} "
f"(total={total_not_ready})"
)
def _append_node_usage_stats(lines: list[str], summary: dict[str, Any]) -> None:
    """Append cluster-average node usage for cpu/ram/net/io/disk."""
    metrics = summary.get("metrics")
    metrics = metrics if isinstance(metrics, dict) else {}
    stats = metrics.get("node_usage_stats")
    if not isinstance(stats, dict) or not stats:
        return
    rendered: list[str] = []
    for name in ("cpu", "ram", "net", "io", "disk"):
        block = stats.get(name)
        if not isinstance(block, dict):
            continue
        avg = block.get("avg")
        if avg is None:
            continue
        # net/io are byte-rate values; the rest are plain numbers.
        text = _format_rate_bytes(avg) if name in {"net", "io"} else _format_float(avg)
        rendered.append(f"{name}={text}")
    if rendered:
        lines.append("node_usage_avg: " + "; ".join(rendered))
def _append_events(lines: list[str], summary: dict[str, Any]) -> None:
events = summary.get("events") if isinstance(summary.get("events"), dict) else {}
if not events:
return
total = events.get("warnings_total")
by_reason = events.get("warnings_by_reason") if isinstance(events.get("warnings_by_reason"), dict) else {}
if total is None:
return
if by_reason:
top = sorted(by_reason.items(), key=lambda item: (-item[1], item[0]))[:3]
reasons = "; ".join([f"{reason}={count}" for reason, count in top])
lines.append(f"warnings: total={total}; top={reasons}")
else:
lines.append(f"warnings: total={total}")
def _append_pvc_usage(lines: list[str], summary: dict[str, Any]) -> None:
    """Append the fullest PVCs as `namespace/claim=usage%`."""
    entries = summary.get("pvc_usage_top")
    if not isinstance(entries, list) or not entries:
        return
    rendered: list[str] = []
    for item in entries:
        if not isinstance(item, dict):
            continue
        meta = item.get("metric")
        meta = meta if isinstance(meta, dict) else {}
        ns = meta.get("namespace")
        claim = meta.get("persistentvolumeclaim")
        if ns and claim:
            rendered.append(f"{ns}/{claim}={_format_float(item.get('value'))}%")
    if rendered:
        lines.append("pvc_usage_top: " + "; ".join(rendered))
def _append_root_disk_headroom(lines: list[str], summary: dict[str, Any]) -> None:
    """Append nodes whose root disk headroom is running low, as `node=pct%`."""
    entries = summary.get("root_disk_low_headroom")
    if not isinstance(entries, list) or not entries:
        return
    rendered: list[str] = []
    for item in entries:
        if not isinstance(item, dict):
            continue
        node = item.get("node")
        pct = item.get("headroom_pct")
        if node and pct is not None:
            rendered.append(f"{node}={_format_float(pct)}%")
    if rendered:
        lines.append("root_disk_low_headroom: " + "; ".join(rendered))
# Export every module-level name except dunders. Private "_" helpers are
# intentionally included so companion formatter modules can pull them in
# via `from <this module> import *`.
__all__ = [name for name in globals() if not name.startswith("__")]

View File

@ -0,0 +1,435 @@
from __future__ import annotations
from typing import Any
from .core_a import _VALUE_PAIR_LEN
from .format_a import *
def _append_namespace_metric_series(
lines: list[str],
label: str,
entries: list[Any],
formatter: Any,
) -> None:
parts = []
for entry in entries:
if not isinstance(entry, dict):
continue
metric = entry.get("metric") if isinstance(entry.get("metric"), dict) else {}
namespace = metric.get("namespace")
value = entry.get("value")
if namespace:
parts.append(f"{namespace}={formatter(value)}")
if parts:
lines.append(f"{label}: " + "; ".join(parts))
def _append_longhorn(lines: list[str], summary: dict[str, Any]) -> None:  # noqa: C901
    """Append Longhorn volume totals, state/robustness maps, and unhealthy volumes."""
    longhorn = summary.get("longhorn")
    if not isinstance(longhorn, dict) or not longhorn:
        return
    total = longhorn.get("total")
    attached = longhorn.get("attached_count")
    detached = longhorn.get("detached_count")
    degraded = longhorn.get("degraded_count")
    if total is not None:
        if attached is None and detached is None and degraded is None:
            # Only an aggregate unhealthy count is present in this shape.
            unhealthy_count = longhorn.get("unhealthy_count")
            lines.append(f"longhorn: total={total}, unhealthy={unhealthy_count if unhealthy_count is not None else 0}")
        else:
            lines.append(
                f"longhorn: total={total}, attached={attached if attached is not None else 0}, "
                f"detached={detached if detached is not None else 0}, "
                f"degraded={degraded if degraded is not None else 0}"
            )
    for label, key in (("longhorn_state", "by_state"), ("longhorn_robustness", "by_robustness")):
        mapping = longhorn.get(key)
        if isinstance(mapping, dict) and mapping:
            lines.append(f"{label}: " + _format_kv_map(mapping))
    unhealthy = longhorn.get("unhealthy")
    if not isinstance(unhealthy, list) or not unhealthy:
        return
    rendered: list[str] = []
    for item in unhealthy[:5]:
        if not isinstance(item, dict):
            continue
        name = item.get("name")
        if not name:
            continue
        state = item.get("state")
        robustness = item.get("robustness")
        rendered.append(f"{name}({state},{robustness})" if state or robustness else name)
    if rendered:
        lines.append("longhorn_unhealthy_top: " + "; ".join(rendered))
def _append_namespace_usage(lines: list[str], summary: dict[str, Any]) -> None:
    """Append top namespace CPU and memory usage lines."""
    metrics = summary.get("metrics")
    metrics = metrics if isinstance(metrics, dict) else {}
    for key, formatter in (("namespace_cpu_top", _format_float), ("namespace_mem_top", _format_bytes)):
        entries = metrics.get(key)
        _append_namespace_metric_series(lines, key, entries if isinstance(entries, list) else [], formatter)
def _append_namespace_requests(lines: list[str], summary: dict[str, Any]) -> None:
    """Append top namespace CPU and memory request lines."""
    metrics = summary.get("metrics")
    metrics = metrics if isinstance(metrics, dict) else {}
    for key, formatter in (
        ("namespace_cpu_requests_top", _format_float),
        ("namespace_mem_requests_top", _format_bytes),
    ):
        entries = metrics.get(key)
        _append_namespace_metric_series(lines, key, entries if isinstance(entries, list) else [], formatter)
def _append_namespace_io_net(lines: list[str], summary: dict[str, Any]) -> None:
    """Append top namespace network and disk-I/O rate lines."""
    metrics = summary.get("metrics")
    metrics = metrics if isinstance(metrics, dict) else {}
    for key in ("namespace_net_top", "namespace_io_top"):
        entries = metrics.get(key)
        _append_namespace_metric_series(lines, key, entries if isinstance(entries, list) else [], _format_rate_bytes)
def _append_pod_usage(lines: list[str], summary: dict[str, Any]) -> None:
    """Append top pod CPU/memory usage lines, cluster-wide and per node.

    Reads the `pod_cpu_top`, `pod_cpu_top_node`, `pod_mem_top` and
    `pod_mem_top_node` series from `summary["metrics"]`; malformed entries
    are skipped and empty series produce no line.
    """
    metrics = summary.get("metrics") if isinstance(summary.get("metrics"), dict) else {}

    def series(key: str) -> list[Any]:
        # Normalize missing/malformed series to an empty list.
        value = metrics.get(key)
        return value if isinstance(value, list) else []

    # (line label, series entries, value formatter, node-qualified?)
    specs = (
        ("pod_cpu_top", series("pod_cpu_top"), _format_float, False),
        ("pod_cpu_top_node", series("pod_cpu_top_node"), _format_float, True),
        ("pod_mem_top", series("pod_mem_top"), _format_bytes, False),
        ("pod_mem_top_node", series("pod_mem_top_node"), _format_bytes, True),
    )
    for label, entries, formatter, with_node in specs:
        parts = _pod_usage_parts(entries, formatter, with_node)
        if parts:
            lines.append(f"{label}: " + "; ".join(parts))


def _pod_usage_parts(entries: list[Any], formatter: Any, with_node: bool) -> list[str]:
    """Render `ns/pod=value` (or `node:ns/pod=value`) parts for one series."""
    parts: list[str] = []
    for entry in entries:
        if not isinstance(entry, dict):
            continue
        metric = entry.get("metric") if isinstance(entry.get("metric"), dict) else {}
        namespace = metric.get("namespace")
        pod = metric.get("pod")
        value = entry.get("value")
        if not namespace or not pod or value is None:
            continue
        if with_node:
            node = metric.get("node")
            if not node:
                continue
            parts.append(f"{node}:{namespace}/{pod}={formatter(value)}")
        else:
            parts.append(f"{namespace}/{pod}={formatter(value)}")
    return parts
def _append_restarts(lines: list[str], summary: dict[str, Any]) -> None:
    """Append 1h restart leaders per pod and per namespace.

    Both lines fall back to an explicit "none" when nothing qualifies,
    except the namespace line which is omitted when entries exist but none
    render.
    """
    metrics = summary.get("metrics")
    metrics = metrics if isinstance(metrics, dict) else {}
    raw_pods = metrics.get("top_restarts_1h") or []
    if not isinstance(raw_pods, list):
        raw_pods = []
    pod_parts: list[str] = []
    for entry in raw_pods:
        metric = entry.get("metric") if isinstance(entry, dict) else {}
        value = entry.get("value") if isinstance(entry, dict) else []
        if not isinstance(metric, dict) or not isinstance(value, list) or len(value) < _VALUE_PAIR_LEN:
            continue
        ns = metric.get("namespace")
        pod = metric.get("pod")
        if ns and pod:
            pod_parts.append(f"{ns}/{pod}={_format_float(value[1])}")
    lines.append("restarts_1h_top: " + "; ".join(pod_parts) if pod_parts else "restarts_1h_top: none")
    raw_ns = metrics.get("restart_namespace_top") or []
    if isinstance(raw_ns, list) and raw_ns:
        ns_parts: list[str] = []
        for entry in raw_ns:
            metric = entry.get("metric") if isinstance(entry, dict) else {}
            value = entry.get("value")
            ns = metric.get("namespace") if isinstance(metric, dict) else None
            if ns and value is not None:
                ns_parts.append(f"{ns}={_format_float(value)}")
        if ns_parts:
            lines.append("restarts_1h_namespace_top: " + "; ".join(ns_parts))
    else:
        lines.append("restarts_1h_namespace_top: none")
def _append_job_failures(lines: list[str], summary: dict[str, Any]) -> None:
    """Append per-job failure counts over the last 24h."""
    metrics = summary.get("metrics")
    metrics = metrics if isinstance(metrics, dict) else {}
    entries = metrics.get("job_failures_24h")
    if not isinstance(entries, list) or not entries:
        return
    rendered: list[str] = []
    for item in entries:
        if not isinstance(item, dict):
            continue
        meta = item.get("metric")
        meta = meta if isinstance(meta, dict) else {}
        ns = meta.get("namespace")
        # Some series label the job "job_name", others plain "job".
        job = meta.get("job_name") or meta.get("job")
        value = item.get("value")
        if ns and job and value is not None:
            rendered.append(f"{ns}/{job}={_format_float(value)}")
    if rendered:
        lines.append("job_failures_24h: " + "; ".join(rendered))
def _append_jobs(lines: list[str], summary: dict[str, Any]) -> None:
    """Append job totals plus failing and oldest-active job lines."""
    jobs = summary.get("jobs")
    if not isinstance(jobs, dict) or not jobs:
        return
    # Each renderer returns "" when its section has nothing to report.
    for render in (_format_jobs_totals, _format_jobs_failing, _format_jobs_active_oldest):
        line = render(jobs)
        if line:
            lines.append(line)
def _format_jobs_totals(jobs: dict[str, Any]) -> str:
totals = jobs.get("totals") if isinstance(jobs.get("totals"), dict) else {}
if not totals:
return ""
return "jobs: total={total}, active={active}, failed={failed}, succeeded={succeeded}".format(
total=totals.get("total"),
active=totals.get("active"),
failed=totals.get("failed"),
succeeded=totals.get("succeeded"),
)
def _format_jobs_failing(jobs: dict[str, Any]) -> str:
    """Render up to five failing jobs with failure counts and optional age."""
    failing = jobs.get("failing")
    if not isinstance(failing, list) or not failing:
        return ""
    rendered: list[str] = []
    for entry in failing[:5]:
        if not isinstance(entry, dict):
            continue
        ns = entry.get("namespace")
        job = entry.get("job")
        failed = entry.get("failed")
        if not ns or not job or failed is None:
            continue
        text = f"{ns}/{job}={failed}"
        age = entry.get("age_hours")
        if age is not None:
            text = f"{text} ({_format_float(age)}h)"
        rendered.append(text)
    return "jobs_failing_top: " + "; ".join(rendered) if rendered else ""
def _format_jobs_active_oldest(jobs: dict[str, Any]) -> str:
    """Render the longest-running active jobs with their age in hours."""
    entries = jobs.get("active_oldest")
    if not isinstance(entries, list) or not entries:
        return ""
    rendered = [
        f"{entry.get('namespace')}/{entry.get('job')}={_format_float(entry.get('age_hours'))}h"
        for entry in entries[:5]
        if isinstance(entry, dict)
        and entry.get("namespace")
        and entry.get("job")
        and entry.get("age_hours") is not None
    ]
    return "jobs_active_oldest: " + "; ".join(rendered) if rendered else ""
def _append_postgres(lines: list[str], summary: dict[str, Any]) -> None:
    """Append PostgreSQL connection usage, overall and per database."""
    postgres = summary.get("postgres")
    if not isinstance(postgres, dict) or not postgres:
        return
    used = postgres.get("used")
    max_conn = postgres.get("max")
    lines.append(f"postgres: used={used}, max={max_conn}, hottest_db={postgres.get('hottest_db') or ''}")
    if used is not None or max_conn is not None:
        lines.append(f"postgres_connections_total: used={_format_float(used)}, max={_format_float(max_conn)}")
    by_db = postgres.get("by_db")
    if not isinstance(by_db, list) or not by_db:
        return
    rendered: list[str] = []
    for entry in by_db:
        if not isinstance(entry, dict):
            continue
        meta = entry.get("metric")
        meta = meta if isinstance(meta, dict) else {}
        value = entry.get("value")
        # Instant-vector values arrive as [timestamp, value] pairs.
        if isinstance(value, list) and len(value) >= _VALUE_PAIR_LEN:
            value = value[1]
        name = meta.get("datname")
        if name and value is not None:
            rendered.append(f"{name}={_format_float(value)}")
    if rendered:
        lines.append("postgres_connections_by_db: " + "; ".join(rendered))
def _append_hottest(lines: list[str], summary: dict[str, Any]) -> None:
    """Append the hottest node per metric, with hardware labels when known."""
    hottest = summary.get("hottest")
    if not isinstance(hottest, dict) or not hottest:
        return
    hardware_map = summary.get("hardware_by_node")
    if not isinstance(hardware_map, dict):
        hardware_map = {}
    rendered: list[str] = []
    for metric, entry in hottest.items():
        if not isinstance(entry, dict):
            continue
        node = entry.get("node")
        if not node:
            continue
        if metric in {"net", "io"}:
            value = _format_rate_bytes(entry.get("value"))
        else:
            value = _format_float(entry.get("value"))
            # Percent-style metrics get a unit suffix.
            if value and metric in {"cpu", "ram", "disk"}:
                value = f"{value}%"
        hardware = hardware_map.get(node)
        label = f"{node} [{hardware}]" if hardware else node
        rendered.append(f"{metric}={label} ({value})")
    if rendered:
        lines.append("hottest: " + "; ".join(rendered))
def _append_workloads(lines: list[str], summary: dict[str, Any]) -> None:
workloads = summary.get("workloads")
if not isinstance(workloads, list) or not workloads:
return
lines.append(f"workloads: total={len(workloads)}")
top_workloads = sorted(
(item for item in workloads if isinstance(item, dict)),
key=lambda item: (-int(item.get("pods_total") or 0), item.get("workload") or ""),
)[:5]
if not top_workloads:
return
parts = []
for item in top_workloads:
namespace = item.get("namespace")
name = item.get("workload")
pods_total = item.get("pods_total")
primary = item.get("primary_node")
if namespace and name:
label = f"{namespace}/{name}={pods_total}"
if primary:
label = f"{label} (primary={primary})"
parts.append(label)
if parts:
lines.append("workloads_top: " + "; ".join(parts))
def _append_topology(lines: list[str], summary: dict[str, Any]) -> None: # noqa: C901
topology = summary.get("topology") if isinstance(summary.get("topology"), dict) else {}
if not topology:
return
nodes = topology.get("nodes") if isinstance(topology.get("nodes"), list) else []
workloads = topology.get("workloads") if isinstance(topology.get("workloads"), list) else []
if nodes:
parts = []
for entry in nodes[:5]:
if not isinstance(entry, dict):
continue
node = entry.get("node")
top = entry.get("workloads_top") if isinstance(entry.get("workloads_top"), list) else []
if not node or not top:
continue
items = ", ".join([f"{name}({count})" for name, count in top if name and count is not None])
if items:
parts.append(f"{node}={items}")
if parts:
lines.append("node_workloads_top: " + "; ".join(parts))
if workloads:
parts = []
for entry in workloads[:5]:
if not isinstance(entry, dict):
continue
namespace = entry.get("namespace")
name = entry.get("workload")
nodes_top = entry.get("nodes_top") if isinstance(entry.get("nodes_top"), list) else []
if not namespace or not name:
continue
nodes_label = ", ".join([f"{node}:{count}" for node, count in nodes_top if node])
label = f"{namespace}/{name}"
if nodes_label:
label = f"{label} [{nodes_label}]"
parts.append(label)
if parts:
lines.append("workload_nodes_top: " + "; ".join(parts))
def _append_flux(lines: list[str], summary: dict[str, Any]) -> None:
flux = summary.get("flux") if isinstance(summary.get("flux"), dict) else {}
if not flux:
return
not_ready = flux.get("not_ready")
if not_ready is not None:
lines.append(f"flux_not_ready: {not_ready}")
items = flux.get("items")
if isinstance(items, list) and items:
parts = []
for item in items[:10]:
if not isinstance(item, dict):
continue
name = item.get("name") or ""
namespace = item.get("namespace") or ""
reason = item.get("reason") or ""
suspended = item.get("suspended")
label = f"{namespace}/{name}".strip("/")
if reason:
label = f"{label} ({reason})"
if suspended:
label = f"{label} [suspended]"
if label:
parts.append(label)
if parts:
lines.append("flux_not_ready_items: " + "; ".join(parts))
# Export every module-level name except dunders. Private "_" helpers are
# intentionally included so companion formatter modules can pull them in
# via `from <this module> import *`.
__all__ = [name for name in globals() if not name.startswith("__")]

View File

@ -0,0 +1,448 @@
from __future__ import annotations
from typing import Any
from .core_a import PVC_USAGE_CRITICAL
from .format_b import *
def _append_signals(lines: list[str], summary: dict[str, Any]) -> None:
signals = summary.get("signals") if isinstance(summary.get("signals"), list) else []
if not signals:
return
lines.append("signals:")
for entry in signals[:8]:
if not isinstance(entry, dict):
continue
scope = entry.get("scope") or ""
target = entry.get("target") or ""
metric = entry.get("metric") or ""
current = entry.get("current")
delta = entry.get("delta_pct")
severity = entry.get("severity") or ""
detail = f"{scope}:{target} {metric}={current}"
if delta is not None:
detail += f" delta={delta}%"
if severity:
detail += f" severity={severity}"
lines.append(f"- {detail}")
def _append_profiles(lines: list[str], summary: dict[str, Any]) -> None: # noqa: C901
profiles = summary.get("profiles") if isinstance(summary.get("profiles"), dict) else {}
if not profiles:
return
nodes = profiles.get("nodes") if isinstance(profiles.get("nodes"), list) else []
namespaces = profiles.get("namespaces") if isinstance(profiles.get("namespaces"), list) else []
workloads = profiles.get("workloads") if isinstance(profiles.get("workloads"), list) else []
if nodes:
lines.append("node_profiles:")
for entry in nodes[:3]:
if not isinstance(entry, dict):
continue
lines.append(
f"- {entry.get('node')}: load={entry.get('load_index')} cpu={entry.get('cpu')} ram={entry.get('ram')} "
f"pods={entry.get('pods_total')} hw={entry.get('hardware')}"
)
if namespaces:
lines.append("namespace_profiles:")
for entry in namespaces[:3]:
if not isinstance(entry, dict):
continue
lines.append(
f"- {entry.get('namespace')}: pods={entry.get('pods_total')} cpu={entry.get('cpu_usage')} "
f"mem={entry.get('mem_usage')} primary={entry.get('primary_node')}"
)
if workloads:
lines.append("workload_profiles:")
for entry in workloads[:3]:
if not isinstance(entry, dict):
continue
lines.append(
f"- {entry.get('namespace')}/{entry.get('workload')}: pods={entry.get('pods_total')} "
f"running={entry.get('pods_running')} node={entry.get('primary_node')}"
)
def _append_units_windows(lines: list[str], summary: dict[str, Any]) -> None:
    """Append the units and time windows that apply to the metric lines."""
    metrics = summary.get("metrics")
    metrics = metrics if isinstance(metrics, dict) else {}
    units = metrics.get("units")
    units = units if isinstance(units, dict) else {}
    windows = metrics.get("windows")
    windows = windows if isinstance(windows, dict) else {}
    # Hard-coded defaults keep the output self-describing when the summary
    # omits either block.
    lines.append("units: " + _format_kv_map(units) if units else "units: cpu_pct, ram_pct, net=bytes_per_sec, io=bytes_per_sec")
    lines.append("windows: " + _format_kv_map(windows) if windows else "windows: rates=5m, restarts=1h")
def _append_node_load_summary(lines: list[str], summary: dict[str, Any]) -> None:
    """Append the busiest nodes by load index plus any load outliers."""
    node_load = summary.get("node_load_summary")
    if not isinstance(node_load, dict) or not node_load:
        return
    hardware_by_node = summary.get("hardware_by_node")
    if not isinstance(hardware_by_node, dict):
        hardware_by_node = {}
    top = node_load.get("top")
    if isinstance(top, list) and top:
        rendered: list[str] = []
        for entry in top[:5]:
            if not isinstance(entry, dict):
                continue
            node = entry.get("node") or ""
            text = f"{node} idx={_format_float(entry.get('load_index'))}"
            if node and node in hardware_by_node:
                text += f" hw={hardware_by_node.get(node)}"
            pods_total = entry.get("pods_total")
            if isinstance(pods_total, (int, float)):
                text += f" pods={int(pods_total)}"
            text += f" cpu={_format_float(entry.get('cpu'))} ram={_format_float(entry.get('ram'))}"
            text += f" io={_format_rate_bytes(entry.get('io'))} net={_format_rate_bytes(entry.get('net'))}"
            rendered.append(text)
        if rendered:
            lines.append("node_load_top: " + "; ".join(rendered))
    outliers = node_load.get("outliers")
    if isinstance(outliers, list) and outliers:
        names = [
            entry.get("node")
            for entry in outliers
            if isinstance(entry, dict) and isinstance(entry.get("node"), str) and entry.get("node")
        ]
        if names:
            lines.append("node_load_outliers: " + _format_names(names))
def _append_hardware_usage(lines: list[str], summary: dict[str, Any]) -> None:  # noqa: C901
    """Append average usage per hardware class plus the hottest class per metric."""
    usage = summary.get("hardware_usage_avg")
    if not isinstance(usage, list) or not usage:
        return
    rendered: list[str] = []
    # metric -> (hardware label, numeric value) of the current leader
    leaders: dict[str, tuple[str, float]] = {}
    for entry in usage[:5]:
        if not isinstance(entry, dict):
            continue
        hardware = entry.get("hardware")
        if not hardware:
            continue
        load = entry.get("load_index")
        cpu = entry.get("cpu")
        ram = entry.get("ram")
        io = entry.get("io")
        net = entry.get("net")
        rendered.append(
            f"{hardware} idx={_format_float(load)}"
            f" cpu={_format_float(cpu)} ram={_format_float(ram)}"
            f" io={_format_rate_bytes(io)} net={_format_rate_bytes(net)}"
        )
        for metric, value in (("cpu", cpu), ("ram", ram), ("io", io), ("net", net), ("load", load)):
            if not isinstance(value, (int, float)):
                continue
            best = leaders.get(metric)
            if best is None or float(value) > best[1]:
                leaders[metric] = (hardware, float(value))
    if rendered:
        lines.append("hardware_usage_avg: " + "; ".join(rendered))
    if leaders:
        top_parts: list[str] = []
        for metric in ("cpu", "ram", "io", "net", "load"):
            best = leaders.get(metric)
            if not best:
                continue
            hardware, value = best
            shown = _format_rate_bytes(value) if metric in {"io", "net"} else _format_float(value)
            top_parts.append(f"{metric}={hardware} ({shown})")
        if top_parts:
            lines.append("hardware_usage_top: " + "; ".join(top_parts))
def _append_cluster_watchlist(lines: list[str], summary: dict[str, Any]) -> None:
watchlist = summary.get("cluster_watchlist")
if not isinstance(watchlist, list) or not watchlist:
return
lines.append("cluster_watchlist: " + "; ".join(watchlist))
def _append_baseline_deltas(lines: list[str], summary: dict[str, Any]) -> None:
    """Append per-metric baseline deltas for nodes and namespaces."""
    deltas = summary.get("baseline_deltas")
    deltas = deltas if isinstance(deltas, dict) else {}
    for scope in ("nodes", "namespaces"):
        block = deltas.get(scope)
        if not isinstance(block, dict):
            continue
        name_key = "node" if scope == "nodes" else "namespace"
        for metric, entries in block.items():
            if not isinstance(entries, list) or not entries:
                continue
            rendered: list[str] = []
            for entry in entries[:5]:
                if not isinstance(entry, dict):
                    continue
                name = entry.get(name_key)
                delta = entry.get("delta")
                if not isinstance(name, str) or not name or not isinstance(delta, (int, float)):
                    continue
                severity = entry.get("severity")
                suffix = f" ({severity})" if isinstance(severity, str) and severity else ""
                rendered.append(f"{name}={_format_float(delta)}%{suffix}")
            if rendered:
                lines.append(f"{scope}_baseline_delta_{metric}: " + "; ".join(rendered))
def _append_pod_issue_summary(lines: list[str], summary: dict[str, Any]) -> None:
    """Append aggregated pod-issue reasons and per-namespace issue leaders."""
    issues = summary.get("pod_issue_summary")
    issues = issues if isinstance(issues, dict) else {}
    for key, label in (
        ("waiting_reasons_top", "pod_waiting_reasons_top"),
        ("phase_reasons_top", "pod_phase_reasons_top"),
    ):
        entries = issues.get(key)
        line = _reason_line(entries if isinstance(entries, list) else [], label)
        if line:
            lines.append(line)
    namespace_issue = issues.get("namespace_issue_top")
    if isinstance(namespace_issue, dict) and namespace_issue:
        _append_namespace_issue_lines(lines, namespace_issue)
def _reason_line(entries: list[dict[str, Any]], label: str) -> str:
parts = []
for entry in entries[:5]:
if not isinstance(entry, dict):
continue
reason = entry.get("reason")
count = entry.get("count")
if reason:
parts.append(f"{reason}={count}")
if parts:
return f"{label}: " + "; ".join(parts)
return ""
def _append_namespace_issue_lines(lines: list[str], namespace_issue: dict[str, Any]) -> None:
for key, entries in namespace_issue.items():
if not isinstance(entries, list) or not entries:
continue
parts: list[str] = []
for entry in entries[:5]:
if not isinstance(entry, dict):
continue
ns = entry.get("namespace")
value = entry.get("value")
if ns:
parts.append(f"{ns}={value}")
if parts:
lines.append(f"namespace_issue_top_{key}: " + "; ".join(parts))
def _build_cluster_watchlist(summary: dict[str, Any]) -> dict[str, Any]:
    """Build the cluster_watchlist block from the summary's health signals.

    Returns {"cluster_watchlist": [...]} when anything is noteworthy,
    otherwise an empty dict.
    """
    items: list[str] = []

    def block(key: str) -> dict[str, Any]:
        # Normalize missing/malformed sections to an empty dict.
        value = summary.get(key)
        return value if isinstance(value, dict) else {}

    not_ready = int(block("nodes_summary").get("not_ready") or 0)
    if not_ready > 0:
        items.append(f"not_ready_nodes={not_ready}")
    pressure_names = block("pressure_nodes").get("names")
    if isinstance(pressure_names, list) and pressure_names:
        items.append(f"pressure_nodes={len(pressure_names)}")
    pending_over = int(block("pod_issues").get("pending_over_15m") or 0)
    if pending_over > 0:
        items.append(f"pods_pending_over_15m={pending_over}")
    workloads = block("workloads_health")
    workloads_not_ready = 0
    for kind in ("deployments", "statefulsets", "daemonsets"):
        sub = workloads.get(kind)
        sub = sub if isinstance(sub, dict) else {}
        workloads_not_ready += int(sub.get("not_ready") or 0)
    if workloads_not_ready > 0:
        items.append(f"workloads_not_ready={workloads_not_ready}")
    flux_not_ready = int(block("flux").get("not_ready") or 0)
    if flux_not_ready > 0:
        items.append(f"flux_not_ready={flux_not_ready}")
    pvc_usage = summary.get("pvc_usage_top")
    pvc_usage = pvc_usage if isinstance(pvc_usage, list) else []
    if any(isinstance(e, dict) and (e.get("value") or 0) >= PVC_USAGE_CRITICAL for e in pvc_usage):
        items.append(f"pvc_usage>={PVC_USAGE_CRITICAL}%")
    return {"cluster_watchlist": items} if items else {}
def _capacity_ratio_parts(entries: list[dict[str, Any]], ratio_key: str, usage_key: str, req_key: str) -> list[str]:
    """Render `ns=ratio (usage=.. req=..)` parts for the first five entries."""
    rendered: list[str] = []
    for entry in entries[:5]:
        if not isinstance(entry, dict):
            continue
        ns = entry.get("namespace") or ""
        if not ns:
            continue
        rendered.append(
            f"{ns}={_format_float(entry.get(ratio_key))} "
            f"(usage={_format_float(entry.get(usage_key))} req={_format_float(entry.get(req_key))})"
        )
    return rendered
def _capacity_headroom_parts(entries: list[dict[str, Any]]) -> list[str]:
    """Render `ns=headroom` parts for the first five entries."""
    return [
        f"{entry.get('namespace')}={_format_float(entry.get('headroom'))}"
        for entry in entries[:5]
        if isinstance(entry, dict) and (entry.get("namespace") or "")
    ]
def _append_namespace_capacity_summary(
    lines: list[str],
    summary: dict[str, Any],
) -> None:
    """Append namespace capacity ratios, low headroom, and overcommit lines."""
    cap = summary.get("namespace_capacity_summary")
    if not isinstance(cap, dict) or not cap:
        return
    for key, label, ratio_key, usage_key, req_key in (
        ("cpu_ratio_top", "namespace_cpu_ratio_top", "cpu_usage_ratio", "cpu_usage", "cpu_requests"),
        ("mem_ratio_top", "namespace_mem_ratio_top", "mem_usage_ratio", "mem_usage", "mem_requests"),
    ):
        entries = cap.get(key)
        if isinstance(entries, list):
            parts = _capacity_ratio_parts(entries, ratio_key, usage_key, req_key)
            if parts:
                lines.append(f"{label}: " + "; ".join(parts))
    for key, label in (
        ("cpu_headroom_low", "namespace_cpu_headroom_low"),
        ("mem_headroom_low", "namespace_mem_headroom_low"),
    ):
        entries = cap.get(key)
        if isinstance(entries, list):
            parts = _capacity_headroom_parts(entries)
            if parts:
                lines.append(f"{label}: " + "; ".join(parts))
    cpu_over = cap.get("cpu_overcommitted")
    mem_over = cap.get("mem_overcommitted")
    if cpu_over is not None or mem_over is not None:
        lines.append(f"namespace_overcommitted: cpu={cpu_over} mem={mem_over}")
    for key, label in (
        ("cpu_overcommitted_names", "namespace_cpu_overcommitted_names"),
        ("mem_overcommitted_names", "namespace_mem_overcommitted_names"),
    ):
        raw_names = cap.get(key)
        if isinstance(raw_names, list) and raw_names:
            names = [name for name in raw_names if isinstance(name, str) and name]
            if names:
                lines.append(f"{label}: " + _format_names(names))
def _append_workloads_by_namespace(lines: list[str], summary: dict[str, Any]) -> None:
workloads = summary.get("workloads")
if not isinstance(workloads, list) or not workloads:
return
by_ns: dict[str, list[dict[str, Any]]] = {}
for item in workloads:
if not isinstance(item, dict):
continue
ns = item.get("namespace") or ""
name = item.get("workload") or ""
if not ns or not name:
continue
by_ns.setdefault(ns, []).append(item)
for ns, items in sorted(by_ns.items()):
items.sort(
key=lambda item: (-int(item.get("pods_total") or 0), item.get("workload") or "")
)
parts = []
for entry in items[:2]:
name = entry.get("workload") or ""
pods = entry.get("pods_total")
primary = entry.get("primary_node")
label = f"{name}({pods})" if pods is not None else name
if primary:
label = f"{label}@{primary}"
if label:
parts.append(label)
if parts:
lines.append(f"workloads_top_{ns}: " + "; ".join(parts))
def _append_lexicon(lines: list[str], summary: dict[str, Any]) -> None:
    """Append lexicon term and alias lines (capped at 8 terms / 6 aliases).

    Malformed containers degrade to empty collections; entries with a falsy
    term/meaning or key/value are silently dropped.
    """
    lexicon = summary.get("lexicon")
    if not isinstance(lexicon, dict):
        return
    raw_terms = lexicon.get("terms")
    raw_aliases = lexicon.get("aliases")
    terms = raw_terms if isinstance(raw_terms, list) else []
    aliases = raw_aliases if isinstance(raw_aliases, dict) else {}
    for entry in terms[:8]:
        if isinstance(entry, dict):
            term = entry.get("term")
            meaning = entry.get("meaning")
            if term and meaning:
                lines.append(f"lexicon_term: {term} => {meaning}")
    for key, value in list(aliases.items())[:6]:
        if key and value:
            lines.append(f"lexicon_alias: {key} => {value}")
def _cross_node_lines(lines: list[str], cross_stats: dict[str, Any]) -> None:
    """Append `cross_node_<metric>` leader lines from `node_metric_top` (max 10)."""
    raw = cross_stats.get("node_metric_top")
    entries = raw if isinstance(raw, list) else []
    for entry in entries[:10]:
        if not isinstance(entry, dict):
            continue
        metric = entry.get("metric")
        node = entry.get("node")
        if not metric or not node:
            continue
        parts = [
            f"value={_format_float(entry.get('value'))}",
            f"cpu={_format_float(entry.get('cpu'))}",
            f"ram={_format_float(entry.get('ram'))}",
            f"net={_format_float(entry.get('net'))}",
            f"io={_format_float(entry.get('io'))}",
        ]
        pods = entry.get("pods_total")
        if pods is not None:
            parts.append(f"pods={pods}")
        lines.append(f"cross_node_{metric}: {node} " + " ".join(parts))


def _cross_namespace_lines(lines: list[str], cross_stats: dict[str, Any]) -> None:
    """Append `cross_namespace_<metric>` leader lines from `namespace_metric_top` (max 10)."""
    raw = cross_stats.get("namespace_metric_top")
    entries = raw if isinstance(raw, list) else []
    for entry in entries[:10]:
        if not isinstance(entry, dict):
            continue
        metric = entry.get("metric")
        namespace = entry.get("namespace")
        if not metric or not namespace:
            continue
        parts = [
            f"value={_format_float(entry.get('value'))}",
            f"cpu_ratio={_format_float(entry.get('cpu_ratio'))}",
            f"mem_ratio={_format_float(entry.get('mem_ratio'))}",
        ]
        pods = entry.get("pods_total")
        if pods is not None:
            parts.append(f"pods={pods}")
        lines.append(f"cross_namespace_{metric}: {namespace} " + " ".join(parts))


def _cross_pvc_lines(lines: list[str], cross_stats: dict[str, Any]) -> None:
    """Append the busiest PVC usage lines from `pvc_top` (max 5)."""
    raw = cross_stats.get("pvc_top")
    entries = raw if isinstance(raw, list) else []
    for entry in entries[:5]:
        if not isinstance(entry, dict):
            continue
        namespace = entry.get("namespace")
        pvc = entry.get("pvc")
        if namespace and pvc:
            used = entry.get("used_percent")
            lines.append(f"cross_pvc_usage: {namespace}/{pvc} used={_format_float(used)}")


def _append_cross_stats(lines: list[str], summary: dict[str, Any]) -> None:
    """Append cross-cutting leaders (node metrics, namespace metrics, PVCs).

    Delegates to one helper per section; each helper is independently
    bounded and skips malformed entries. Output is identical to the
    previous single-function implementation, but the split removes the
    need for its `# noqa: C901` complexity waiver.
    """
    cross_stats = summary.get("cross_stats")
    if not isinstance(cross_stats, dict):
        return
    _cross_node_lines(lines, cross_stats)
    _cross_namespace_lines(lines, cross_stats)
    _cross_pvc_lines(lines, cross_stats)
__all__ = [name for name in globals() if not name.startswith("__")]

View File

@ -0,0 +1,72 @@
from __future__ import annotations
from typing import Any
from .core_a import *
from .core_b import *
from .format_a import *
from .format_b import *
from .format_c import *
def summary_text(snapshot: dict[str, Any] | None) -> str:
    """Render the snapshot summary into deterministic prompt text.

    Returns an empty string when no summary can be built; otherwise the
    header lines followed by every section in a fixed order.
    """
    summary = build_summary(snapshot)
    if not summary:
        return ""
    lines: list[str] = ["atlas_cluster: Titan Lab Atlas Kubernetes cluster (internal)."]
    meta = snapshot if isinstance(snapshot, dict) else {}
    bits = [
        f"{label}={value}"
        for label, value in (
            ("collected_at", meta.get("collected_at")),
            ("version", meta.get("snapshot_version")),
        )
        if value
    ]
    if bits:
        lines.append("snapshot: " + ", ".join(bits))
    # Section order is part of the contract: the rendered text must stay
    # deterministic so prompts (and their caches) are reproducible.
    sections = (
        _append_nodes,
        _append_hardware,
        _append_hardware_groups,
        _append_lexicon,
        _append_pressure,
        _append_node_facts,
        _append_node_ages,
        _append_node_taints,
        _append_capacity,
        _append_pods,
        _append_namespace_pods,
        _append_namespace_nodes,
        _append_node_pods,
        _append_pod_issues,
        _append_pod_issue_summary,
        _append_workload_health,
        _append_events,
        _append_node_usage_stats,
        _append_namespace_usage,
        _append_namespace_requests,
        _append_namespace_io_net,
        _append_pod_usage,
        _append_restarts,
        _append_job_failures,
        _append_jobs,
        _append_postgres,
        _append_hottest,
        _append_pvc_usage,
        _append_root_disk_headroom,
        _append_namespace_capacity_summary,
        _append_baseline_deltas,
        _append_longhorn,
        _append_workloads,
        _append_topology,
        _append_workloads_by_namespace,
        _append_node_load_summary,
        _append_cluster_watchlist,
        _append_hardware_usage,
        _append_cross_stats,
        _append_flux,
        _append_signals,
        _append_profiles,
        _append_units_windows,
    )
    for append_section in sections:
        append_section(lines, summary)
    return "\n".join(lines)

View File

@ -6,6 +6,17 @@ from typing import Any
class ClaimStore: class ClaimStore:
"""Persist conversation claims for follow-up answers.
Why:
- keep short-lived conversation state durable across turns without
forcing the answer engine to own storage mechanics.
Input/Output:
- accepts a SQLite path and TTL, stores claim payloads, and returns
normalized payload dictionaries when queried.
"""
def __init__(self, path: str, ttl_sec: int) -> None: def __init__(self, path: str, ttl_sec: int) -> None:
self._path = path or ":memory:" self._path = path or ":memory:"
self._ttl = max(60, ttl_sec) self._ttl = max(60, ttl_sec)

21
pyproject.toml Normal file
View File

@ -0,0 +1,21 @@
# Pytest discovers both the regression suite (tests/) and the shared helper
# package (testing/); pythonpath keeps repo-root imports resolvable.
[tool.pytest.ini_options]
testpaths = ["tests", "testing"]
pythonpath = ["."]
[tool.ruff]
line-length = 100
target-version = "py312"
[tool.ruff.lint]
select = ["E", "F", "W", "B", "C90", "I", "PLR", "RUF", "SIM", "UP", "ARG"]
# E501 is ignored in favour of the formatter's line-length setting above.
ignore = ["E501"]
# Per-file waivers: F403/F405 cover the star-import aggregator packages;
# the remaining lists relax complexity/argument rules for legacy hotspots,
# tests, and scripts.
[tool.ruff.lint.per-file-ignores]
"atlasbot/engine/answerer/*.py" = ["F403", "F405", "I001"]
"atlasbot/engine/answerer/__init__.py" = ["C90", "PLR", "SIM", "ARG", "RUF", "UP", "I001"]
"atlasbot/matrix/bot.py" = ["C90", "PLR", "SIM", "ARG", "RUF", "UP", "I001"]
"atlasbot/snapshot/builder/__init__.py" = ["F403", "F405", "I001"]
"atlasbot/snapshot/builder/*.py" = ["F403", "F405", "I001"]
"testing/*.py" = ["PLR0911", "ARG002", "PLR2004"]
"tests/*.py" = ["PLR2004", "I001", "ARG001", "ARG002", "ARG005", "C901", "PLR0915", "UP037"]
"scripts/*.py" = ["PLR0911", "PLR2004"]

42
scripts/check_coverage.py Executable file
View File

@ -0,0 +1,42 @@
#!/usr/bin/env python3
"""Enforce per-file coverage thresholds from SlipCover JSON output."""
from __future__ import annotations
import argparse
import json
from pathlib import Path
def main() -> int:
    """Check each production file against a minimum coverage percentage.

    Reads a SlipCover-style JSON report (``{"files": {path: {"summary":
    {"percent_covered": float}}}}``), keeps only files under ``--root``,
    and prints every file below ``--threshold``.

    Returns:
        0 when all measured files meet the threshold, 1 otherwise.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("coverage_json")
    parser.add_argument("--root", default="atlasbot")
    parser.add_argument("--threshold", type=float, default=95.0)
    args = parser.parse_args()
    data = json.loads(Path(args.coverage_json).read_text(encoding="utf-8"))
    files = data.get("files") if isinstance(data, dict) else {}
    # Reports without a "files" mapping previously yielded None here and
    # crashed on `.items()`; treat anything non-dict as "nothing measured".
    if not isinstance(files, dict):
        files = {}
    violations: list[tuple[float, str]] = []
    prefix = f"{args.root}/"
    for path, payload in sorted(files.items()):
        if not path.startswith(prefix):
            continue
        summary = payload.get("summary") if isinstance(payload, dict) else {}
        percent = summary.get("percent_covered") if isinstance(summary, dict) else None
        # bool is a subclass of int; exclude it so a stray true/false in the
        # report is not read as 1%/0% coverage.
        if isinstance(percent, bool) or not isinstance(percent, (int, float)):
            continue
        if float(percent) < args.threshold:
            violations.append((float(percent), path))
    if violations:
        for percent, path in sorted(violations):
            print(f"{path}: {percent:.2f}% < {args.threshold:.2f}%")
        return 1
    return 0


if __name__ == "__main__":
    raise SystemExit(main())

83
scripts/check_docstrings.py Executable file
View File

@ -0,0 +1,83 @@
#!/usr/bin/env python3
"""Require docstrings on public production APIs."""
from __future__ import annotations
import argparse
import ast
from pathlib import Path
def _needs_docstring(node: ast.AST, *, parent_class: str | None = None) -> bool:
"""Decide whether `node` should carry a contract docstring."""
if isinstance(node, (ast.FunctionDef, ast.AsyncFunctionDef)):
name = node.name
if name.startswith("_") and name != "__init__":
return False
return not (parent_class and name.startswith("_"))
if isinstance(node, ast.ClassDef):
if node.name.startswith("_"):
return False
if any(
(isinstance(dec, ast.Name) and dec.id == "dataclass")
or (isinstance(dec, ast.Call) and isinstance(dec.func, ast.Name) and dec.func.id == "dataclass")
for dec in node.decorator_list
):
return False
if any(
isinstance(base, ast.Name) and base.id in {"Exception", "RuntimeError", "BaseException"}
for base in node.bases
):
return False
return not any(isinstance(base, ast.Name) and base.id == "BaseModel" for base in node.bases)
return False
def _iter_nodes(tree: ast.AST) -> list[tuple[ast.AST, str | None]]:
"""Yield top-level public nodes only.
The gate focuses on the module surface area rather than every internal
method so we can keep contracts on the actual API seams.
"""
items: list[tuple[ast.AST, str | None]] = []
for node in getattr(tree, "body", []):
items.append((node, None))
return items
def main() -> int:
    """Check modules under the production package and report missing contracts.

    Walks every Python file under ``--root`` (skipping caches and venvs),
    prints one line per public class/function without a docstring, and
    returns 1 when any violation was found, else 0.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("--root", default="atlasbot")
    args = parser.parse_args()
    violations: list[str] = []
    for path in sorted(Path(args.root).rglob("*.py")):
        skip_dirs = {"__pycache__", ".venv"}
        if skip_dirs.intersection(path.parts):
            continue
        tree = ast.parse(path.read_text(encoding="utf-8"))
        for node, parent_class in _iter_nodes(tree):
            if not _needs_docstring(node, parent_class=parent_class):
                continue
            if ast.get_docstring(node):
                continue
            if isinstance(node, ast.ClassDef):
                violations.append(f"{path}: class {node.name} is missing a docstring")
            elif isinstance(node, (ast.FunctionDef, ast.AsyncFunctionDef)):
                owner = f"{parent_class}." if parent_class else ""
                violations.append(f"{path}: {owner}{node.name} is missing a docstring")
    for item in violations:
        print(item)
    return 1 if violations else 0


if __name__ == "__main__":
    raise SystemExit(main())

70
scripts/check_file_sizes.py Executable file
View File

@ -0,0 +1,70 @@
#!/usr/bin/env python3
"""Fail when production Python files exceed the configured line budget.
The gate is intentionally narrow:
- it only checks the `atlasbot/` package tree;
- it treats each file independently;
- it keeps the threshold explicit so CI can ratchet without guesswork.
"""
from __future__ import annotations
import argparse
from pathlib import Path
def _count_lines(path: Path) -> int:
    """Return the physical line count of the file at `path`.

    The file is read as UTF-8 and counted with `str.splitlines`, so a
    trailing newline does not add an extra (empty) line.
    """
    content = path.read_text(encoding="utf-8")
    return len(content.splitlines())
def _iter_python_files(root: Path) -> list[Path]:
    """List production Python files under `root` in sorted order.

    Directories named `__pycache__` or `.venv` anywhere in a file's path
    exclude it, as do non-file matches (e.g. odd directory names).
    """
    excluded = {"__pycache__", ".venv"}
    found = [
        path
        for path in root.rglob("*.py")
        if path.is_file() and not excluded.intersection(path.parts)
    ]
    return sorted(found)
def main() -> int:
    """Run the size gate and return a process exit code.

    Prints every file over ``--max-lines`` (largest first) and returns 1,
    or returns 0 silently when every file fits the budget.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("--root", default="atlasbot")
    parser.add_argument("--max-lines", type=int, default=500)
    args = parser.parse_args()
    oversized = [
        (count, path)
        for path in _iter_python_files(Path(args.root))
        if (count := _count_lines(path)) > args.max_lines
    ]
    if not oversized:
        return 0
    for count, path in sorted(oversized, reverse=True):
        print(f"{path}: {count} lines (limit {args.max_lines})")
    return 1


if __name__ == "__main__":
    raise SystemExit(main())

2
testing/__init__.py Normal file
View File

@ -0,0 +1,2 @@
"""Shared testing helpers for atlasbot."""

108
testing/fakes.py Normal file
View File

@ -0,0 +1,108 @@
"""Reusable test doubles and settings factories."""
from __future__ import annotations
import asyncio
from atlasbot.config import Settings
class FakeLLM:
    """Deterministic LLM double for pipeline tests.

    Why:
    - keeps the answer engine tests fast and predictable.
    Input/Output:
    - accepts the same `chat()` signature as the real client;
    - returns canned JSON or text snippets based on the prompt content.
    """

    # Ordered first-match-wins rules: every substring in a rule's needle
    # tuple must appear in the prompt for its response to be returned.
    _RULES: tuple[tuple[tuple[str, ...], str], ...] = (
        (("normalized", "keywords"), '{"normalized":"What is Atlas?","keywords":["atlas"]}'),
        (("needs_snapshot",), '{"needs_snapshot": true, "answer_style": "direct"}'),
        (("sub-questions",), '[{"id":"q1","question":"What is Atlas?","priority":1}]'),
        (("sub-question",), "Atlas has 22 nodes."),
        (("Answer using only the Fact Sheet",), "Atlas has 22 nodes."),
        (("final response",), "Atlas has 22 nodes."),
        (
            ("Score response quality",),
            '{"confidence":80,"relevance":90,"satisfaction":85,"hallucination_risk":"low"}',
        ),
        (("claims list",), '{"claims": []}'),
    )

    def __init__(self) -> None:
        self.calls: list[str] = []

    async def chat(self, messages, *, model=None, timeout_sec=None):
        """Return a prompt-shaped response and remember the last user prompt."""
        prompt = messages[-1]["content"]
        self.calls.append(prompt)
        for needles, response in self._RULES:
            if all(needle in prompt for needle in needles):
                return response
        return "{}"
class SlowFakeLLM(FakeLLM):
    """Variant that sleeps briefly so timeout guards can be exercised."""

    async def chat(self, messages, *, model=None, timeout_sec=None):
        """Delay before delegating so budget handling stays deterministic."""
        await asyncio.sleep(0.02)
        response = await super().chat(messages, model=model, timeout_sec=timeout_sec)
        return response
def build_test_settings() -> Settings:
    """Create a fully populated `Settings` instance for unit tests.

    Every field is pinned to a deterministic, offline value (blank
    endpoints, tiny budgets and caps) so tests never reach real services.
    """
    field_values: dict[str, object] = {
        # Matrix / auth wiring: blanked so no client is constructed.
        "matrix_base": "",
        "auth_base": "",
        "bot_user": "",
        "bot_pass": "",
        "room_alias": "",
        "server_name": "",
        "bot_mentions": (),
        "matrix_bots": (),
        # Model routing: one distinct name per tier.
        "ollama_url": "",
        "ollama_model": "base",
        "ollama_model_fast": "fast",
        "ollama_model_smart": "smart",
        "ollama_model_genius": "genius",
        "ollama_fallback_model": "",
        "ollama_timeout_sec": 1.0,
        "ollama_retries": 0,
        "ollama_api_key": "",
        "http_port": 8090,
        "internal_token": "",
        "kb_dir": "",
        "vm_url": "",
        "ariadne_state_url": "",
        "ariadne_state_token": "",
        "snapshot_ttl_sec": 30,
        "thinking_interval_sec": 30,
        # Per-mode time budgets.
        "quick_time_budget_sec": 15.0,
        "smart_time_budget_sec": 45.0,
        "genius_time_budget_sec": 180.0,
        "conversation_ttl_sec": 300,
        "snapshot_pin_enabled": False,
        "queue_enabled": False,
        "nats_url": "",
        "nats_stream": "",
        "nats_subject": "",
        "nats_result_bucket": "",
        # Per-mode workload caps.
        "fast_max_angles": 1,
        "smart_max_angles": 1,
        "genius_max_angles": 1,
        "fast_max_candidates": 1,
        "smart_max_candidates": 1,
        "genius_max_candidates": 1,
        "fast_llm_calls_max": 9,
        "smart_llm_calls_max": 17,
        "genius_llm_calls_max": 32,
        "llm_limit_multiplier": 1.5,
        "state_db_path": "/tmp/atlasbot_test_state.db",
    }
    return Settings(**field_values)

View File

@ -1,98 +1,21 @@
"""Answer-engine regression tests."""
from __future__ import annotations
import asyncio import asyncio
from dataclasses import replace from dataclasses import replace
from atlasbot.engine.answerer import AnswerEngine from atlasbot.engine.answerer import AnswerEngine
from atlasbot.knowledge.loader import KnowledgeBase from atlasbot.knowledge.loader import KnowledgeBase
from atlasbot.snapshot.builder import SnapshotProvider from atlasbot.snapshot.builder import SnapshotProvider
from atlasbot.config import Settings from testing.fakes import FakeLLM, SlowFakeLLM, build_test_settings
class FakeLLM: def test_engine_answer_basic() -> None:
def __init__(self) -> None: """The quick path should answer from the fact sheet."""
self.calls: list[str] = []
async def chat(self, messages, *, model=None, timeout_sec=None):
prompt = messages[-1]["content"]
self.calls.append(prompt)
if "normalized" in prompt and "keywords" in prompt:
return '{"normalized":"What is Atlas?","keywords":["atlas"]}'
if "needs_snapshot" in prompt:
return '{"needs_snapshot": true, "answer_style": "direct"}'
if "sub-questions" in prompt:
return '[{"id":"q1","question":"What is Atlas?","priority":1}]'
if "sub-question" in prompt:
return "Atlas has 22 nodes."
if "Answer using only the Fact Sheet" in prompt:
return "Atlas has 22 nodes."
if "final response" in prompt:
return "Atlas has 22 nodes."
if "Score response quality" in prompt:
return '{"confidence":80,"relevance":90,"satisfaction":85,"hallucination_risk":"low"}'
if "claims list" in prompt:
return '{"claims": []}'
return "{}"
class SlowFakeLLM(FakeLLM):
async def chat(self, messages, *, model=None, timeout_sec=None):
await asyncio.sleep(0.02)
return await super().chat(messages, model=model, timeout_sec=timeout_sec)
def _settings() -> Settings:
return Settings(
matrix_base="",
auth_base="",
bot_user="",
bot_pass="",
room_alias="",
server_name="",
bot_mentions=(),
matrix_bots=(),
ollama_url="",
ollama_model="base",
ollama_model_fast="fast",
ollama_model_smart="smart",
ollama_model_genius="genius",
ollama_fallback_model="",
ollama_timeout_sec=1.0,
ollama_retries=0,
ollama_api_key="",
http_port=8090,
internal_token="",
kb_dir="",
vm_url="",
ariadne_state_url="",
ariadne_state_token="",
snapshot_ttl_sec=30,
thinking_interval_sec=30,
quick_time_budget_sec=15.0,
smart_time_budget_sec=45.0,
genius_time_budget_sec=180.0,
conversation_ttl_sec=300,
snapshot_pin_enabled=False,
queue_enabled=False,
nats_url="",
nats_stream="",
nats_subject="",
nats_result_bucket="",
fast_max_angles=1,
smart_max_angles=1,
genius_max_angles=1,
fast_max_candidates=1,
smart_max_candidates=1,
genius_max_candidates=1,
fast_llm_calls_max=9,
smart_llm_calls_max=17,
genius_llm_calls_max=32,
llm_limit_multiplier=1.5,
state_db_path="/tmp/atlasbot_test_state.db",
)
def test_engine_answer_basic():
llm = FakeLLM() llm = FakeLLM()
settings = _settings() settings = build_test_settings()
kb = KnowledgeBase("") kb = KnowledgeBase("")
snapshot = SnapshotProvider(settings) snapshot = SnapshotProvider(settings)
engine = AnswerEngine(settings, llm, kb, snapshot) engine = AnswerEngine(settings, llm, kb, snapshot)
@ -101,9 +24,11 @@ def test_engine_answer_basic():
assert "Atlas has 22 nodes" in result.reply assert "Atlas has 22 nodes" in result.reply
def test_smart_mode_uses_factsheet_path(): def test_smart_mode_uses_factsheet_path() -> None:
"""Smart mode should stay on the factsheet branch for direct cluster questions."""
llm = FakeLLM() llm = FakeLLM()
settings = _settings() settings = build_test_settings()
kb = KnowledgeBase("") kb = KnowledgeBase("")
snapshot = SnapshotProvider(settings) snapshot = SnapshotProvider(settings)
engine = AnswerEngine(settings, llm, kb, snapshot) engine = AnswerEngine(settings, llm, kb, snapshot)
@ -113,9 +38,11 @@ def test_smart_mode_uses_factsheet_path():
assert "time budget" not in result.reply.lower() assert "time budget" not in result.reply.lower()
def test_genius_mode_uses_factsheet_path(): def test_genius_mode_uses_factsheet_path() -> None:
"""Genius mode should also return the factsheet answer for the same query."""
llm = FakeLLM() llm = FakeLLM()
settings = _settings() settings = build_test_settings()
kb = KnowledgeBase("") kb = KnowledgeBase("")
snapshot = SnapshotProvider(settings) snapshot = SnapshotProvider(settings)
engine = AnswerEngine(settings, llm, kb, snapshot) engine = AnswerEngine(settings, llm, kb, snapshot)
@ -125,9 +52,11 @@ def test_genius_mode_uses_factsheet_path():
assert "time budget" not in result.reply.lower() assert "time budget" not in result.reply.lower()
def test_plain_math_question_is_rejected_for_cluster_modes(): def test_plain_math_question_is_rejected_for_cluster_modes() -> None:
"""The bot should keep users on cluster questions instead of generic math."""
llm = FakeLLM() llm = FakeLLM()
settings = _settings() settings = build_test_settings()
kb = KnowledgeBase("") kb = KnowledgeBase("")
snapshot = SnapshotProvider(settings) snapshot = SnapshotProvider(settings)
engine = AnswerEngine(settings, llm, kb, snapshot) engine = AnswerEngine(settings, llm, kb, snapshot)
@ -136,9 +65,11 @@ def test_plain_math_question_is_rejected_for_cluster_modes():
assert "focus on Titan cluster operations" in result.reply assert "focus on Titan cluster operations" in result.reply
def test_quick_mode_time_budget_guard(): def test_quick_mode_time_budget_guard() -> None:
"""A slow model call should trip the quick-mode budget guard."""
llm = SlowFakeLLM() llm = SlowFakeLLM()
settings = replace(_settings(), quick_time_budget_sec=0.01) settings = replace(build_test_settings(), quick_time_budget_sec=0.01)
kb = KnowledgeBase("") kb = KnowledgeBase("")
snapshot = SnapshotProvider(settings) snapshot = SnapshotProvider(settings)
engine = AnswerEngine(settings, llm, kb, snapshot) engine = AnswerEngine(settings, llm, kb, snapshot)

View File

@ -0,0 +1,810 @@
"""Targeted quality-gate coverage for runtime and answerer orchestration."""
from __future__ import annotations
import asyncio
import json
from dataclasses import replace
from pathlib import Path
from types import SimpleNamespace
from typing import Any
import httpx
import pytest
from atlasbot.api.http import Api, AnswerRequest
from atlasbot.config import MatrixBotConfig
from atlasbot.engine.answerer import (
AnswerEngine,
AnswerResult,
AnswerScores,
ClaimItem,
EvidenceItem,
ModePlan,
)
from atlasbot.engine.answerer.common import _mode_plan
from atlasbot.engine.answerer.engine import AnswerEngine as EngineClass
from atlasbot.engine.answerer.workflow import run_answer
from atlasbot.engine.answerer.workflow_post import finalize_answer
from atlasbot.knowledge.loader import KnowledgeBase
from atlasbot.llm.client import LLMClient, LLMError, parse_json
from atlasbot.main import result_scores
from atlasbot.matrix.bot import MatrixBot, MatrixClient
from atlasbot.queue.nats import QueueManager
from atlasbot.snapshot.builder import SnapshotProvider, build_summary
from testing.fakes import build_test_settings
from tests.test_support_modules import _rich_snapshot
class StaticSnapshot:
    """Serve one pre-built snapshot payload to answer-engine tests."""

    def __init__(self, payload: dict[str, Any]) -> None:
        # Stored as-is; get() hands back the same object, not a copy.
        self._payload = payload

    def get(self) -> dict[str, Any]:
        """Return the stored snapshot payload unchanged."""
        return self._payload
class PromptLLM:
    """Map prompt fragments to canned responses for workflow tests.

    Why:
    - lets workflow tests script an entire multi-step answer run without a
      real model, while still failing loudly on any unscripted prompt.
    Input/Output:
    - same `chat()` signature as the real client; records `(model, prompt)`
      pairs in `self.calls`; raises AssertionError for unhandled prompts.
    """
    def __init__(self) -> None:
        # One (model, prompt) tuple per chat() call, in call order.
        self.calls: list[tuple[str, str]] = []
    async def chat(
        self,
        messages: list[dict[str, str]],
        *,
        model: str | None = None,
        timeout_sec: float | None = None,
    ) -> str:
        """Return the scripted response for the latest user prompt."""
        del timeout_sec
        system = messages[0]["content"]
        prompt = messages[-1]["content"]
        self.calls.append((model or "", prompt))
        # Chunk-relevance scoring is synthesized from the prompt itself so
        # any chunk list can be scored deterministically: chunks mentioning
        # cpu/synapse score 95, the rest 80.
        if "Given chunk summaries, score relevance" in prompt:
            items = []
            for line in prompt.splitlines():
                if line.startswith("- c"):
                    chunk_id = line.split()[1].rstrip(":")
                    score = 95 if "cpu" in line.lower() or "synapse" in line.lower() else 80
                    items.append({"id": chunk_id, "score": score, "reason": "relevant"})
            return json.dumps(items or [{"id": "c0", "score": 90, "reason": "relevant"}])
        # Direct (plain-string) families are tried before the JSON lookup
        # table; an unmatched prompt is a test bug, hence the hard failure.
        direct = self._direct_response(prompt)
        if direct is not None:
            return direct
        response = self._lookup_response(system, prompt)
        if response is not None:
            return response
        raise AssertionError(f"Unhandled prompt:\nSYSTEM={system}\nPROMPT={prompt}")
    def _direct_response(self, prompt: str) -> str | None:
        """Return direct string responses for a few prompt families."""
        if "Answer the sub-question using the context" in prompt:
            return "The best runbook path is runbooks/fix.md." if "runbook" in prompt.lower() else "synapse is hottest with cpu 95 on titan-01."
        # First match wins; "Draft:" only fires for the facts-bearing prompt
        # variant (guarded below), other markers match unconditionally.
        markers = [
            ("Write a final response to the user", "titan-99 is hottest and the runbook is runbooks/wrong.md."),
            ("Draft:", "synapse is hottest at cpu 95 on titan-01, and amd64 nodes remain separate from raspberry hardware."),
            ("Return JSON with fields: issues", '{"issues":["mention the exact runbook"],"missing_data":[],"risky_claims":[]}'),
            ("command (string), rationale", '{"command":"kubectl top pods -n synapse","rationale":"verify namespace cpu"}'),
            ("confidence (0-100)", '{"confidence":88,"relevance":91,"satisfaction":86,"hallucination_risk":"low"}'),
        ]
        for marker, response in markers:
            if marker in prompt:
                if marker == "Draft:" and "If Facts are provided" not in prompt:
                    continue
                return response
        return None
    def _lookup_response(self, system: str, prompt: str) -> str | None:
        """Return canned responses for prompt markers."""
        del system
        # Ordered, first-match-wins table: more specific markers must stay
        # ahead of substrings they contain, so keep the ordering intact when
        # adding entries.
        markers = [
            (
                "normalized (string), keywords",
                '{"normalized":"Which namespace is hottest on raspberry hardware and which runbook should I use?","keywords":["namespace","hottest","cpu","raspberry","runbook"]}',
            ),
            (
                "needs_snapshot (bool)",
                '{"needs_snapshot":true,"needs_kb":true,"needs_tool":true,"answer_style":"insightful","follow_up":false,"question_type":"open_ended","focus_entity":"namespace","focus_metric":"cpu"}',
            ),
            (
                "Generate up to",
                '[{"id":"q1","question":"Which namespace is hottest?","priority":5,"kind":"metric"},{"id":"q2","question":"Which runbook applies?","priority":4,"kind":"context"}]',
            ),
            ("Choose the run that best aligns", '{"selected_index": 1}'),
            ("AvailableKeys:", '{"keys":["namespace_cpu_top","namespace_pods","hardware_nodes"]}'),
            ("Return JSON with field: missing", '{"missing":[]}'),
            ("Return JSON with fields: prefixes", '{"prefixes":["namespace","hottest"]}'),
            ("fact_types", '{"fact_types":["namespace_cpu_top","hardware_nodes"]}'),
            ("Return JSON with field: signals", '{"signals":["cpu","synapse","raspberry"]}'),
            (
                "Signals:",
                '{"lines":["namespace_cpu_top: synapse=95","hardware_nodes: rpi5=(titan-01) | amd64=(titan-02)"]}',
            ),
            (
                "Return JSON with field: lines",
                '{"lines":["namespace_cpu_top: synapse=95","hardware_nodes: rpi5=(titan-01) | amd64=(titan-02)"]}',
            ),
            (
                "CandidateFacts:",
                '{"lines":["namespace_cpu_top: synapse=95","hardware_nodes: rpi5=(titan-01) | amd64=(titan-02)"]}',
            ),
            (
                "FactCandidates:",
                '{"lines":["namespace_cpu_top: synapse=95","hardware_nodes: rpi5=(titan-01) | amd64=(titan-02)"]}',
            ),
            (
                "Suggest a safe, read-only command",
                '{"command":"kubectl top pods -n synapse","rationale":"verify namespace cpu"}',
            ),
            ("Pick the best candidate for accuracy and grounding", '{"best": 1}'),
            ("Pick the best draft for accuracy", '{"best": 1}'),
            ("Pick the best runbook path", '{"path":"runbooks/fix.md"}'),
            ("Check the draft against the context", "synapse is hottest on titan-01, but see runbooks/wrong.md."),
            ("Answer using the fact", "Latest metrics: namespace_cpu_top: synapse=95."),
            ("Rewrite the draft to only include claims supported by FactsUsed", "synapse is hottest on titan-01."),
            ("Check if an open-ended answer includes at least two concrete signals", '{"ok": false, "reason": "needs more detail"}'),
            ("ok (bool), reason (string)", '{"ok": false, "reason": "needs more detail"}'),
            ("Rewrite the answer using the critique", "synapse is hottest at cpu 95 on titan-01. Use runbooks/fix.md."),
            ("Return JSON with field: note", '{"note":"The answer would benefit from per-pod CPU samples."}'),
            ("Score response quality", '{"confidence":88,"relevance":91,"satisfaction":86,"hallucination_risk":"low"}'),
            (
                "Return JSON with fields: confidence (0-100), relevance (0-100), satisfaction (0-100), hallucination_risk (low|medium|high).",
                '{"confidence":88,"relevance":91,"satisfaction":86,"hallucination_risk":"low"}',
            ),
            (
                "claims list",
                '{"claims":[{"id":"c1","claim":"synapse is hottest","evidence":[{"path":"hottest.cpu.node","reason":"snapshot"}]}]}',
            ),
            ("Select the claims most relevant", '{"claim_ids":["c1"]}'),
            ("Follow-up:", "titan-99 is still hottest."),
            ("Rewrite the answer to be concise and directly answer the question", "Latest metrics: namespace_cpu_top: synapse=95."),
            ("Deduplicate repeated statements", "Latest metrics: namespace_cpu_top: synapse=95."),
            ("Answer using only the Fact Sheet", "Fact sheet answer: namespace_cpu_top: synapse=95. Use runbooks/fix.md."),
        ]
        for marker, response in markers:
            if marker in prompt:
                return response
        return None
class TimeoutLLM:
"""Raise a timeout as soon as the workflow makes an LLM call."""
async def chat(
self,
messages: list[dict[str, str]],
*,
model: str | None = None,
timeout_sec: float | None = None,
) -> str:
"""Trigger the workflow timeout handling branch."""
del messages, model, timeout_sec
raise TimeoutError("boom")
class LimitLLM(PromptLLM):
    """Reuse prompt handling while allowing the workflow to hit call caps.

    No behavior is overridden; the separate type exists so call-cap tests
    can use the scripted PromptLLM responses under a distinct name.
    """
def _settings(tmp_path: Path, **overrides: Any):
    """Build settings with an isolated claim-store path.

    Passing `state_db_path` in `overrides` raises (duplicate keyword), which
    keeps the per-test isolation guarantee explicit.
    """
    db_path = str(tmp_path / "state.db")
    base = build_test_settings()
    return replace(base, state_db_path=db_path, **overrides)
def _make_engine(tmp_path: Path, llm: Any, **setting_overrides: Any) -> AnswerEngine:
    """Construct a real engine with static snapshot and KB doubles."""
    settings = _settings(tmp_path, **setting_overrides)

    def fake_summary() -> str:
        return "KB summary."

    def fake_runbook_titles(limit=5) -> str:
        return "Relevant runbooks:\n- Fix (runbooks/fix.md)"

    def fake_runbook_paths(limit=10) -> list[str]:
        return ["runbooks/fix.md"]

    def fake_chunk_lines(max_files=20, max_chars=6000) -> list[str]:
        return [
            "runbooks/fix.md",
            "namespace_cpu_top: synapse=95",
            "hardware_nodes: rpi5=(titan-01) | amd64=(titan-02)",
        ]

    kb = KnowledgeBase("")
    kb.summary = fake_summary  # type: ignore[method-assign]
    kb.runbook_titles = fake_runbook_titles  # type: ignore[method-assign]
    kb.runbook_paths = fake_runbook_paths  # type: ignore[method-assign]
    kb.chunk_lines = fake_chunk_lines  # type: ignore[method-assign]
    return AnswerEngine(settings, llm, kb, StaticSnapshot(_rich_snapshot()))  # type: ignore[arg-type]
def test_engine_helper_methods_cover_state_and_followup(tmp_path: Path) -> None:
    """Cover answer-engine helper branches outside the main workflow."""
    settings = _settings(tmp_path)
    # Minimal LLM double for the stock-answer path only.
    class StockLLM:
        async def chat(self, messages, *, model=None, timeout_sec=None):
            del messages, model, timeout_sec
            return "stock reply"
    engine = EngineClass(settings, StockLLM(), KnowledgeBase(""), StaticSnapshot(_rich_snapshot()))
    # Tag-routed call_llm double: "synth" answers depend on the draft index,
    # everything else comes from the static table; unknown tags fail loudly.
    async def call_llm(_system: str, _prompt: str, *, context: str | None = None, model: str | None = None, tag: str = "") -> str:
        del _system, context, model
        static = {
            "draft_select": '{"best": 2}',
            "score": '{"confidence":90,"relevance":91,"satisfaction":92,"hallucination_risk":"low"}',
            "claim_map": '{"claims":[{"id":"c1","claim":"cpu is high","evidence":[{"path":"hottest.cpu.node","reason":"why"},{"path":"","reason":"skip"}]},"bad"]}',
            "select_claims": '{"claim_ids":["c1"]}',
            "followup": "titan-99 is hottest. The draft is correct.",
            "followup_fix": "titan-01 is hottest.",
            "dedup_followup": "The draft is correct. titan-01 is hottest.",
            "dedup": "deduped",
        }
        if tag == "synth":
            return "draft one" if "DraftIndex: 1" in _prompt else "draft two"
        if tag in static:
            return static[tag]
        raise AssertionError(tag)
    # Stock path: reply comes straight from the LLM double.
    stock = asyncio.run(engine._answer_stock("hello"))
    assert stock.reply == "stock reply"
    plan = replace(_mode_plan(settings, "smart"), drafts=2, parallelism=2)
    # Synthesis: draft_select picks index 2 ("draft two"), both with
    # pre-supplied drafts and with an empty draft list.
    synth = asyncio.run(
        engine._synthesize_answer(
            "Which node is hottest?",
            ["draft one", "draft two"],
            "ctx",
            {"question_type": "metric", "answer_style": "direct"},
            plan,
            call_llm,
        )
    )
    synth_empty = asyncio.run(
        engine._synthesize_answer(
            "Which node is hottest?",
            [],
            "ctx",
            {"question_type": "metric", "answer_style": "direct"},
            replace(plan, drafts=1, parallelism=1),
            call_llm,
        )
    )
    assert synth == "draft two"
    assert synth_empty == "draft two"
    # Scoring: LLM-backed when use_scores is on, fixed 60 confidence when off.
    scored = asyncio.run(engine._score_answer("q", "a", plan, call_llm))
    assert scored.hallucination_risk == "low"
    assert asyncio.run(engine._score_answer("q", "a", replace(plan, use_scores=False), call_llm)).confidence == 60
    summary = build_summary(_rich_snapshot())
    # Claim extraction: malformed entries ("bad", empty evidence path) are
    # dropped; an empty reply yields no claims at all.
    claims = asyncio.run(engine._extract_claims("q", "a", summary, ["fact"], call_llm))
    assert claims and claims[0].evidence[0].path == "hottest.cpu.node"
    assert asyncio.run(engine._extract_claims("q", "", summary, [], call_llm)) == []
    # Dedup only triggers when the reply repeats itself.
    assert asyncio.run(engine._dedup_reply("one. one. one.", plan, call_llm, "dedup")) == "deduped"
    assert asyncio.run(engine._dedup_reply("single answer", plan, call_llm, "dedup")) == "single answer"
    # Conversation state round-trip plus cleanup; None conversation id has no state.
    engine._store_state("conv-1", claims, summary, _rich_snapshot(), True)
    state = engine._get_state("conv-1")
    assert state and state.snapshot
    assert engine._get_state(None) is None
    engine._cleanup_state()
    # Follow-up path: the scripted fix rewrites titan-99 back to titan-01.
    followup = asyncio.run(
        engine._answer_followup(
            "Which hardware hotspot is there?",
            state,
            summary,
            {"question_type": "diagnostic"},
            plan,
            call_llm,
        )
    )
    assert "titan-01" in followup
    assert asyncio.run(engine._select_claims("what about that?", claims, plan, call_llm)) == ["c1"]
    assert asyncio.run(engine._select_claims("what about that?", [], plan, call_llm)) == []
def test_finalize_answer_covers_post_processing_branches(tmp_path: Path) -> None:
    """Exercise evidence-fix, runbook, guard, critic, and gap paths.

    Drives ``finalize_answer`` with a stub engine whose synthesized draft
    cites a node (``titan-99``) and a runbook (``runbooks/wrong.md``) that
    are absent from the allowed lists, forcing each post-processing repair
    stage to fire in turn.
    """
    settings = _settings(tmp_path)
    # Enable the optional critic and gap stages on top of the smart-mode plan.
    plan = replace(_mode_plan(settings, "smart"), use_gap=True, use_critic=True)
    summary = build_summary(_rich_snapshot())
    summary_lines = [
        "namespace_cpu_top: synapse=95",
        "hardware_nodes: rpi5=(titan-01) | amd64=(titan-02)",
        "runbooks/fix.md",
    ]
    # Collects (stage, note) pairs emitted through the observer callback.
    observed: list[tuple[str, str]] = []
    async def call_llm(_system: str, _prompt: str, *, context: str | None = None, model: str | None = None, tag: str = "") -> str:
        # Canned per-tag responses; an unexpected tag fails loudly with the
        # offending prompt so a new pipeline stage cannot pass unnoticed.
        del _system, context, model
        responses = {
            "runbook_select": '{"path":"runbooks/fix.md"}',
            # evidence_fix deliberately repeats the bad node/runbook so the
            # enforce/guard stages must keep repairing downstream.
            "evidence_fix": "titan-99 is hottest and see runbooks/wrong.md.",
            "evidence_fix_enforce": "titan-99 is hottest and see runbooks/wrong.md.",
            "metric_direct": "no numbers here",
            "runbook_enforce": "Non-Raspberry Pi nodes: amd64 (titan-02). Use runbooks/fix.md.",
            "evidence_guard": "Non-Raspberry Pi nodes: amd64 (titan-02). Use runbooks/fix.md.",
            "focus_fix": "Latest metrics: namespace_cpu_top: synapse=95.",
            # insight_guard rejects the draft, triggering insight_fix.
            "insight_guard": '{"ok": false, "reason": "needs more detail"}',
            "insight_fix": "Latest metrics: namespace_cpu_top: synapse=95. Use runbooks/fix.md.",
            # critic reports an issue, triggering the revise pass.
            "critic": '{"issues":["too vague"]}',
            "revise": "Latest metrics: namespace_cpu_top: synapse=95. Use runbooks/fix.md.",
            "gap": '{"note":"The answer would benefit from per-pod CPU samples."}',
        }
        if tag not in responses:
            raise AssertionError(_prompt)
        return responses[tag]
    class FinalizeEngine:
        # Minimal engine stub exposing only the callbacks finalize_answer uses.
        async def _synthesize_answer(self, *args: Any) -> str:
            # Deliberately wrong draft so the evidence-fix path must repair it.
            return "titan-99 is hottest and see runbooks/wrong.md."
        async def _dedup_reply(self, reply: str, _plan: ModePlan, _call_llm, tag: str) -> str:
            # Pass-through dedup; just verify the expected tag is used.
            assert tag == "dedup"
            return reply
        async def _score_answer(self, _question: str, _reply: str, _plan: ModePlan, _call_llm) -> AnswerScores:
            return AnswerScores(80, 81, 82, "low")
        async def _extract_claims(self, _question: str, _reply: str, _summary: dict[str, Any], _facts_used: list[str], _call_llm) -> list[ClaimItem]:
            return [ClaimItem(id="c1", claim="cpu high", evidence=[EvidenceItem(path="hottest.cpu.node", reason="snapshot")])]
    reply, scores, claims = asyncio.run(
        finalize_answer(
            engine=FinalizeEngine(),
            call_llm=call_llm,
            normalized="Which namespace is hottest on raspberry hardware and which runbook should I use?",
            subanswers=["synapse is hottest"],
            context="ctx",
            classify={"question_type": "open_ended", "answer_style": "direct"},
            plan=plan,
            summary=summary,
            summary_lines=summary_lines,
            metric_facts=["namespace_cpu_top: synapse=95"],
            key_facts=["namespace_cpu_top: synapse=95"],
            facts_used=["hardware_nodes: rpi5=(titan-01) | amd64=(titan-02)"],
            allowed_nodes=["titan-01", "titan-02"],
            allowed_namespaces=["synapse"],
            runbook_paths=["runbooks/fix.md"],
            lowered_question="which namespace is hottest on raspberry hardware and which runbook should i use?",
            force_metric=True,
            keyword_tokens=["namespace", "cpu", "raspberry"],
            question_tokens=["namespace", "cpu", "raspberry"],
            snapshot_context="ClusterSnapshot:\nnamespace_cpu_top: synapse=95",
            observer=lambda stage, note: observed.append((stage, note)),
            mode="smart",
            metric_keys=["namespace_cpu_top"],
        )
    )
    # The final reply must carry the repaired runbook path and the metric.
    assert "runbooks/fix.md" in reply
    assert "synapse=95" in reply
    assert scores.confidence == 80
    assert claims and claims[0].id == "c1"
    # Each post-processing branch reported progress through the observer.
    assert ("evidence_fix", "repairing missing evidence") in observed
    assert ("critic", "reviewing") in observed
    assert ("gap", "checking gaps") in observed
def test_run_answer_deep_workflow_persists_state(tmp_path: Path) -> None:
    """Drive the full smart workflow through retrieval, synthesis, and post-processing."""
    engine = _make_engine(tmp_path, PromptLLM())
    progress: list[tuple[str, str]] = []

    def record(stage: str, note: str) -> None:
        # Capture every (stage, note) pair the workflow reports.
        progress.append((stage, note))

    question = "Run limitless Which namespace is hottest on raspberry hardware and which runbook should I use?"
    result = asyncio.run(
        run_answer(
            engine,
            question,
            mode="smart",
            history=[{"q": "before", "a": "earlier"}],
            observer=record,
            conversation_id="room-1",
            snapshot_pin=True,
        )
    )
    # The reply must cite the runbook and surface the kubectl tool hint.
    assert "runbooks/fix.md" in result.reply
    assert result.meta["tool_hint"]["command"] == "kubectl top pods -n synapse"
    # Conversation state (claims + pinned snapshot) is persisted for follow-ups.
    state = engine._get_state("room-1")
    assert state and state.claims and state.snapshot
    # Every major pipeline stage reported progress via the observer.
    seen_stages = {stage for stage, _note in progress}
    assert {"normalize", "route", "retrieve", "tool", "subanswers", "synthesize"} <= seen_stages
def test_run_answer_followup_and_limits(tmp_path: Path) -> None:
    """Cover follow-up routing, reasoning limit, and timeout fallbacks."""

    class FollowupLLM(PromptLLM):
        # Ordered (needle, response) table; first matching needle wins, which
        # preserves the original if-chain's precedence.
        _CANNED = (
            ("normalized (string), keywords", '{"normalized":"What about that?","keywords":["that"]}'),
            ("needs_snapshot (bool)", '{"needs_snapshot":true,"needs_kb":false,"needs_tool":false,"answer_style":"direct","follow_up":false,"question_type":"open_ended","focus_entity":"unknown","focus_metric":"unknown"}'),
            ("Select the claims most relevant", '{"claim_ids":["c1"]}'),
            ("Follow-up:", "titan-99 is still hottest."),
        )

        def _lookup_response(self, system: str, prompt: str) -> str | None:
            for needle, canned in self._CANNED:
                if needle in prompt:
                    return canned
            return super()._lookup_response(system, prompt)

    engine = _make_engine(tmp_path, FollowupLLM())
    summary = build_summary(_rich_snapshot())
    # Seed conversation state so the follow-up path has claims to pull from.
    stored_claims = [ClaimItem(id="c1", claim="synapse is hottest", evidence=[EvidenceItem(path="hottest.cpu.node", reason="snapshot", value_at_claim="titan-01")])]
    engine._store_state("conv-1", stored_claims, summary, _rich_snapshot(), True)
    followup_result = asyncio.run(
        run_answer(
            engine,
            "Run limitless What about that?",
            mode="smart",
            conversation_id="conv-1",
            snapshot_pin=True,
        )
    )
    # The stored evidence value (titan-01) must surface in the follow-up reply.
    assert "titan-01" in followup_result.reply

    # A tight LLM-call budget trips the reasoning limit fallback.
    limit_engine = _make_engine(
        tmp_path / "limit",
        LimitLLM(),
        fast_llm_calls_max=1,
        llm_limit_multiplier=1.0,
    )
    limited = asyncio.run(run_answer(limit_engine, "tell me about cpu and runbooks", mode="custom"))
    assert "reasoning limit" in limited.reply
    assert limited.meta["llm_limit_hit"] is True

    # A tiny time budget with a slow LLM trips the timeout fallback.
    timeout_engine = _make_engine(
        tmp_path / "timeout",
        TimeoutLLM(),
        smart_time_budget_sec=0.1,
        ollama_timeout_sec=0.1,
    )
    timed_out = asyncio.run(run_answer(timeout_engine, "Run limitless tell me about cpu and runbooks", mode="smart"))
    assert "time budget" in timed_out.reply.lower()
    assert timed_out.meta["time_budget_hit"] is True
def test_api_matrix_queue_main_and_store_edge_paths(monkeypatch: pytest.MonkeyPatch, tmp_path: Path) -> None:
    """Exercise remaining API, Matrix, queue, main, and store branches.

    One broad integration-style test covering: HTTP API validation, Matrix
    login/sync/heartbeat flows, the NATS-backed queue manager (direct and
    queued modes, bad payloads, handler failures), and score parsing.
    """
    settings = _settings(
        tmp_path,
        internal_token="secret",
        queue_enabled=True,
        matrix_bots=(MatrixBotConfig("bot", "pw", ("atlas",), "quick"),),
    )
    async def handler(
        question: str,
        mode: str,
        history: list[dict[str, str]] | None,
        conversation_id: str | None,
        snapshot_pin: bool | None,
    ) -> AnswerResult:
        # Echo handler: reply is "<question>:<mode>" so routing is observable.
        del history, conversation_id, snapshot_pin
        return AnswerResult(question + ":" + mode, AnswerScores(1, 2, 3, "low"), {"mode": mode})
    api = Api(settings, handler)
    from fastapi.testclient import TestClient
    client = TestClient(api.app)
    # Empty body, valid content, and blank question all hit validation paths.
    assert client.post("/v1/answer", headers={"X-Internal-Token": "secret"}, json={}).status_code == 400
    assert client.post("/v1/answer", headers={"X-Internal-Token": "secret"}, json={"content": "hi"}).json()["reply"] == "hi:quick"
    assert client.post("/v1/answer", headers={"X-Internal-Token": "secret"}, json={"question": " "}).status_code == 400
    # The request model must not strip surrounding whitespace itself.
    assert AnswerRequest(message=" hello ").message == " hello "
    class FakeResp:
        # Minimal httpx-response stand-in for the Matrix client.
        def __init__(self, payload: dict[str, Any], *, status_code: int = 200) -> None:
            self._payload = payload
            self.status_code = status_code
        def raise_for_status(self) -> None:
            if self.status_code >= 400:
                raise httpx.HTTPStatusError("bad", request=httpx.Request("GET", "http://x"), response=httpx.Response(self.status_code))
        def json(self) -> dict[str, Any]:
            return self._payload
    class MatrixAsyncClient:
        # Fake async HTTP client: login succeeds, room directory 404s,
        # everything else syncs with an empty join map.
        async def __aenter__(self) -> "MatrixAsyncClient":
            return self
        async def __aexit__(self, *exc: object) -> None:
            return None
        async def post(self, url: str, json: dict[str, Any] | None = None, headers: dict[str, str] | None = None) -> FakeResp:
            del json, headers
            if "login" in url:
                return FakeResp({"access_token": "tok"})
            return FakeResp({})
        async def get(self, url: str, headers: dict[str, str] | None = None, params: dict[str, Any] | None = None) -> FakeResp:
            del headers, params
            if "directory/room" in url:
                return FakeResp({}, status_code=404)
            return FakeResp({"next_batch": "n1", "rooms": {"join": {}}})
    monkeypatch.setattr("atlasbot.matrix.bot.httpx.AsyncClient", lambda timeout=None: MatrixAsyncClient())
    matrix_client = MatrixClient(settings, settings.matrix_bots[0])
    assert asyncio.run(matrix_client.login()) == "tok"
    # A 404 from the room directory resolves to an empty room id.
    assert asyncio.run(matrix_client.resolve_room("tok")) == ""
    bot = MatrixBot(settings, settings.matrix_bots[0], SimpleNamespace(answer=None), handler)
    class BotClient:
        # Scripted Matrix client: first sync delivers one addressed message
        # (plus events the bot must ignore), second sync aborts the loop.
        def __init__(self) -> None:
            self.sent: list[str] = []
            self.sync_calls = 0
        async def login(self) -> str:
            return "tok"
        async def resolve_room(self, token: str) -> str:
            del token
            return "!room"
        async def join_room(self, token: str, room_id: str) -> None:
            del token, room_id
        async def send_message(self, token: str, room_id: str, text: str) -> None:
            del token, room_id
            self.sent.append(text)
        async def sync(self, token: str, since: str | None) -> dict[str, Any]:
            del token, since
            self.sync_calls += 1
            if self.sync_calls == 1:
                return {
                    "next_batch": "n1",
                    "rooms": {
                        "join": {
                            "!room": {
                                "timeline": {
                                    "events": [
                                        {"type": "m.room.member", "sender": "user"},
                                        {"type": "m.room.message", "sender": "bot", "content": {"body": "ignore"}},
                                        {"type": "m.room.message", "sender": "user", "content": {"body": "atlas quick hi"}},
                                    ]
                                }
                            }
                        }
                    },
                }
            raise RuntimeError("stop")
    bot._client = BotClient()
    async def run_bot_once() -> None:
        # Let the bot process one sync cycle, then cancel its run loop.
        task = asyncio.create_task(bot.run())
        await asyncio.sleep(0.01)
        task.cancel()
        with pytest.raises(asyncio.CancelledError):
            await task
    asyncio.run(run_bot_once())
    # The bot should have posted a "Thinking" heartbeat while answering.
    assert any("Thinking" in msg for msg in bot._client.sent)
    # Heartbeat path: a handler slower than the quick budget triggers the
    # time-budget message instead of an answer.
    timeout_bot = MatrixBot(replace(settings, thinking_interval_sec=0.001, quick_time_budget_sec=0.01), settings.matrix_bots[0], SimpleNamespace(answer=None), None)
    timeout_bot._client = SimpleNamespace(
        sent=[],
        send_message=lambda token, room_id, text: asyncio.sleep(0, result=timeout_bot._client.sent.append(text)),
    )
    async def sleepy_handler(question: str, mode: str, history, conversation_id, observer):
        del question, mode, history, conversation_id, observer
        await asyncio.sleep(1.2)
        return AnswerResult("late", AnswerScores(1, 2, 3, "low"), {})
    timeout_bot._answer_handler = sleepy_handler
    asyncio.run(timeout_bot._answer_with_heartbeat("tok", "!room", "q", "quick"))
    assert any("time budget" in msg for msg in timeout_bot._client.sent)
    # A handler that raises produces the internal-error message.
    error_bot = MatrixBot(replace(settings, thinking_interval_sec=0.001), settings.matrix_bots[0], SimpleNamespace(answer=None), None)
    error_bot._client = SimpleNamespace(
        sent=[],
        send_message=lambda token, room_id, text: asyncio.sleep(0, result=error_bot._client.sent.append(text)),
    )
    async def failing_handler(question: str, mode: str, history, conversation_id, observer):
        del question, mode, history, conversation_id, observer
        raise RuntimeError("boom")
    error_bot._answer_handler = failing_handler
    asyncio.run(error_bot._answer_with_heartbeat("tok", "!room", "q", "smart"))
    assert any("internal error" in msg for msg in error_bot._client.sent)
    class DirectQueue:
        # Handler used when queueing is disabled: answers inline.
        async def __call__(self, payload: dict[str, Any]) -> dict[str, Any]:
            return {"reply": payload["question"]}
    direct_qm = QueueManager(replace(settings, queue_enabled=False), DirectQueue())
    assert asyncio.run(direct_qm.submit({"question": "direct"})) == {"reply": "direct"}
    class FakeSub:
        # Reply-inbox subscription that immediately yields a queued answer.
        async def next_msg(self, timeout: float) -> Any:
            del timeout
            return SimpleNamespace(data=json.dumps({"reply": "queued"}).encode())
        async def unsubscribe(self) -> None:
            return None
    class FakeMsg:
        # Inbound queue message recording whether it was acked.
        def __init__(self, raw: bytes, reply: str = "reply") -> None:
            self.data = raw
            self.reply = reply
            self.acked = False
        async def ack(self) -> None:
            self.acked = True
    published: list[tuple[str, bytes]] = []
    class ExistingStreamJS:
        # JetStream stub whose stream already exists (stream_info succeeds).
        async def stream_info(self, stream: str) -> None:
            assert stream == settings.nats_stream
        async def publish(self, subject: str, data: bytes) -> None:
            published.append((subject, data))
        async def pull_subscribe(self, subject: str, durable: str):
            del subject, durable
            class Pull:
                # First fetch fails (retry path), second cancels the worker.
                def __init__(self) -> None:
                    self.calls = 0
                async def fetch(self, count: int, timeout: float) -> list[FakeMsg]:
                    del count, timeout
                    self.calls += 1
                    if self.calls == 1:
                        raise RuntimeError("retry")
                    raise asyncio.CancelledError
            return Pull()
    class FakeNats:
        def __init__(self) -> None:
            self.drained = False
        async def connect(self, url: str) -> None:
            assert url == settings.nats_url
        def jetstream(self) -> ExistingStreamJS:
            return ExistingStreamJS()
        def new_inbox(self) -> str:
            return "inbox"
        async def subscribe(self, reply: str) -> FakeSub:
            assert reply == "inbox"
            return FakeSub()
        async def publish(self, reply: str, data: bytes) -> None:
            published.append((reply, data))
        async def drain(self) -> None:
            self.drained = True
    monkeypatch.setattr("atlasbot.queue.nats.NATS", FakeNats)
    queue = QueueManager(settings, DirectQueue())
    asyncio.run(queue.start())
    assert asyncio.run(queue.submit({"question": "queued", "mode": "smart"})) == {"reply": "queued"}
    # Malformed JSON payloads must still be acked (poison-message handling).
    invalid_msg = FakeMsg(b"not-json")
    asyncio.run(queue._handle_message(invalid_msg))
    assert invalid_msg.acked is True
    handled_msg = FakeMsg(json.dumps({"payload": {"question": "x"}, "reply": "reply"}).encode())
    asyncio.run(queue._handle_message(handled_msg))
    assert handled_msg.acked is True
    failing_queue = QueueManager(settings, lambda payload: (_ for _ in ()).throw(RuntimeError("boom")))
    failing_queue._nc = FakeNats()
    failing_queue._js = ExistingStreamJS()
    failure_msg = FakeMsg(json.dumps({"payload": {"question": "x"}}).encode())
    # NOTE: intentionally shadows the earlier failing_handler (different
    # signature); the matrix bot already holds its own reference.
    async def failing_handler(payload: dict[str, Any]) -> dict[str, Any]:
        del payload
        raise RuntimeError("boom")
    failing_queue._handler = failing_handler
    asyncio.run(failing_queue._handle_message(failure_msg))
    # Even a crashing handler must ack so the message is not redelivered forever.
    assert failure_msg.acked is True
    asyncio.run(queue.stop())
    # Score parsing: numeric strings coerce; junk falls back to defaults.
    assert result_scores({"scores": {"confidence": "9", "relevance": "8", "satisfaction": "7", "hallucination_risk": "low"}}).confidence == 9
    assert result_scores({"scores": "bad"}).confidence == 60
def test_kb_llm_snapshot_and_json_edge_paths(monkeypatch: pytest.MonkeyPatch, tmp_path: Path) -> None:
    """Cover remaining KB, LLM, snapshot, and JSON parsing branches.

    Builds a tiny on-disk knowledge base, then fakes httpx to walk the LLM
    client's response-shape fallbacks, retry exhaustion, and the snapshot
    provider's fetch/cache paths.
    """
    # --- KnowledgeBase: catalog parsing, chunking limits, empty base ---
    base = tmp_path / "kb"
    catalog = base / "catalog"
    catalog.mkdir(parents=True)
    # "sources": ["bad"] and the path-less runbook entry exercise skip branches.
    (catalog / "atlas.json").write_text(json.dumps({"cluster": "atlas", "sources": ["bad"]}), encoding="utf-8")
    (catalog / "runbooks.json").write_text(json.dumps([{"title": "Fix", "path": "runbooks/fix.md"}, {"title": "No path"}]), encoding="utf-8")
    (base / "docs.md").write_text("x" * 120, encoding="utf-8")
    kb = KnowledgeBase(str(base))
    assert kb.runbook_titles(limit=1).count("runbooks/fix.md") == 1
    assert kb.chunk_lines(max_files=1, max_chars=60)
    # A line longer than the remaining budget reports that the limit was hit.
    assert kb._extend_with_limit([], ["abcdef"], 3) is False
    empty_kb = KnowledgeBase("")
    assert empty_kb.chunk_lines() == []
    # --- LLMClient: endpoint, auth header, JSON fence parsing ---
    settings = _settings(tmp_path, ollama_url="http://example/api/chat", ollama_api_key="secret", ollama_retries=0, ollama_fallback_model="")
    client = LLMClient(settings)
    assert client._endpoint() == "http://example/api/chat"
    assert client._headers["x-api-key"] == "secret"
    assert parse_json("```{\"ok\": true}```") == {"ok": True}
    assert parse_json("not-json", fallback={"fallback": True}) == {"fallback": True}
    class FakeResponse:
        def __init__(self, status_code: int, payload: Any) -> None:
            self.status_code = status_code
            self._payload = payload
        def raise_for_status(self) -> None:
            if self.status_code >= 400:
                raise httpx.HTTPStatusError("bad", request=httpx.Request("POST", "http://example"), response=httpx.Response(self.status_code))
        def json(self) -> Any:
            return self._payload
    # Three response shapes, consumed in order by successive chat() calls:
    # "response" key, "reply" fallback key, then an empty message -> error.
    responses = iter([FakeResponse(200, {"response": "plain"}), FakeResponse(200, {"reply": "fallback"}), FakeResponse(200, {"message": {}})])
    class FakeAsyncClient:
        def __init__(self, timeout: float | None = None) -> None:
            self.timeout = timeout
        async def __aenter__(self) -> "FakeAsyncClient":
            return self
        async def __aexit__(self, *exc: object) -> None:
            return None
        async def post(self, _url: str, *, json: dict[str, Any], headers: dict[str, str]) -> FakeResponse:
            del _url, json, headers
            item = next(responses)
            # Exception items are raised rather than returned (used by subclasses).
            if isinstance(item, Exception):
                raise item
            return item
    monkeypatch.setattr(httpx, "AsyncClient", FakeAsyncClient)
    assert asyncio.run(client.chat([{"role": "user", "content": "a"}], timeout_sec=1.0)) == "plain"
    assert asyncio.run(client.chat([{"role": "user", "content": "b"}], timeout_sec=1.0)) == "fallback"
    with pytest.raises(LLMError, match="empty response"):
        asyncio.run(client.chat([{"role": "user", "content": "c"}], timeout_sec=1.0))
    # --- retry exhaustion: one retry, two connect errors -> LLMError ---
    error_settings = replace(settings, ollama_retries=1)
    error_client = LLMClient(error_settings)
    error_responses = iter([httpx.ConnectError("nope"), httpx.ConnectError("still nope")])
    class ErrorAsyncClient(FakeAsyncClient):
        async def post(self, _url: str, *, json: dict[str, Any], headers: dict[str, str]) -> FakeResponse:
            del _url, json, headers
            raise next(error_responses)
    monkeypatch.setattr(httpx, "AsyncClient", ErrorAsyncClient)
    with pytest.raises(LLMError):
        asyncio.run(error_client.chat([{"role": "user", "content": "d"}], timeout_sec=1.0))
    # --- SnapshotProvider: remote fetch, then cache hit within the TTL ---
    provider = SnapshotProvider(replace(settings, ariadne_state_url="http://snapshot", ariadne_state_token="tok"))
    class SnapshotResp:
        def raise_for_status(self) -> None:
            return None
        def json(self) -> dict[str, Any]:
            return {"snapshot_id": "snap-1"}
    monkeypatch.setattr("atlasbot.snapshot.builder.httpx.get", lambda url, headers, timeout: SnapshotResp())
    assert provider.get() == {"snapshot_id": "snap-1"}
    # A fresh cache timestamp (1s old) must short-circuit the HTTP fetch.
    provider._cache = {"snapshot_id": "cached"}
    provider._cache_ts = 10_000.0
    monkeypatch.setattr("atlasbot.snapshot.builder.time.monotonic", lambda: 10_001.0)
    assert provider.get() == {"snapshot_id": "cached"}

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff