# atlasbot/atlasbot/knowledge/loader.py

import json
import logging
from pathlib import Path
from typing import Any

# Module-level logger, named after this module via __name__.
log = logging.getLogger(__name__)


class KnowledgeBase:
    def __init__(self, base_dir: str) -> None:
        self._base = Path(base_dir) if base_dir else None
        self._atlas: dict[str, Any] = {}
        self._runbooks: list[dict[str, Any]] = []
        self._loaded = False

    def load(self) -> None:
        if self._loaded or not self._base:
            return
        self._atlas = self._read_json(self._base / "catalog" / "atlas.json")
        self._runbooks = self._read_json(self._base / "catalog" / "runbooks.json") or []
        self._loaded = True

    def _read_json(self, path: Path) -> dict[str, Any] | list[dict[str, Any]]:
        if not path.exists():
            return {}
        try:
            return json.loads(path.read_text())
        except Exception as exc:
            log.warning("kb load failed", extra={"extra": {"path": str(path), "error": str(exc)}})
            return {}

    def summary(self) -> str:
        self.load()
        if not self._atlas:
            return ""
        cluster = self._atlas.get("cluster")
        sources = self._atlas.get("sources") if isinstance(self._atlas.get("sources"), list) else []
        services = [src.get("name") for src in sources if isinstance(src, dict)]
        parts: list[str] = []
        if cluster:
            parts.append(f"Cluster: {cluster}.")
        if services:
            parts.append(f"Services indexed: {len(services)}.")
        if isinstance(self._atlas, dict):
            keys = [key for key in self._atlas.keys() if key not in {"sources"}]
            if keys:
                parts.append(f"Atlas keys: {', '.join(sorted(keys)[:8])}.")
        return " ".join(parts)

    def runbook_titles(self, *, limit: int = 5) -> str:
        self.load()
        if not self._runbooks:
            return ""
        titles = []
        for entry in self._runbooks:
            if not isinstance(entry, dict):
                continue
            title = entry.get("title")
            path = entry.get("path")
            if title and path:
                titles.append(f"- {title} ({path})")
        if not titles:
            return ""
        return "Relevant runbooks:\n" + "\n".join(titles[:limit])

    def runbook_paths(self, *, limit: int = 10) -> list[str]:
        self.load()
        if not self._runbooks:
            return []
        paths: list[str] = []
        for entry in self._runbooks:
            if not isinstance(entry, dict):
                continue
            path = entry.get("path")
            if path:
                paths.append(str(path))
        return paths[:limit]

    def chunk_lines(self, *, max_files: int = 20, max_chars: int = 6000) -> list[str]:
        self.load()
        lines: list[str] = []
        if not self._base:
            return lines
        summary = self.summary()
        if summary:
            lines.append(f"KB Summary: {summary}")
        # Prefer curated catalog JSON if present.
        if self._atlas:
            try:
                atlas_json = json.dumps(self._atlas, indent=2)
                lines.append("KB: atlas.json")
                lines.extend(atlas_json.splitlines())
            except Exception:
                pass
        if self._runbooks:
            lines.append("KB: runbooks.json")
            for entry in self._runbooks:
                if not isinstance(entry, dict):
                    continue
                title = entry.get("title")
                path = entry.get("path")
                if title and path:
                    lines.append(f"- {title} ({path})")
        # Include markdown/text sources as additional chunks.
        if len(lines) >= max_chars:
            return lines
        files = sorted(self._base.rglob("*.md")) + sorted(self._base.rglob("*.txt"))
        for path in files:
            if len(lines) >= max_chars:
                break
            if len(lines) > max_files * 50:
                break
            try:
                text = path.read_text(encoding="utf-8", errors="ignore")
            except Exception:
                continue
            if not text:
                continue
            lines.append(f"KB File: {path.relative_to(self._base)}")
            lines.extend(text.splitlines())
            if sum(len(line) for line in lines) >= max_chars:
                break
        return lines