atlasbot: enrich snapshot facts and pod metrics

Brad Stein 2026-01-27 12:53:17 -03:00
parent 41b131c347
commit b7f454b790
2 changed files with 51 additions and 7 deletions

View File

@@ -16,7 +16,7 @@ spec:
       labels:
         app: atlasbot
       annotations:
-        checksum/atlasbot-configmap: manual-atlasbot-35
+        checksum/atlasbot-configmap: manual-atlasbot-36
         vault.hashicorp.com/agent-inject: "true"
         vault.hashicorp.com/role: "comms"
         vault.hashicorp.com/agent-inject-secret-turn-secret: "kv/data/atlas/comms/turn-shared-secret"

View File

@@ -95,6 +95,8 @@ METRIC_HINT_WORDS = {
     "pending",
     "unreachable",
     "latency",
+    "pod",
+    "pods",
 }
 
 CODE_FENCE_RE = re.compile(r"^```(?:json)?\s*(.*?)\s*```$", re.DOTALL)
@@ -116,6 +118,7 @@ METRIC_HINTS = {
     "net": ("net", "network", "bandwidth", "throughput"),
     "io": ("io", "disk", "storage"),
     "connections": ("connections", "conn", "postgres", "database", "db"),
+    "pods": ("pods", "pod"),
 }
 
 _OLLAMA_LOCK = threading.Lock()
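For context, the diff only adds the "pods" hint entries; the code that consumes them is not shown here. A minimal sketch of how such hint tuples could map a normalized query onto a metric key, with the `detect_metric` helper being purely hypothetical:

```python
# Hypothetical sketch (not part of this commit): one way METRIC_HINTS tuples
# could map a normalized user query onto a metric key such as "pods".
METRIC_HINTS = {
    "net": ("net", "network", "bandwidth", "throughput"),
    "io": ("io", "disk", "storage"),
    "connections": ("connections", "conn", "postgres", "database", "db"),
    "pods": ("pods", "pod"),
}

def detect_metric(query: str) -> str | None:
    words = set(query.lower().split())
    for metric, hints in METRIC_HINTS.items():
        if words & set(hints):
            return metric
    return None

print(detect_metric("how many pods are pending"))  # -> pods
```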
@@ -488,13 +491,15 @@ def _metric_expr_uses_percent(entry: dict[str, Any]) -> bool:
     return "* 100" in expr or "*100" in expr
 
 
-def _format_metric_value(value: str, *, percent: bool) -> str:
+def _format_metric_value(value: str, *, percent: bool, rate: bool = False) -> str:
     try:
         num = float(value)
     except (TypeError, ValueError):
         return value
     if percent:
         return f"{num:.1f}%"
+    if rate:
+        return _humanize_rate(value, unit="rate")
     if abs(num) >= 1:
         return f"{num:.2f}".rstrip("0").rstrip(".")
     return f"{num:.4f}".rstrip("0").rstrip(".")
@@ -779,6 +784,11 @@ def facts_context(
     lines: list[str] = ["Facts (live snapshot):"]
     if total is not None:
         lines.append(f"- nodes_total={total}, ready={ready}, not_ready={not_ready}")
+    if isinstance(summary, dict):
+        by_arch_counts = summary.get("by_arch")
+        if isinstance(by_arch_counts, dict) and by_arch_counts:
+            parts = [f"{arch}={count}" for arch, count in sorted(by_arch_counts.items())]
+            lines.append(f"- nodes_by_arch: {', '.join(parts)}")
     if not_ready_names:
         lines.append(f"- nodes_not_ready: {', '.join(not_ready_names)}")
     for key in ("rpi5", "rpi4", "jetson", "amd64", "arm64-unknown", "unknown"):
@@ -799,7 +809,7 @@ def facts_context(
         lines.append(f"- workers_ready: {', '.join(ready_workers) if ready_workers else 'none'}")
     if not_ready_workers:
         lines.append(f"- workers_not_ready: {', '.join(not_ready_workers)}")
-    if expected_workers:
+    if expected_workers and any(word in normalize_query(prompt) for word in ("missing", "expected", "should", "not ready", "unready")):
         missing = sorted(
             set(expected_workers)
             - {n.get("name") for n in inv if isinstance(n, dict) and n.get("name")}
@@ -814,7 +824,11 @@ def facts_context(
         node = entry.get("node")
         value = entry.get("value")
         if node and value is not None:
-            value_fmt = _format_metric_value(str(value), percent=key in ("cpu", "ram"))
+            value_fmt = _format_metric_value(
+                str(value),
+                percent=key in ("cpu", "ram"),
+                rate=key in ("net", "io"),
+            )
             lines.append(f"- hottest_{key}: {node} ({value_fmt})")
 
     postgres = metrics.get("postgres_connections") if isinstance(metrics.get("postgres_connections"), dict) else {}
@@ -829,6 +843,11 @@ def facts_context(
                 f"- postgres_hottest_db: {hottest_db.get('label')} ({hottest_db.get('value')})"
             )
 
+    for key in ("pods_running", "pods_pending", "pods_failed", "pods_succeeded"):
+        value = metrics.get(key)
+        if value is not None:
+            lines.append(f"- {key}: {value}")
+
     usage_table = _node_usage_table(metrics)
     if usage_table:
         lines.append("- node_usage (cpu/ram/net/io):")
@ -838,8 +857,16 @@ def facts_context(
continue continue
cpu = _format_metric_value(str(entry.get("cpu")), percent=True) if entry.get("cpu") is not None else "" cpu = _format_metric_value(str(entry.get("cpu")), percent=True) if entry.get("cpu") is not None else ""
ram = _format_metric_value(str(entry.get("ram")), percent=True) if entry.get("ram") is not None else "" ram = _format_metric_value(str(entry.get("ram")), percent=True) if entry.get("ram") is not None else ""
net = _format_metric_value(str(entry.get("net")), percent=False) if entry.get("net") is not None else "" net = (
io_val = _format_metric_value(str(entry.get("io")), percent=False) if entry.get("io") is not None else "" _format_metric_value(str(entry.get("net")), percent=False, rate=True)
if entry.get("net") is not None
else ""
)
io_val = (
_format_metric_value(str(entry.get("io")), percent=False, rate=True)
if entry.get("io") is not None
else ""
)
lines.append(f" - {node}: cpu={cpu}, ram={ram}, net={net}, io={io_val}") lines.append(f" - {node}: cpu={cpu}, ram={ram}, net={net}, io={io_val}")
if nodes_in_query: if nodes_in_query:
@@ -1029,7 +1056,7 @@ def snapshot_metric_answer(
     if top:
         node, val = top
         percent = metric in {"cpu", "ram"}
-        value = _format_metric_value(str(val), percent=percent)
+        value = _format_metric_value(str(val), percent=percent, rate=metric in {"net", "io"})
         scope = ""
         if include_hw:
             scope = f" among {' and '.join(sorted(include_hw))}"
@@ -1051,6 +1078,23 @@ def snapshot_metric_answer(
         if parts:
             return _format_confidence(" ".join(parts), "high")
 
+    if metric == "pods":
+        running = metrics.get("pods_running")
+        pending = metrics.get("pods_pending")
+        failed = metrics.get("pods_failed")
+        succeeded = metrics.get("pods_succeeded")
+        parts = []
+        if running is not None:
+            parts.append(f"running {running:.0f}")
+        if pending is not None:
+            parts.append(f"pending {pending:.0f}")
+        if failed is not None:
+            parts.append(f"failed {failed:.0f}")
+        if succeeded is not None:
+            parts.append(f"succeeded {succeeded:.0f}")
+        if parts:
+            return _format_confidence(f"Pods: {', '.join(parts)}.", "high")
     return ""
 
 
 def structured_answer(
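To show the shape of the answer the new "pods" branch assembles, here is a small standalone rendering of the same string. The counts are made up, and `_format_confidence` (which wraps the final text) is not reproduced here:

```python
# Made-up snapshot values; illustrates the sentence the new "pods" branch builds
# before it is wrapped by _format_confidence (wrapper not shown in this diff).
metrics = {"pods_running": 87.0, "pods_pending": 2.0, "pods_failed": 1.0, "pods_succeeded": 14.0}

parts = []
for key, label in (
    ("pods_running", "running"),
    ("pods_pending", "pending"),
    ("pods_failed", "failed"),
    ("pods_succeeded", "succeeded"),
):
    value = metrics.get(key)
    if value is not None:
        parts.append(f"{label} {value:.0f}")

print(f"Pods: {', '.join(parts)}.")
# -> Pods: running 87, pending 2, failed 1, succeeded 14.
```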