atlasbot: enrich snapshot facts and pod metrics

Brad Stein 2026-01-27 12:53:17 -03:00
parent 41b131c347
commit b7f454b790
2 changed files with 51 additions and 7 deletions

View File

@@ -16,7 +16,7 @@ spec:
       labels:
         app: atlasbot
       annotations:
-        checksum/atlasbot-configmap: manual-atlasbot-35
+        checksum/atlasbot-configmap: manual-atlasbot-36
         vault.hashicorp.com/agent-inject: "true"
         vault.hashicorp.com/role: "comms"
         vault.hashicorp.com/agent-inject-secret-turn-secret: "kv/data/atlas/comms/turn-shared-secret"

View File

@@ -95,6 +95,8 @@ METRIC_HINT_WORDS = {
     "pending",
     "unreachable",
     "latency",
+    "pod",
+    "pods",
 }
 
 CODE_FENCE_RE = re.compile(r"^```(?:json)?\s*(.*?)\s*```$", re.DOTALL)
@@ -116,6 +118,7 @@ METRIC_HINTS = {
     "net": ("net", "network", "bandwidth", "throughput"),
     "io": ("io", "disk", "storage"),
     "connections": ("connections", "conn", "postgres", "database", "db"),
+    "pods": ("pods", "pod"),
 }
 
 _OLLAMA_LOCK = threading.Lock()
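For context, the diff only adds the "pods" hint entries; the code that consumes them is not shown here. A minimal sketch of how such hint tuples could map a normalized query onto a metric key, with the `detect_metric` helper being purely hypothetical:

```python
# Hypothetical sketch (not part of this commit): one way METRIC_HINTS tuples
# could map a normalized user query onto a metric key such as "pods".
METRIC_HINTS = {
    "net": ("net", "network", "bandwidth", "throughput"),
    "io": ("io", "disk", "storage"),
    "connections": ("connections", "conn", "postgres", "database", "db"),
    "pods": ("pods", "pod"),
}

def detect_metric(query: str) -> str | None:
    words = set(query.lower().split())
    for metric, hints in METRIC_HINTS.items():
        if words & set(hints):
            return metric
    return None

print(detect_metric("how many pods are pending"))  # -> pods
```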
@@ -488,13 +491,15 @@ def _metric_expr_uses_percent(entry: dict[str, Any]) -> bool:
     return "* 100" in expr or "*100" in expr
 
 
-def _format_metric_value(value: str, *, percent: bool) -> str:
+def _format_metric_value(value: str, *, percent: bool, rate: bool = False) -> str:
     try:
         num = float(value)
     except (TypeError, ValueError):
         return value
     if percent:
         return f"{num:.1f}%"
+    if rate:
+        return _humanize_rate(value, unit="rate")
     if abs(num) >= 1:
         return f"{num:.2f}".rstrip("0").rstrip(".")
     return f"{num:.4f}".rstrip("0").rstrip(".")
@@ -779,6 +784,11 @@ def facts_context(
     lines: list[str] = ["Facts (live snapshot):"]
     if total is not None:
         lines.append(f"- nodes_total={total}, ready={ready}, not_ready={not_ready}")
+    if isinstance(summary, dict):
+        by_arch_counts = summary.get("by_arch")
+        if isinstance(by_arch_counts, dict) and by_arch_counts:
+            parts = [f"{arch}={count}" for arch, count in sorted(by_arch_counts.items())]
+            lines.append(f"- nodes_by_arch: {', '.join(parts)}")
     if not_ready_names:
         lines.append(f"- nodes_not_ready: {', '.join(not_ready_names)}")
     for key in ("rpi5", "rpi4", "jetson", "amd64", "arm64-unknown", "unknown"):
@@ -799,7 +809,7 @@ def facts_context(
         lines.append(f"- workers_ready: {', '.join(ready_workers) if ready_workers else 'none'}")
     if not_ready_workers:
         lines.append(f"- workers_not_ready: {', '.join(not_ready_workers)}")
-    if expected_workers:
+    if expected_workers and any(word in normalize_query(prompt) for word in ("missing", "expected", "should", "not ready", "unready")):
         missing = sorted(
             set(expected_workers)
             - {n.get("name") for n in inv if isinstance(n, dict) and n.get("name")}
@@ -814,7 +824,11 @@ def facts_context(
         node = entry.get("node")
         value = entry.get("value")
         if node and value is not None:
-            value_fmt = _format_metric_value(str(value), percent=key in ("cpu", "ram"))
+            value_fmt = _format_metric_value(
+                str(value),
+                percent=key in ("cpu", "ram"),
+                rate=key in ("net", "io"),
+            )
             lines.append(f"- hottest_{key}: {node} ({value_fmt})")
 
     postgres = metrics.get("postgres_connections") if isinstance(metrics.get("postgres_connections"), dict) else {}
@@ -829,6 +843,11 @@ def facts_context(
                 f"- postgres_hottest_db: {hottest_db.get('label')} ({hottest_db.get('value')})"
             )
 
+    for key in ("pods_running", "pods_pending", "pods_failed", "pods_succeeded"):
+        value = metrics.get(key)
+        if value is not None:
+            lines.append(f"- {key}: {value}")
+
     usage_table = _node_usage_table(metrics)
     if usage_table:
         lines.append("- node_usage (cpu/ram/net/io):")
@ -838,8 +857,16 @@ def facts_context(
continue continue
cpu = _format_metric_value(str(entry.get("cpu")), percent=True) if entry.get("cpu") is not None else "" cpu = _format_metric_value(str(entry.get("cpu")), percent=True) if entry.get("cpu") is not None else ""
ram = _format_metric_value(str(entry.get("ram")), percent=True) if entry.get("ram") is not None else "" ram = _format_metric_value(str(entry.get("ram")), percent=True) if entry.get("ram") is not None else ""
net = _format_metric_value(str(entry.get("net")), percent=False) if entry.get("net") is not None else "" net = (
io_val = _format_metric_value(str(entry.get("io")), percent=False) if entry.get("io") is not None else "" _format_metric_value(str(entry.get("net")), percent=False, rate=True)
if entry.get("net") is not None
else ""
)
io_val = (
_format_metric_value(str(entry.get("io")), percent=False, rate=True)
if entry.get("io") is not None
else ""
)
lines.append(f" - {node}: cpu={cpu}, ram={ram}, net={net}, io={io_val}") lines.append(f" - {node}: cpu={cpu}, ram={ram}, net={net}, io={io_val}")
if nodes_in_query: if nodes_in_query:
@@ -1029,7 +1056,7 @@ def snapshot_metric_answer(
     if top:
         node, val = top
         percent = metric in {"cpu", "ram"}
-        value = _format_metric_value(str(val), percent=percent)
+        value = _format_metric_value(str(val), percent=percent, rate=metric in {"net", "io"})
         scope = ""
         if include_hw:
             scope = f" among {' and '.join(sorted(include_hw))}"
@@ -1051,6 +1078,23 @@ def snapshot_metric_answer(
         if parts:
             return _format_confidence(" ".join(parts), "high")
 
+    if metric == "pods":
+        running = metrics.get("pods_running")
+        pending = metrics.get("pods_pending")
+        failed = metrics.get("pods_failed")
+        succeeded = metrics.get("pods_succeeded")
+        parts = []
+        if running is not None:
+            parts.append(f"running {running:.0f}")
+        if pending is not None:
+            parts.append(f"pending {pending:.0f}")
+        if failed is not None:
+            parts.append(f"failed {failed:.0f}")
+        if succeeded is not None:
+            parts.append(f"succeeded {succeeded:.0f}")
+        if parts:
+            return _format_confidence(f"Pods: {', '.join(parts)}.", "high")
     return ""
 
 
 def structured_answer(
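To show the shape of the answer the new "pods" branch assembles, here is a small standalone rendering of the same string. The counts are made up, and `_format_confidence` (which wraps the final text) is not reproduced here:

```python
# Made-up snapshot values; illustrates the sentence the new "pods" branch builds
# before it is wrapped by _format_confidence (wrapper not shown in this diff).
metrics = {"pods_running": 87.0, "pods_pending": 2.0, "pods_failed": 1.0, "pods_succeeded": 14.0}

parts = []
for key, label in (
    ("pods_running", "running"),
    ("pods_pending", "pending"),
    ("pods_failed", "failed"),
    ("pods_succeeded", "succeeded"),
):
    value = metrics.get(key)
    if value is not None:
        parts.append(f"{label} {value:.0f}")

print(f"Pods: {', '.join(parts)}.")
# -> Pods: running 87, pending 2, failed 1, succeeded 14.
```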