diff --git a/services/comms/atlasbot-deployment.yaml b/services/comms/atlasbot-deployment.yaml index b3e617d..fd2f399 100644 --- a/services/comms/atlasbot-deployment.yaml +++ b/services/comms/atlasbot-deployment.yaml @@ -16,7 +16,7 @@ spec: labels: app: atlasbot annotations: - checksum/atlasbot-configmap: manual-atlasbot-35 + checksum/atlasbot-configmap: manual-atlasbot-36 vault.hashicorp.com/agent-inject: "true" vault.hashicorp.com/role: "comms" vault.hashicorp.com/agent-inject-secret-turn-secret: "kv/data/atlas/comms/turn-shared-secret" diff --git a/services/comms/scripts/atlasbot/bot.py b/services/comms/scripts/atlasbot/bot.py index c790f5c..0330620 100644 --- a/services/comms/scripts/atlasbot/bot.py +++ b/services/comms/scripts/atlasbot/bot.py @@ -95,6 +95,8 @@ METRIC_HINT_WORDS = { "pending", "unreachable", "latency", + "pod", + "pods", } CODE_FENCE_RE = re.compile(r"^```(?:json)?\s*(.*?)\s*```$", re.DOTALL) @@ -116,6 +118,7 @@ METRIC_HINTS = { "net": ("net", "network", "bandwidth", "throughput"), "io": ("io", "disk", "storage"), "connections": ("connections", "conn", "postgres", "database", "db"), + "pods": ("pods", "pod"), } _OLLAMA_LOCK = threading.Lock() @@ -488,13 +491,15 @@ def _metric_expr_uses_percent(entry: dict[str, Any]) -> bool: return "* 100" in expr or "*100" in expr -def _format_metric_value(value: str, *, percent: bool) -> str: +def _format_metric_value(value: str, *, percent: bool, rate: bool = False) -> str: try: num = float(value) except (TypeError, ValueError): return value if percent: return f"{num:.1f}%" + if rate: + return _humanize_rate(value, unit="rate") if abs(num) >= 1: return f"{num:.2f}".rstrip("0").rstrip(".") return f"{num:.4f}".rstrip("0").rstrip(".") @@ -779,6 +784,11 @@ def facts_context( lines: list[str] = ["Facts (live snapshot):"] if total is not None: lines.append(f"- nodes_total={total}, ready={ready}, not_ready={not_ready}") + if isinstance(summary, dict): + by_arch_counts = summary.get("by_arch") + if isinstance(by_arch_counts, dict) and by_arch_counts: + parts = [f"{arch}={count}" for arch, count in sorted(by_arch_counts.items())] + lines.append(f"- nodes_by_arch: {', '.join(parts)}") if not_ready_names: lines.append(f"- nodes_not_ready: {', '.join(not_ready_names)}") for key in ("rpi5", "rpi4", "jetson", "amd64", "arm64-unknown", "unknown"): @@ -799,7 +809,7 @@ def facts_context( lines.append(f"- workers_ready: {', '.join(ready_workers) if ready_workers else 'none'}") if not_ready_workers: lines.append(f"- workers_not_ready: {', '.join(not_ready_workers)}") - if expected_workers: + if expected_workers and any(word in normalize_query(prompt) for word in ("missing", "expected", "should", "not ready", "unready")): missing = sorted( set(expected_workers) - {n.get("name") for n in inv if isinstance(n, dict) and n.get("name")} @@ -814,7 +824,11 @@ def facts_context( node = entry.get("node") value = entry.get("value") if node and value is not None: - value_fmt = _format_metric_value(str(value), percent=key in ("cpu", "ram")) + value_fmt = _format_metric_value( + str(value), + percent=key in ("cpu", "ram"), + rate=key in ("net", "io"), + ) lines.append(f"- hottest_{key}: {node} ({value_fmt})") postgres = metrics.get("postgres_connections") if isinstance(metrics.get("postgres_connections"), dict) else {} @@ -829,6 +843,11 @@ def facts_context( f"- postgres_hottest_db: {hottest_db.get('label')} ({hottest_db.get('value')})" ) + for key in ("pods_running", "pods_pending", "pods_failed", "pods_succeeded"): + value = metrics.get(key) + if value is not None: + lines.append(f"- {key}: {value}") + usage_table = _node_usage_table(metrics) if usage_table: lines.append("- node_usage (cpu/ram/net/io):") @@ -838,8 +857,16 @@ def facts_context( continue cpu = _format_metric_value(str(entry.get("cpu")), percent=True) if entry.get("cpu") is not None else "" ram = _format_metric_value(str(entry.get("ram")), percent=True) if entry.get("ram") is not None else "" - net = _format_metric_value(str(entry.get("net")), percent=False) if entry.get("net") is not None else "" - io_val = _format_metric_value(str(entry.get("io")), percent=False) if entry.get("io") is not None else "" + net = ( + _format_metric_value(str(entry.get("net")), percent=False, rate=True) + if entry.get("net") is not None + else "" + ) + io_val = ( + _format_metric_value(str(entry.get("io")), percent=False, rate=True) + if entry.get("io") is not None + else "" + ) lines.append(f" - {node}: cpu={cpu}, ram={ram}, net={net}, io={io_val}") if nodes_in_query: @@ -1029,7 +1056,7 @@ def snapshot_metric_answer( if top: node, val = top percent = metric in {"cpu", "ram"} - value = _format_metric_value(str(val), percent=percent) + value = _format_metric_value(str(val), percent=percent, rate=metric in {"net", "io"}) scope = "" if include_hw: scope = f" among {' and '.join(sorted(include_hw))}" @@ -1051,6 +1078,23 @@ def snapshot_metric_answer( if parts: return _format_confidence(" ".join(parts), "high") + if metric == "pods": + running = metrics.get("pods_running") + pending = metrics.get("pods_pending") + failed = metrics.get("pods_failed") + succeeded = metrics.get("pods_succeeded") + parts = [] + if running is not None: + parts.append(f"running {running:.0f}") + if pending is not None: + parts.append(f"pending {pending:.0f}") + if failed is not None: + parts.append(f"failed {failed:.0f}") + if succeeded is not None: + parts.append(f"succeeded {succeeded:.0f}") + if parts: + return _format_confidence(f"Pods: {', '.join(parts)}.", "high") + return "" def structured_answer(