From 0d5e19e11adb38fcb5a042a08306675cb4439885 Mon Sep 17 00:00:00 2001 From: Brad Stein Date: Mon, 26 Jan 2026 18:50:23 -0300 Subject: [PATCH] atlasbot: infer worker expected count from metrics --- services/comms/atlasbot-deployment.yaml | 2 +- services/comms/scripts/atlasbot/bot.py | 33 +++++++++++++++++++++++-- 2 files changed, 32 insertions(+), 3 deletions(-) diff --git a/services/comms/atlasbot-deployment.yaml b/services/comms/atlasbot-deployment.yaml index 802021f..b7843ab 100644 --- a/services/comms/atlasbot-deployment.yaml +++ b/services/comms/atlasbot-deployment.yaml @@ -16,7 +16,7 @@ spec: labels: app: atlasbot annotations: - checksum/atlasbot-configmap: manual-atlasbot-16 + checksum/atlasbot-configmap: manual-atlasbot-17 vault.hashicorp.com/agent-inject: "true" vault.hashicorp.com/role: "comms" vault.hashicorp.com/agent-inject-secret-turn-secret: "kv/data/atlas/comms/turn-shared-secret" diff --git a/services/comms/scripts/atlasbot/bot.py b/services/comms/scripts/atlasbot/bot.py index 7153723..bd40a9f 100644 --- a/services/comms/scripts/atlasbot/bot.py +++ b/services/comms/scripts/atlasbot/bot.py @@ -191,6 +191,7 @@ _NODE_CLASS_AMD64: set[str] = set() _NODE_CLASS_JETSON: set[str] = set() _NODE_CLASS_EXTERNAL: set[str] = set() _NODE_CLASS_NON_RPI: set[str] = set() +NODE_REGEX = re.compile(r'node=~"([^"]+)"') def _load_json_file(path: str) -> Any | None: try: @@ -735,6 +736,23 @@ def expected_nodes_from_kb() -> set[str]: nodes = set().union(*_NODE_CLASS_INDEX.values()) return {n for n in nodes if n and n not in _NODE_CLASS_EXTERNAL} +def expected_worker_nodes_from_metrics() -> list[str]: + for entry in _METRIC_INDEX: + panel = (entry.get("panel_title") or "").lower() + if "worker nodes ready" not in panel: + continue + exprs = entry.get("exprs") if isinstance(entry.get("exprs"), list) else [] + for expr in exprs: + if not isinstance(expr, str): + continue + match = NODE_REGEX.search(expr) + if not match: + continue + raw = match.group(1) + nodes = [n.strip() for n in raw.split("|") if n.strip()] + return sorted(nodes) + return [] + def missing_nodes_answer(cluster_name: str) -> str: expected = expected_nodes_from_kb() if not expected: @@ -1098,6 +1116,8 @@ def sync_loop(token: str, room_id: str): total = len(ready_nodes) + len(not_ready_nodes) if total: missing_hint = missing_nodes_answer("Atlas") + expected_workers = expected_worker_nodes_from_metrics() + expected_total = len(expected_workers) if expected_workers else 0 if any(word in lower_body for word in ("ready", "not ready", "unready")): if not_ready_nodes: send_msg( @@ -1107,7 +1127,11 @@ def sync_loop(token: str, room_id: str): ) else: msg = f"All {len(ready_nodes)} worker nodes are Ready." - if missing_hint and "no missing" not in missing_hint: + if expected_total and len(ready_nodes) != expected_total: + missing = sorted(set(expected_workers) - set(ready_nodes)) + if missing: + msg += f" Missing: {', '.join(missing)}." + elif missing_hint and "no missing" not in missing_hint: msg += f" {missing_hint}" send_msg(token, rid, msg) continue @@ -1116,7 +1140,12 @@ def sync_loop(token: str, room_id: str): f"Atlas has {total} worker nodes; " f"{len(ready_nodes)} Ready, {len(not_ready_nodes)} NotReady." ) - if missing_hint and "no missing" not in missing_hint: + if expected_total: + msg += f" Grafana inventory expects {expected_total} workers." + missing = sorted(set(expected_workers) - set(ready_nodes)) + if missing: + msg += f" Missing: {', '.join(missing)}." + elif missing_hint and "no missing" not in missing_hint: msg += f" {missing_hint}" elif "should" in lower_body: msg += " I don’t have an expected worker inventory in the KB; this is the current cluster state."