atlasbot: infer worker expected count from metrics

This commit is contained in:
Brad Stein 2026-01-26 18:50:23 -03:00
parent dfa13e22cc
commit 0d5e19e11a
2 changed files with 32 additions and 3 deletions

View File

@ -16,7 +16,7 @@ spec:
labels:
app: atlasbot
annotations:
checksum/atlasbot-configmap: manual-atlasbot-16
checksum/atlasbot-configmap: manual-atlasbot-17
vault.hashicorp.com/agent-inject: "true"
vault.hashicorp.com/role: "comms"
vault.hashicorp.com/agent-inject-secret-turn-secret: "kv/data/atlas/comms/turn-shared-secret"

View File

@ -191,6 +191,7 @@ _NODE_CLASS_AMD64: set[str] = set()
_NODE_CLASS_JETSON: set[str] = set()
_NODE_CLASS_EXTERNAL: set[str] = set()
_NODE_CLASS_NON_RPI: set[str] = set()
# Captures the quoted pattern of a `node=~"..."` regex label matcher, i.e.
# everything between the double quotes. Presumably applied to PromQL
# expressions from dashboard panels -- confirm against the metric index source.
NODE_REGEX = re.compile(r'node=~"([^"]+)"')
def _load_json_file(path: str) -> Any | None:
try:
@ -735,6 +736,23 @@ def expected_nodes_from_kb() -> set[str]:
nodes = set().union(*_NODE_CLASS_INDEX.values())
return {n for n in nodes if n and n not in _NODE_CLASS_EXTERNAL}
def expected_worker_nodes_from_metrics() -> list[str]:
    """Return the worker node names listed by the "worker nodes ready" panel.

    Walks ``_METRIC_INDEX`` looking for entries whose ``panel_title``
    contains "worker nodes ready" (case-insensitive). For each such entry,
    the string items of its ``exprs`` list are searched with ``NODE_REGEX``;
    the captured text is split on ``|`` and each piece is stripped.

    Returns:
        The sorted, de-duplicated node names from the first expression that
        yields at least one name, or an empty list when none does.
    """
    for entry in _METRIC_INDEX:
        # Tolerate malformed index entries the same way non-string
        # expressions are tolerated below.
        if not isinstance(entry, dict):
            continue
        title = (entry.get("panel_title") or "").lower()
        if "worker nodes ready" not in title:
            continue
        exprs = entry.get("exprs")
        if not isinstance(exprs, list):
            continue
        for expr in exprs:
            if not isinstance(expr, str):
                continue
            match = NODE_REGEX.search(expr)
            if match is None:
                continue
            # A set collapses repeated names so callers that use len() of
            # the result as the expected worker count are not inflated.
            names = {part.strip() for part in match.group(1).split("|")}
            names.discard("")
            # An alternation with no usable names (e.g. "|") is not an
            # answer; keep looking at the remaining expressions/panels.
            if names:
                return sorted(names)
    return []
def missing_nodes_answer(cluster_name: str) -> str:
expected = expected_nodes_from_kb()
if not expected:
@ -1098,6 +1116,8 @@ def sync_loop(token: str, room_id: str):
total = len(ready_nodes) + len(not_ready_nodes)
if total:
missing_hint = missing_nodes_answer("Atlas")
expected_workers = expected_worker_nodes_from_metrics()
expected_total = len(expected_workers) if expected_workers else 0
if any(word in lower_body for word in ("ready", "not ready", "unready")):
if not_ready_nodes:
send_msg(
@ -1107,7 +1127,11 @@ def sync_loop(token: str, room_id: str):
)
else:
msg = f"All {len(ready_nodes)} worker nodes are Ready."
if missing_hint and "no missing" not in missing_hint:
if expected_total and len(ready_nodes) != expected_total:
missing = sorted(set(expected_workers) - set(ready_nodes))
if missing:
msg += f" Missing: {', '.join(missing)}."
elif missing_hint and "no missing" not in missing_hint:
msg += f" {missing_hint}"
send_msg(token, rid, msg)
continue
@ -1116,7 +1140,12 @@ def sync_loop(token: str, room_id: str):
f"Atlas has {total} worker nodes; "
f"{len(ready_nodes)} Ready, {len(not_ready_nodes)} NotReady."
)
if missing_hint and "no missing" not in missing_hint:
if expected_total:
msg += f" Grafana inventory expects {expected_total} workers."
missing = sorted(set(expected_workers) - set(ready_nodes))
if missing:
msg += f" Missing: {', '.join(missing)}."
elif missing_hint and "no missing" not in missing_hint:
msg += f" {missing_hint}"
elif "should" in lower_body:
msg += " I dont have an expected worker inventory in the KB; this is the current cluster state."