atlasbot: infer worker expected count from metrics
This commit is contained in:
parent
dfa13e22cc
commit
0d5e19e11a
@ -16,7 +16,7 @@ spec:
|
||||
labels:
|
||||
app: atlasbot
|
||||
annotations:
|
||||
checksum/atlasbot-configmap: manual-atlasbot-16
|
||||
checksum/atlasbot-configmap: manual-atlasbot-17
|
||||
vault.hashicorp.com/agent-inject: "true"
|
||||
vault.hashicorp.com/role: "comms"
|
||||
vault.hashicorp.com/agent-inject-secret-turn-secret: "kv/data/atlas/comms/turn-shared-secret"
|
||||
|
||||
@ -191,6 +191,7 @@ _NODE_CLASS_AMD64: set[str] = set()
|
||||
_NODE_CLASS_JETSON: set[str] = set()
|
||||
_NODE_CLASS_EXTERNAL: set[str] = set()
|
||||
_NODE_CLASS_NON_RPI: set[str] = set()
|
||||
# Finds a `node=~"..."` label matcher inside an expression string; group 1 is
# the text between the double quotes (one or more non-quote characters).
NODE_REGEX = re.compile(r'node=~"([^"]+)"')
|
||||
|
||||
def _load_json_file(path: str) -> Any | None:
|
||||
try:
|
||||
@ -735,6 +736,23 @@ def expected_nodes_from_kb() -> set[str]:
|
||||
nodes = set().union(*_NODE_CLASS_INDEX.values())
|
||||
return {n for n in nodes if n and n not in _NODE_CLASS_EXTERNAL}
|
||||
|
||||
def expected_worker_nodes_from_metrics() -> list[str]:
    """Return the worker node names listed in the "worker nodes ready" panel.

    Walks ``_METRIC_INDEX`` looking for entries whose ``panel_title``
    contains "worker nodes ready" (case-insensitive). Within such an entry,
    each string in ``exprs`` is searched with ``NODE_REGEX`` for a
    ``node=~"a|b|c"`` matcher, and the captured text is split on ``|``.

    Returns:
        The sorted, de-duplicated node names from the first matcher that
        yields at least one non-blank name, or an empty list when no entry
        provides one.
    """
    for entry in _METRIC_INDEX:
        panel = (entry.get("panel_title") or "").lower()
        if "worker nodes ready" not in panel:
            continue
        exprs = entry.get("exprs")
        if not isinstance(exprs, list):
            continue
        for expr in exprs:
            if not isinstance(expr, str):
                continue
            match = NODE_REGEX.search(expr)
            if not match:
                continue
            # A set drops repeated names so callers that take len() of the
            # result get the number of distinct nodes.
            nodes = {name.strip() for name in match.group(1).split("|")}
            nodes.discard("")
            # A matcher made only of separators/whitespace carries no names;
            # keep scanning instead of reporting "no expected workers".
            if nodes:
                return sorted(nodes)
    return []
|
||||
|
||||
def missing_nodes_answer(cluster_name: str) -> str:
|
||||
expected = expected_nodes_from_kb()
|
||||
if not expected:
|
||||
@ -1098,6 +1116,8 @@ def sync_loop(token: str, room_id: str):
|
||||
total = len(ready_nodes) + len(not_ready_nodes)
|
||||
if total:
|
||||
missing_hint = missing_nodes_answer("Atlas")
|
||||
expected_workers = expected_worker_nodes_from_metrics()
|
||||
expected_total = len(expected_workers) if expected_workers else 0
|
||||
if any(word in lower_body for word in ("ready", "not ready", "unready")):
|
||||
if not_ready_nodes:
|
||||
send_msg(
|
||||
@ -1107,7 +1127,11 @@ def sync_loop(token: str, room_id: str):
|
||||
)
|
||||
else:
|
||||
msg = f"All {len(ready_nodes)} worker nodes are Ready."
|
||||
if missing_hint and "no missing" not in missing_hint:
|
||||
if expected_total and len(ready_nodes) != expected_total:
|
||||
missing = sorted(set(expected_workers) - set(ready_nodes))
|
||||
if missing:
|
||||
msg += f" Missing: {', '.join(missing)}."
|
||||
elif missing_hint and "no missing" not in missing_hint:
|
||||
msg += f" {missing_hint}"
|
||||
send_msg(token, rid, msg)
|
||||
continue
|
||||
@ -1116,7 +1140,12 @@ def sync_loop(token: str, room_id: str):
|
||||
f"Atlas has {total} worker nodes; "
|
||||
f"{len(ready_nodes)} Ready, {len(not_ready_nodes)} NotReady."
|
||||
)
|
||||
if missing_hint and "no missing" not in missing_hint:
|
||||
if expected_total:
|
||||
msg += f" Grafana inventory expects {expected_total} workers."
|
||||
missing = sorted(set(expected_workers) - set(ready_nodes))
|
||||
if missing:
|
||||
msg += f" Missing: {', '.join(missing)}."
|
||||
elif missing_hint and "no missing" not in missing_hint:
|
||||
msg += f" {missing_hint}"
|
||||
elif "should" in lower_body:
|
||||
msg += " I don’t have an expected worker inventory in the KB; this is the current cluster state."
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user