atlasbot: infer worker expected count from metrics
This commit is contained in:
parent
dfa13e22cc
commit
0d5e19e11a
@ -16,7 +16,7 @@ spec:
|
|||||||
labels:
|
labels:
|
||||||
app: atlasbot
|
app: atlasbot
|
||||||
annotations:
|
annotations:
|
||||||
checksum/atlasbot-configmap: manual-atlasbot-16
|
checksum/atlasbot-configmap: manual-atlasbot-17
|
||||||
vault.hashicorp.com/agent-inject: "true"
|
vault.hashicorp.com/agent-inject: "true"
|
||||||
vault.hashicorp.com/role: "comms"
|
vault.hashicorp.com/role: "comms"
|
||||||
vault.hashicorp.com/agent-inject-secret-turn-secret: "kv/data/atlas/comms/turn-shared-secret"
|
vault.hashicorp.com/agent-inject-secret-turn-secret: "kv/data/atlas/comms/turn-shared-secret"
|
||||||
|
|||||||
@ -191,6 +191,7 @@ _NODE_CLASS_AMD64: set[str] = set()
|
|||||||
_NODE_CLASS_JETSON: set[str] = set()
|
_NODE_CLASS_JETSON: set[str] = set()
|
||||||
_NODE_CLASS_EXTERNAL: set[str] = set()
|
_NODE_CLASS_EXTERNAL: set[str] = set()
|
||||||
_NODE_CLASS_NON_RPI: set[str] = set()
|
_NODE_CLASS_NON_RPI: set[str] = set()
|
||||||
|
# Captures the quoted value of a `node=~"..."` regex label matcher inside a
# metric expression; group 1 is the raw text between the quotes (for example a
# "|"-separated alternation of node names).
NODE_REGEX = re.compile(r'node=~"([^"]+)"')
|
||||||
|
|
||||||
def _load_json_file(path: str) -> Any | None:
|
def _load_json_file(path: str) -> Any | None:
|
||||||
try:
|
try:
|
||||||
@ -735,6 +736,23 @@ def expected_nodes_from_kb() -> set[str]:
|
|||||||
nodes = set().union(*_NODE_CLASS_INDEX.values())
|
nodes = set().union(*_NODE_CLASS_INDEX.values())
|
||||||
return {n for n in nodes if n and n not in _NODE_CLASS_EXTERNAL}
|
return {n for n in nodes if n and n not in _NODE_CLASS_EXTERNAL}
|
||||||
|
|
||||||
|
def expected_worker_nodes_from_metrics() -> list[str]:
    """Return the worker node names listed by the "worker nodes ready" panel.

    Walks ``_METRIC_INDEX`` looking for entries whose ``panel_title`` contains
    "worker nodes ready" (case-insensitive). For each such entry, every string
    in its ``exprs`` list is searched with ``NODE_REGEX`` for a
    ``node=~"a|b|c"`` selector; the captured text is split on ``|`` to obtain
    the individual node names.

    The first selector that yields at least one non-empty name wins.

    Returns:
        The sorted, de-duplicated node names, or an empty list when no
        matching panel or usable selector is found.
    """
    for entry in _METRIC_INDEX:
        panel = (entry.get("panel_title") or "").lower()
        if "worker nodes ready" not in panel:
            continue
        exprs = entry.get("exprs")
        if not isinstance(exprs, list):
            continue
        for expr in exprs:
            if not isinstance(expr, str):
                continue
            match = NODE_REGEX.search(expr)
            if not match:
                continue
            # Use a set: callers take len() of the result as the expected
            # worker count, so a name repeated in the selector must not
            # inflate it.
            nodes = {name.strip() for name in match.group(1).split("|")}
            nodes.discard("")
            # A selector with no usable names (e.g. only separators) is not an
            # inventory; keep looking at later expressions/panels instead of
            # giving up.
            if nodes:
                return sorted(nodes)
    return []
|
||||||
|
|
||||||
def missing_nodes_answer(cluster_name: str) -> str:
|
def missing_nodes_answer(cluster_name: str) -> str:
|
||||||
expected = expected_nodes_from_kb()
|
expected = expected_nodes_from_kb()
|
||||||
if not expected:
|
if not expected:
|
||||||
@ -1098,6 +1116,8 @@ def sync_loop(token: str, room_id: str):
|
|||||||
total = len(ready_nodes) + len(not_ready_nodes)
|
total = len(ready_nodes) + len(not_ready_nodes)
|
||||||
if total:
|
if total:
|
||||||
missing_hint = missing_nodes_answer("Atlas")
|
missing_hint = missing_nodes_answer("Atlas")
|
||||||
|
expected_workers = expected_worker_nodes_from_metrics()
|
||||||
|
expected_total = len(expected_workers) if expected_workers else 0
|
||||||
if any(word in lower_body for word in ("ready", "not ready", "unready")):
|
if any(word in lower_body for word in ("ready", "not ready", "unready")):
|
||||||
if not_ready_nodes:
|
if not_ready_nodes:
|
||||||
send_msg(
|
send_msg(
|
||||||
@ -1107,7 +1127,11 @@ def sync_loop(token: str, room_id: str):
|
|||||||
)
|
)
|
||||||
else:
|
else:
|
||||||
msg = f"All {len(ready_nodes)} worker nodes are Ready."
|
msg = f"All {len(ready_nodes)} worker nodes are Ready."
|
||||||
if missing_hint and "no missing" not in missing_hint:
|
if expected_total and len(ready_nodes) != expected_total:
|
||||||
|
missing = sorted(set(expected_workers) - set(ready_nodes))
|
||||||
|
if missing:
|
||||||
|
msg += f" Missing: {', '.join(missing)}."
|
||||||
|
elif missing_hint and "no missing" not in missing_hint:
|
||||||
msg += f" {missing_hint}"
|
msg += f" {missing_hint}"
|
||||||
send_msg(token, rid, msg)
|
send_msg(token, rid, msg)
|
||||||
continue
|
continue
|
||||||
@ -1116,7 +1140,12 @@ def sync_loop(token: str, room_id: str):
|
|||||||
f"Atlas has {total} worker nodes; "
|
f"Atlas has {total} worker nodes; "
|
||||||
f"{len(ready_nodes)} Ready, {len(not_ready_nodes)} NotReady."
|
f"{len(ready_nodes)} Ready, {len(not_ready_nodes)} NotReady."
|
||||||
)
|
)
|
||||||
if missing_hint and "no missing" not in missing_hint:
|
if expected_total:
|
||||||
|
msg += f" Grafana inventory expects {expected_total} workers."
|
||||||
|
missing = sorted(set(expected_workers) - set(ready_nodes))
|
||||||
|
if missing:
|
||||||
|
msg += f" Missing: {', '.join(missing)}."
|
||||||
|
elif missing_hint and "no missing" not in missing_hint:
|
||||||
msg += f" {missing_hint}"
|
msg += f" {missing_hint}"
|
||||||
elif "should" in lower_body:
|
elif "should" in lower_body:
|
||||||
msg += " I don’t have an expected worker inventory in the KB; this is the current cluster state."
|
msg += " I don’t have an expected worker inventory in the KB; this is the current cluster state."
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user