atlasbot: improve node inventory reasoning

This commit is contained in:
Brad Stein 2026-01-26 19:53:11 -03:00
parent a61091c052
commit b27c80d5c0
2 changed files with 122 additions and 34 deletions

View File

@ -16,7 +16,7 @@ spec:
labels:
app: atlasbot
annotations:
checksum/atlasbot-configmap: manual-atlasbot-21
checksum/atlasbot-configmap: manual-atlasbot-22
vault.hashicorp.com/agent-inject: "true"
vault.hashicorp.com/role: "comms"
vault.hashicorp.com/agent-inject-secret-turn-secret: "kv/data/atlas/comms/turn-shared-secret"

View File

@ -89,9 +89,17 @@ METRIC_HINT_WORDS = {
"latency",
}
CODE_FENCE_RE = re.compile(r"^```(?:json)?\\s*(.*?)\\s*```$", re.DOTALL)
TITAN_NODE_RE = re.compile(r"\\btitan-[0-9a-z]{2}\\b", re.IGNORECASE)
TITAN_RANGE_RE = re.compile(r"\\btitan-([0-9a-z]{2})/([0-9a-z]{2})\\b", re.IGNORECASE)
CODE_FENCE_RE = re.compile(r"^```(?:json)?\s*(.*?)\s*```$", re.DOTALL)
TITAN_NODE_RE = re.compile(r"\btitan-[0-9a-z]{2}\b", re.IGNORECASE)
TITAN_RANGE_RE = re.compile(r"\btitan-([0-9a-z]{2})/([0-9a-z]{2})\b", re.IGNORECASE)
# Unicode dash/hyphen code points that users may type instead of "-".
_DASH_CHARS = "\u2010\u2011\u2012\u2013\u2014\u2015\u2212\uFE63\uFF0D"

# Translation table folding every dash variant above to the ASCII hyphen.
_DASH_TRANSLATION = str.maketrans(_DASH_CHARS, "-" * len(_DASH_CHARS))


def normalize_query(text: str) -> str:
    """Canonicalize a free-form query string for keyword matching.

    Lowercases the text, folds Unicode dash variants to ``-``, and collapses
    every run of whitespace to a single space, trimming both ends. ``None``
    and empty input normalize to the empty string.
    """
    lowered = (text or "").lower()
    dashed = lowered.translate(_DASH_TRANSLATION)
    # str.split() with no argument splits on (and discards) any whitespace
    # run, so join+split collapses and trims in one pass.
    return " ".join(dashed.split())
def _tokens(text: str) -> list[str]:
toks = [t.lower() for t in TOKEN_RE.findall(text or "")]
@ -267,14 +275,15 @@ def kb_retrieve(query: str, *, limit: int = 3) -> str:
return "\n".join(parts).strip()
def _extract_titan_nodes(text: str) -> list[str]:
names = {n.lower() for n in TITAN_NODE_RE.findall(text or "") if n}
for match in re.finditer(r"titan-([0-9a-z]{2}(?:[/,][0-9a-z]{2})+)", text or "", re.IGNORECASE):
cleaned = normalize_query(text)
names = {n.lower() for n in TITAN_NODE_RE.findall(cleaned) if n}
for match in re.finditer(r"titan-([0-9a-z]{2}(?:[/,][0-9a-z]{2})+)", cleaned, re.IGNORECASE):
tail = match.group(1)
for part in re.split(r"[/,]", tail):
part = part.strip()
if part:
names.add(f"titan-{part.lower()}")
for match in TITAN_RANGE_RE.finditer(text or ""):
for match in TITAN_RANGE_RE.finditer(cleaned):
left, right = match.groups()
if left:
names.add(f"titan-{left.lower()}")
@ -323,6 +332,7 @@ def node_inventory_live() -> list[dict[str, Any]]:
"arch": labels.get("kubernetes.io/arch") or labels.get("beta.kubernetes.io/arch") or "",
"hardware": _hardware_class(labels),
"roles": _node_roles(labels),
"is_worker": _node_is_worker(node),
"ready": _node_ready_status(node),
}
)
@ -335,7 +345,7 @@ def _group_nodes(inventory: list[dict[str, Any]]) -> dict[str, list[str]]:
return {k: sorted(v) for k, v in grouped.items()}
def node_inventory_context(query: str, inventory: list[dict[str, Any]] | None = None) -> str:
q = (query or "").lower()
q = normalize_query(query)
if not any(word in q for word in ("node", "nodes", "raspberry", "rpi", "jetson", "amd64", "hardware", "cluster")):
return ""
if inventory is None:
@ -372,7 +382,7 @@ def node_inventory_context(query: str, inventory: list[dict[str, Any]] | None =
return "\n".join(lines)
def node_inventory_for_prompt(prompt: str) -> list[dict[str, Any]]:
q = (prompt or "").lower()
q = normalize_query(prompt)
if any(word in q for word in ("node", "nodes", "raspberry", "rpi", "jetson", "amd64", "hardware", "cluster", "worker")):
return node_inventory_live()
return []
@ -382,10 +392,14 @@ def _inventory_sets(inventory: list[dict[str, Any]]) -> dict[str, Any]:
ready = [node["name"] for node in inventory if node.get("ready") is True]
not_ready = [node["name"] for node in inventory if node.get("ready") is False]
groups = _group_nodes(inventory)
workers = [node for node in inventory if "worker" in (node.get("roles") or [])]
workers = [node for node in inventory if node.get("is_worker") is True]
worker_names = [node["name"] for node in workers]
worker_ready = [node["name"] for node in workers if node.get("ready") is True]
worker_not_ready = [node["name"] for node in workers if node.get("ready") is False]
expected_workers = expected_worker_nodes_from_metrics()
expected_ready = [n for n in expected_workers if n in ready] if expected_workers else []
expected_not_ready = [n for n in expected_workers if n in not_ready] if expected_workers else []
expected_missing = [n for n in expected_workers if n not in names] if expected_workers else []
return {
"names": sorted(names),
"ready": sorted(ready),
@ -394,10 +408,14 @@ def _inventory_sets(inventory: list[dict[str, Any]]) -> dict[str, Any]:
"worker_names": sorted(worker_names),
"worker_ready": sorted(worker_ready),
"worker_not_ready": sorted(worker_not_ready),
"expected_workers": expected_workers,
"expected_ready": sorted(expected_ready),
"expected_not_ready": sorted(expected_not_ready),
"expected_missing": sorted(expected_missing),
}
def structured_answer(prompt: str, *, inventory: list[dict[str, Any]], metrics_summary: str) -> str:
q = (prompt or "").lower()
q = normalize_query(prompt)
if metrics_summary and any(word in q for word in ("postgres", "connection", "connections", "db")):
return metrics_summary
@ -412,29 +430,75 @@ def structured_answer(prompt: str, *, inventory: list[dict[str, Any]], metrics_s
worker_names = sets["worker_names"]
worker_ready = sets["worker_ready"]
worker_not_ready = sets["worker_not_ready"]
expected_workers = sets["expected_workers"]
expected_ready = sets["expected_ready"]
expected_not_ready = sets["expected_not_ready"]
expected_missing = sets["expected_missing"]
total = len(names)
nodes_in_query = _extract_titan_nodes(q)
rpi_nodes = set(groups.get("rpi4", [])) | set(groups.get("rpi5", []))
non_rpi = set(groups.get("jetson", [])) | set(groups.get("amd64", []))
unknown_hw = set(groups.get("arm64-unknown", [])) | set(groups.get("unknown", []))
for node in _extract_titan_nodes(q):
if node and ("is" in q or "part of" in q or "in atlas" in q or "in cluster" in q):
if nodes_in_query and ("raspberry" in q or "rpi" in q):
parts: list[str] = []
for node in nodes_in_query:
if node in rpi_nodes:
parts.append(f"{node} is a Raspberry Pi node.")
elif node in non_rpi:
parts.append(f"{node} is not a Raspberry Pi node.")
elif node in names:
parts.append(f"{node} is in Atlas but hardware is unknown.")
else:
parts.append(f"{node} is not in the Atlas cluster.")
return " ".join(parts)
if nodes_in_query and "jetson" in q:
jets = set(groups.get("jetson", []))
parts = []
for node in nodes_in_query:
if node in jets:
parts.append(f"{node} is a Jetson node.")
elif node in names:
parts.append(f"{node} is not a Jetson node.")
else:
parts.append(f"{node} is not in the Atlas cluster.")
return " ".join(parts)
if nodes_in_query and ("is" in q or "part of" in q or "in atlas" in q or "in cluster" in q or "present" in q or "exist" in q):
parts: list[str] = []
for node in nodes_in_query:
if node in names:
return f"Yes. {node} is in the Atlas cluster."
return f"No. {node} is not in the Atlas cluster."
parts.append(f"Yes. {node} is in the Atlas cluster.")
else:
parts.append(f"No. {node} is not in the Atlas cluster.")
return " ".join(parts)
if "non-raspberry" in q or "non raspberry" in q or "not raspberry" in q:
non_rpi = sorted(set(groups.get("jetson", [])) | set(groups.get("amd64", [])))
if "besides" in q:
amd = groups.get("amd64", [])
if any(term in q for term in ("non-raspberry", "non raspberry", "not raspberry", "non-rpi", "non rpi")):
non_rpi_sorted = sorted(non_rpi)
if any(word in q for word in ("how many", "count", "number")):
return f"Atlas has {len(non_rpi_sorted)} nonRaspberry Pi nodes."
if any(phrase in q for phrase in ("besides jetson", "excluding jetson", "without jetson", "non jetson")):
amd = sorted(groups.get("amd64", []))
return f"NonRaspberry Pi nodes (excluding Jetson): {', '.join(amd)}." if amd else "No nonRaspberry Pi nodes outside Jetson."
return f"NonRaspberry Pi nodes: {', '.join(non_rpi)}." if non_rpi else "No nonRaspberry Pi nodes found."
return f"NonRaspberry Pi nodes: {', '.join(non_rpi_sorted)}." if non_rpi_sorted else "No nonRaspberry Pi nodes found."
if "jetson" in q:
jets = groups.get("jetson", [])
if any(word in q for word in ("how many", "count", "number")):
return f"Atlas has {len(jets)} Jetson nodes."
return f"Jetson nodes: {', '.join(jets)}." if jets else "No Jetson nodes found."
if "amd64" in q or "x86" in q:
amd = groups.get("amd64", [])
if any(word in q for word in ("how many", "count", "number")):
return f"Atlas has {len(amd)} amd64 nodes."
return f"amd64 nodes: {', '.join(amd)}." if amd else "No amd64 nodes found."
if "arm64" in q and "node" in q and any(word in q for word in ("how many", "count", "number")):
count = sum(1 for node in inventory if node.get("arch") == "arm64")
return f"Atlas has {count} arm64 nodes."
if "rpi4" in q:
rpi4 = groups.get("rpi4", [])
if any(word in q for word in ("how many", "count", "number")):
@ -448,28 +512,52 @@ def structured_answer(prompt: str, *, inventory: list[dict[str, Any]], metrics_s
return f"rpi5 nodes: {', '.join(rpi5)}." if rpi5 else "No rpi5 nodes found."
if "raspberry" in q or "rpi" in q:
rpi = sorted(set(groups.get("rpi4", [])) | set(groups.get("rpi5", [])))
rpi = sorted(rpi_nodes)
if any(word in q for word in ("how many", "count", "number")):
return f"Atlas has {len(rpi)} Raspberry Pi nodes."
return f"Raspberry Pi nodes: {', '.join(rpi)}." if rpi else "No Raspberry Pi nodes found."
if "arm64-unknown" in q or "unknown" in q or "no hardware" in q:
unknown = sorted(set(groups.get("arm64-unknown", [])) | set(groups.get("unknown", [])))
unknown = sorted(unknown_hw)
return f"Unknown hardware nodes: {', '.join(unknown)}." if unknown else "No unknown hardware labels."
if "worker" in q and "node" in q:
if any(word in q for word in ("missing", "expected", "should")):
expected_workers = expected_worker_nodes_from_metrics()
missing = sorted(set(expected_workers) - set(worker_ready + worker_not_ready)) if expected_workers else []
if "missing" in q and missing:
return "Missing worker nodes: " + ", ".join(missing) + "."
if expected_workers:
msg = f"Grafana inventory expects {len(expected_workers)} workers."
if missing:
msg += f" Missing: {', '.join(missing)}."
if ("notready" in q or "not ready" in q or "unready" in q) and ("node" in q or "nodes" in q):
return "Not ready nodes: " + (", ".join(not_ready) if not_ready else "none") + "."
if "worker" in q and ("node" in q or "nodes" in q or "workers" in q):
not_ready_query = "not ready" in q or "unready" in q or "down" in q or ("not" in q and "ready" in q)
if expected_workers:
if "missing" in q:
return "Missing worker nodes: " + (", ".join(expected_missing) if expected_missing else "none") + "."
if "ready" in q and ("not ready" in q or "vs" in q or "versus" in q):
return (
f"Expected workers: {len(expected_ready)} ready, "
f"{len(expected_not_ready)} not ready (expected {len(expected_workers)})."
)
if any(word in q for word in ("how many", "count", "number")) and ("expect" in q or "expected" in q or "should" in q):
msg = f"Grafana inventory expects {len(expected_workers)} worker nodes."
if expected_missing:
msg += f" Missing: {', '.join(expected_missing)}."
return msg
return "No expected worker inventory found; using live cluster state."
if "not ready" in q or "unready" in q or "down" in q:
if not_ready_query:
if expected_not_ready or expected_missing:
detail = []
if expected_not_ready:
detail.append(f"Not ready: {', '.join(expected_not_ready)}")
if expected_missing:
detail.append(f"Missing: {', '.join(expected_missing)}")
return "Worker nodes needing attention. " + " ".join(detail) + "."
return "All expected worker nodes are Ready."
if any(word in q for word in ("expected", "expect", "should")):
msg = f"Grafana inventory expects {len(expected_workers)} worker nodes."
if expected_missing:
msg += f" Missing: {', '.join(expected_missing)}."
return msg
if any(word in q for word in ("how many", "count", "number")):
return f"Worker nodes: {len(expected_ready)} ready, {len(expected_not_ready)} not ready (expected {len(expected_workers)})."
if "ready" in q:
return f"Ready worker nodes ({len(expected_ready)}): {', '.join(expected_ready)}."
if not_ready_query:
return "Worker nodes not ready: " + (", ".join(worker_not_ready) if worker_not_ready else "none") + "."
if any(word in q for word in ("how many", "count", "number")):
return f"Worker nodes: {len(worker_ready)} ready, {len(worker_not_ready)} not ready."