atlasbot: improve node inventory reasoning
This commit is contained in:
parent
a61091c052
commit
b27c80d5c0
@ -16,7 +16,7 @@ spec:
|
||||
labels:
|
||||
app: atlasbot
|
||||
annotations:
|
||||
checksum/atlasbot-configmap: manual-atlasbot-21
|
||||
checksum/atlasbot-configmap: manual-atlasbot-22
|
||||
vault.hashicorp.com/agent-inject: "true"
|
||||
vault.hashicorp.com/role: "comms"
|
||||
vault.hashicorp.com/agent-inject-secret-turn-secret: "kv/data/atlas/comms/turn-shared-secret"
|
||||
|
||||
@ -89,9 +89,17 @@ METRIC_HINT_WORDS = {
|
||||
"latency",
|
||||
}
|
||||
|
||||
CODE_FENCE_RE = re.compile(r"^```(?:json)?\\s*(.*?)\\s*```$", re.DOTALL)
|
||||
TITAN_NODE_RE = re.compile(r"\\btitan-[0-9a-z]{2}\\b", re.IGNORECASE)
|
||||
TITAN_RANGE_RE = re.compile(r"\\btitan-([0-9a-z]{2})/([0-9a-z]{2})\\b", re.IGNORECASE)
|
||||
CODE_FENCE_RE = re.compile(r"^```(?:json)?\s*(.*?)\s*```$", re.DOTALL)
|
||||
TITAN_NODE_RE = re.compile(r"\btitan-[0-9a-z]{2}\b", re.IGNORECASE)
|
||||
TITAN_RANGE_RE = re.compile(r"\btitan-([0-9a-z]{2})/([0-9a-z]{2})\b", re.IGNORECASE)
|
||||
# Unicode dash look-alikes that users (or chat clients' autocorrect) may type
# in place of the ASCII hyphen-minus: hyphen, non-breaking hyphen, figure
# dash, en dash, em dash, horizontal bar, minus sign, small hyphen-minus and
# fullwidth hyphen-minus.
_DASH_CHARS = "\u2010\u2011\u2012\u2013\u2014\u2015\u2212\uFE63\uFF0D"

# Precomputed translation table so every dash variant is rewritten to "-" in
# a single pass instead of one ``str.replace`` call per character.
_DASH_TRANSLATION = str.maketrans(dict.fromkeys(_DASH_CHARS, "-"))


def normalize_query(text: str) -> str:
    """Return *text* in a canonical form suitable for keyword matching.

    The result is lower-cased, has every character listed in ``_DASH_CHARS``
    replaced by an ASCII ``-``, has each run of whitespace collapsed to a
    single space, and carries no leading or trailing whitespace.

    Args:
        text: Raw query text. Falsy values (``None``, ``""``) are treated as
            an empty string.

    Returns:
        The normalized string; ``""`` when *text* is falsy.
    """
    cleaned = (text or "").lower().translate(_DASH_TRANSLATION)
    return re.sub(r"\s+", " ", cleaned).strip()
|
||||
|
||||
def _tokens(text: str) -> list[str]:
|
||||
toks = [t.lower() for t in TOKEN_RE.findall(text or "")]
|
||||
@ -267,14 +275,15 @@ def kb_retrieve(query: str, *, limit: int = 3) -> str:
|
||||
return "\n".join(parts).strip()
|
||||
|
||||
def _extract_titan_nodes(text: str) -> list[str]:
|
||||
names = {n.lower() for n in TITAN_NODE_RE.findall(text or "") if n}
|
||||
for match in re.finditer(r"titan-([0-9a-z]{2}(?:[/,][0-9a-z]{2})+)", text or "", re.IGNORECASE):
|
||||
cleaned = normalize_query(text)
|
||||
names = {n.lower() for n in TITAN_NODE_RE.findall(cleaned) if n}
|
||||
for match in re.finditer(r"titan-([0-9a-z]{2}(?:[/,][0-9a-z]{2})+)", cleaned, re.IGNORECASE):
|
||||
tail = match.group(1)
|
||||
for part in re.split(r"[/,]", tail):
|
||||
part = part.strip()
|
||||
if part:
|
||||
names.add(f"titan-{part.lower()}")
|
||||
for match in TITAN_RANGE_RE.finditer(text or ""):
|
||||
for match in TITAN_RANGE_RE.finditer(cleaned):
|
||||
left, right = match.groups()
|
||||
if left:
|
||||
names.add(f"titan-{left.lower()}")
|
||||
@ -323,6 +332,7 @@ def node_inventory_live() -> list[dict[str, Any]]:
|
||||
"arch": labels.get("kubernetes.io/arch") or labels.get("beta.kubernetes.io/arch") or "",
|
||||
"hardware": _hardware_class(labels),
|
||||
"roles": _node_roles(labels),
|
||||
"is_worker": _node_is_worker(node),
|
||||
"ready": _node_ready_status(node),
|
||||
}
|
||||
)
|
||||
@ -335,7 +345,7 @@ def _group_nodes(inventory: list[dict[str, Any]]) -> dict[str, list[str]]:
|
||||
return {k: sorted(v) for k, v in grouped.items()}
|
||||
|
||||
def node_inventory_context(query: str, inventory: list[dict[str, Any]] | None = None) -> str:
|
||||
q = (query or "").lower()
|
||||
q = normalize_query(query)
|
||||
if not any(word in q for word in ("node", "nodes", "raspberry", "rpi", "jetson", "amd64", "hardware", "cluster")):
|
||||
return ""
|
||||
if inventory is None:
|
||||
@ -372,7 +382,7 @@ def node_inventory_context(query: str, inventory: list[dict[str, Any]] | None =
|
||||
return "\n".join(lines)
|
||||
|
||||
def node_inventory_for_prompt(prompt: str) -> list[dict[str, Any]]:
|
||||
q = (prompt or "").lower()
|
||||
q = normalize_query(prompt)
|
||||
if any(word in q for word in ("node", "nodes", "raspberry", "rpi", "jetson", "amd64", "hardware", "cluster", "worker")):
|
||||
return node_inventory_live()
|
||||
return []
|
||||
@ -382,10 +392,14 @@ def _inventory_sets(inventory: list[dict[str, Any]]) -> dict[str, Any]:
|
||||
ready = [node["name"] for node in inventory if node.get("ready") is True]
|
||||
not_ready = [node["name"] for node in inventory if node.get("ready") is False]
|
||||
groups = _group_nodes(inventory)
|
||||
workers = [node for node in inventory if "worker" in (node.get("roles") or [])]
|
||||
workers = [node for node in inventory if node.get("is_worker") is True]
|
||||
worker_names = [node["name"] for node in workers]
|
||||
worker_ready = [node["name"] for node in workers if node.get("ready") is True]
|
||||
worker_not_ready = [node["name"] for node in workers if node.get("ready") is False]
|
||||
expected_workers = expected_worker_nodes_from_metrics()
|
||||
expected_ready = [n for n in expected_workers if n in ready] if expected_workers else []
|
||||
expected_not_ready = [n for n in expected_workers if n in not_ready] if expected_workers else []
|
||||
expected_missing = [n for n in expected_workers if n not in names] if expected_workers else []
|
||||
return {
|
||||
"names": sorted(names),
|
||||
"ready": sorted(ready),
|
||||
@ -394,10 +408,14 @@ def _inventory_sets(inventory: list[dict[str, Any]]) -> dict[str, Any]:
|
||||
"worker_names": sorted(worker_names),
|
||||
"worker_ready": sorted(worker_ready),
|
||||
"worker_not_ready": sorted(worker_not_ready),
|
||||
"expected_workers": expected_workers,
|
||||
"expected_ready": sorted(expected_ready),
|
||||
"expected_not_ready": sorted(expected_not_ready),
|
||||
"expected_missing": sorted(expected_missing),
|
||||
}
|
||||
|
||||
def structured_answer(prompt: str, *, inventory: list[dict[str, Any]], metrics_summary: str) -> str:
|
||||
q = (prompt or "").lower()
|
||||
q = normalize_query(prompt)
|
||||
if metrics_summary and any(word in q for word in ("postgres", "connection", "connections", "db")):
|
||||
return metrics_summary
|
||||
|
||||
@ -412,29 +430,75 @@ def structured_answer(prompt: str, *, inventory: list[dict[str, Any]], metrics_s
|
||||
worker_names = sets["worker_names"]
|
||||
worker_ready = sets["worker_ready"]
|
||||
worker_not_ready = sets["worker_not_ready"]
|
||||
expected_workers = sets["expected_workers"]
|
||||
expected_ready = sets["expected_ready"]
|
||||
expected_not_ready = sets["expected_not_ready"]
|
||||
expected_missing = sets["expected_missing"]
|
||||
total = len(names)
|
||||
nodes_in_query = _extract_titan_nodes(q)
|
||||
rpi_nodes = set(groups.get("rpi4", [])) | set(groups.get("rpi5", []))
|
||||
non_rpi = set(groups.get("jetson", [])) | set(groups.get("amd64", []))
|
||||
unknown_hw = set(groups.get("arm64-unknown", [])) | set(groups.get("unknown", []))
|
||||
|
||||
for node in _extract_titan_nodes(q):
|
||||
if node and ("is" in q or "part of" in q or "in atlas" in q or "in cluster" in q):
|
||||
if nodes_in_query and ("raspberry" in q or "rpi" in q):
|
||||
parts: list[str] = []
|
||||
for node in nodes_in_query:
|
||||
if node in rpi_nodes:
|
||||
parts.append(f"{node} is a Raspberry Pi node.")
|
||||
elif node in non_rpi:
|
||||
parts.append(f"{node} is not a Raspberry Pi node.")
|
||||
elif node in names:
|
||||
parts.append(f"{node} is in Atlas but hardware is unknown.")
|
||||
else:
|
||||
parts.append(f"{node} is not in the Atlas cluster.")
|
||||
return " ".join(parts)
|
||||
|
||||
if nodes_in_query and "jetson" in q:
|
||||
jets = set(groups.get("jetson", []))
|
||||
parts = []
|
||||
for node in nodes_in_query:
|
||||
if node in jets:
|
||||
parts.append(f"{node} is a Jetson node.")
|
||||
elif node in names:
|
||||
parts.append(f"{node} is not a Jetson node.")
|
||||
else:
|
||||
parts.append(f"{node} is not in the Atlas cluster.")
|
||||
return " ".join(parts)
|
||||
|
||||
if nodes_in_query and ("is" in q or "part of" in q or "in atlas" in q or "in cluster" in q or "present" in q or "exist" in q):
|
||||
parts: list[str] = []
|
||||
for node in nodes_in_query:
|
||||
if node in names:
|
||||
return f"Yes. {node} is in the Atlas cluster."
|
||||
return f"No. {node} is not in the Atlas cluster."
|
||||
parts.append(f"Yes. {node} is in the Atlas cluster.")
|
||||
else:
|
||||
parts.append(f"No. {node} is not in the Atlas cluster.")
|
||||
return " ".join(parts)
|
||||
|
||||
if "non-raspberry" in q or "non raspberry" in q or "not raspberry" in q:
|
||||
non_rpi = sorted(set(groups.get("jetson", [])) | set(groups.get("amd64", [])))
|
||||
if "besides" in q:
|
||||
amd = groups.get("amd64", [])
|
||||
if any(term in q for term in ("non-raspberry", "non raspberry", "not raspberry", "non-rpi", "non rpi")):
|
||||
non_rpi_sorted = sorted(non_rpi)
|
||||
if any(word in q for word in ("how many", "count", "number")):
|
||||
return f"Atlas has {len(non_rpi_sorted)} non‑Raspberry Pi nodes."
|
||||
if any(phrase in q for phrase in ("besides jetson", "excluding jetson", "without jetson", "non jetson")):
|
||||
amd = sorted(groups.get("amd64", []))
|
||||
return f"Non‑Raspberry Pi nodes (excluding Jetson): {', '.join(amd)}." if amd else "No non‑Raspberry Pi nodes outside Jetson."
|
||||
return f"Non‑Raspberry Pi nodes: {', '.join(non_rpi)}." if non_rpi else "No non‑Raspberry Pi nodes found."
|
||||
return f"Non‑Raspberry Pi nodes: {', '.join(non_rpi_sorted)}." if non_rpi_sorted else "No non‑Raspberry Pi nodes found."
|
||||
|
||||
if "jetson" in q:
|
||||
jets = groups.get("jetson", [])
|
||||
if any(word in q for word in ("how many", "count", "number")):
|
||||
return f"Atlas has {len(jets)} Jetson nodes."
|
||||
return f"Jetson nodes: {', '.join(jets)}." if jets else "No Jetson nodes found."
|
||||
|
||||
if "amd64" in q or "x86" in q:
|
||||
amd = groups.get("amd64", [])
|
||||
if any(word in q for word in ("how many", "count", "number")):
|
||||
return f"Atlas has {len(amd)} amd64 nodes."
|
||||
return f"amd64 nodes: {', '.join(amd)}." if amd else "No amd64 nodes found."
|
||||
|
||||
if "arm64" in q and "node" in q and any(word in q for word in ("how many", "count", "number")):
|
||||
count = sum(1 for node in inventory if node.get("arch") == "arm64")
|
||||
return f"Atlas has {count} arm64 nodes."
|
||||
|
||||
if "rpi4" in q:
|
||||
rpi4 = groups.get("rpi4", [])
|
||||
if any(word in q for word in ("how many", "count", "number")):
|
||||
@ -448,28 +512,52 @@ def structured_answer(prompt: str, *, inventory: list[dict[str, Any]], metrics_s
|
||||
return f"rpi5 nodes: {', '.join(rpi5)}." if rpi5 else "No rpi5 nodes found."
|
||||
|
||||
if "raspberry" in q or "rpi" in q:
|
||||
rpi = sorted(set(groups.get("rpi4", [])) | set(groups.get("rpi5", [])))
|
||||
rpi = sorted(rpi_nodes)
|
||||
if any(word in q for word in ("how many", "count", "number")):
|
||||
return f"Atlas has {len(rpi)} Raspberry Pi nodes."
|
||||
return f"Raspberry Pi nodes: {', '.join(rpi)}." if rpi else "No Raspberry Pi nodes found."
|
||||
|
||||
if "arm64-unknown" in q or "unknown" in q or "no hardware" in q:
|
||||
unknown = sorted(set(groups.get("arm64-unknown", [])) | set(groups.get("unknown", [])))
|
||||
unknown = sorted(unknown_hw)
|
||||
return f"Unknown hardware nodes: {', '.join(unknown)}." if unknown else "No unknown hardware labels."
|
||||
|
||||
if "worker" in q and "node" in q:
|
||||
if any(word in q for word in ("missing", "expected", "should")):
|
||||
expected_workers = expected_worker_nodes_from_metrics()
|
||||
missing = sorted(set(expected_workers) - set(worker_ready + worker_not_ready)) if expected_workers else []
|
||||
if "missing" in q and missing:
|
||||
return "Missing worker nodes: " + ", ".join(missing) + "."
|
||||
if ("notready" in q or "not ready" in q or "unready" in q) and ("node" in q or "nodes" in q):
|
||||
return "Not ready nodes: " + (", ".join(not_ready) if not_ready else "none") + "."
|
||||
|
||||
if "worker" in q and ("node" in q or "nodes" in q or "workers" in q):
|
||||
not_ready_query = "not ready" in q or "unready" in q or "down" in q or ("not" in q and "ready" in q)
|
||||
if expected_workers:
|
||||
msg = f"Grafana inventory expects {len(expected_workers)} workers."
|
||||
if missing:
|
||||
msg += f" Missing: {', '.join(missing)}."
|
||||
if "missing" in q:
|
||||
return "Missing worker nodes: " + (", ".join(expected_missing) if expected_missing else "none") + "."
|
||||
if "ready" in q and ("not ready" in q or "vs" in q or "versus" in q):
|
||||
return (
|
||||
f"Expected workers: {len(expected_ready)} ready, "
|
||||
f"{len(expected_not_ready)} not ready (expected {len(expected_workers)})."
|
||||
)
|
||||
if any(word in q for word in ("how many", "count", "number")) and ("expect" in q or "expected" in q or "should" in q):
|
||||
msg = f"Grafana inventory expects {len(expected_workers)} worker nodes."
|
||||
if expected_missing:
|
||||
msg += f" Missing: {', '.join(expected_missing)}."
|
||||
return msg
|
||||
return "No expected worker inventory found; using live cluster state."
|
||||
if "not ready" in q or "unready" in q or "down" in q:
|
||||
if not_ready_query:
|
||||
if expected_not_ready or expected_missing:
|
||||
detail = []
|
||||
if expected_not_ready:
|
||||
detail.append(f"Not ready: {', '.join(expected_not_ready)}")
|
||||
if expected_missing:
|
||||
detail.append(f"Missing: {', '.join(expected_missing)}")
|
||||
return "Worker nodes needing attention. " + " ".join(detail) + "."
|
||||
return "All expected worker nodes are Ready."
|
||||
if any(word in q for word in ("expected", "expect", "should")):
|
||||
msg = f"Grafana inventory expects {len(expected_workers)} worker nodes."
|
||||
if expected_missing:
|
||||
msg += f" Missing: {', '.join(expected_missing)}."
|
||||
return msg
|
||||
if any(word in q for word in ("how many", "count", "number")):
|
||||
return f"Worker nodes: {len(expected_ready)} ready, {len(expected_not_ready)} not ready (expected {len(expected_workers)})."
|
||||
if "ready" in q:
|
||||
return f"Ready worker nodes ({len(expected_ready)}): {', '.join(expected_ready)}."
|
||||
if not_ready_query:
|
||||
return "Worker nodes not ready: " + (", ".join(worker_not_ready) if worker_not_ready else "none") + "."
|
||||
if any(word in q for word in ("how many", "count", "number")):
|
||||
return f"Worker nodes: {len(worker_ready)} ready, {len(worker_not_ready)} not ready."
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user