atlasbot: replace targeted handlers with generic planner
This commit is contained in:
parent
6c413d4a50
commit
37a203509b
@ -95,11 +95,29 @@ CODE_FENCE_RE = re.compile(r"^```(?:json)?\s*(.*?)\s*```$", re.DOTALL)
|
||||
# Matches a single Atlas node name such as "titan-01" or "titan-0a".
TITAN_NODE_RE = re.compile(r"\btitan-[0-9a-z]{2}\b", re.IGNORECASE)
# Matches a node range written as "titan-01/05" (start/end suffix pair).
TITAN_RANGE_RE = re.compile(r"\btitan-([0-9a-z]{2})/([0-9a-z]{2})\b", re.IGNORECASE)
# Unicode dash variants (hyphens, en/em dashes, minus signs) that user queries
# may contain.  NOTE(review): presumably normalized to ASCII "-" by
# normalize_query — confirm at the usage site.
_DASH_CHARS = "\u2010\u2011\u2012\u2013\u2014\u2015\u2212\uFE63\uFF0D"

# Canned PromQL for "which node is hottest" questions, one expression per
# resource family.  Each query takes the per-node average, keeps only the top
# result via topk(1), and uses label_replace on node_uname_info to map the
# scrape instance to a human node name (exposed via the "node" label).
HOTTEST_QUERIES = {
    # Highest CPU utilization: % of non-idle time over a 5m rate window.
    "cpu": "label_replace(topk(1, avg by (node) (((1 - avg by (instance) (rate(node_cpu_seconds_total{mode=\"idle\"}[5m]))) * 100) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\"))), \"__name__\", \"$1\", \"node\", \"(.*)\")",
    # Highest memory pressure: % of MemTotal currently not MemAvailable.
    "ram": "label_replace(topk(1, avg by (node) ((avg by (instance) ((node_memory_MemTotal_bytes - node_memory_MemAvailable_bytes) / node_memory_MemTotal_bytes * 100)) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\"))), \"__name__\", \"$1\", \"node\", \"(.*)\")",
    # Highest network throughput: receive + transmit bytes/s, loopback excluded.
    "net": "label_replace(topk(1, avg by (node) ((sum by (instance) (rate(node_network_receive_bytes_total{device!~\"lo\"}[5m]) + rate(node_network_transmit_bytes_total{device!~\"lo\"}[5m]))) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\"))), \"__name__\", \"$1\", \"node\", \"(.*)\")",
    # Highest disk throughput: read + written bytes/s across all devices.
    "io": "label_replace(topk(1, avg by (node) ((sum by (instance) (rate(node_disk_read_bytes_total[5m]) + rate(node_disk_written_bytes_total[5m]))) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\"))), \"__name__\", \"$1\", \"node\", \"(.*)\")",
}

# Phrase triggers that map a natural-language query to a planner operation.
# Scanned in insertion order by _detect_operation; first hit wins.
OPERATION_HINTS = {
    "count": ("how many", "count", "number", "total"),
    "list": ("list", "which", "what are", "show", "names"),
    "top": ("top", "hottest", "highest", "most", "largest", "max", "maximum"),
    "status": ("ready", "not ready", "unready", "down", "missing", "status"),
}

# Phrase triggers mapping a query to a metric family (keys align with
# HOTTEST_QUERIES plus "connections" for Postgres questions).
METRIC_HINTS = {
    "cpu": ("cpu",),
    "ram": ("ram", "memory", "mem"),
    "net": ("net", "network", "bandwidth", "throughput"),
    "io": ("io", "disk", "storage"),
    "connections": ("connections", "conn", "postgres", "database", "db"),
}

# Phrase triggers mapping a query to a hardware class.  Order matters:
# specific classes ("rpi4", "rpi5") precede the generic "rpi" so that
# _detect_hardware_filters sees the most specific phrases first.
HARDWARE_HINTS = {
    "amd64": ("amd64", "x86", "x86_64", "x86-64"),
    "jetson": ("jetson",),
    "rpi4": ("rpi4",),
    "rpi5": ("rpi5",),
    "rpi": ("rpi", "raspberry"),
    "arm64": ("arm64", "aarch64"),
}
|
||||
|
||||
def normalize_query(text: str) -> str:
|
||||
@ -312,63 +330,127 @@ def _humanize_rate(value: str, *, unit: str) -> str:
|
||||
return f"{val / 1024:.2f} KB/s"
|
||||
return f"{val:.2f} B/s"
|
||||
|
||||
def _hottest_query(metric: str, node_regex: str | None) -> str:
|
||||
expr = HOTTEST_QUERIES[metric]
|
||||
if node_regex:
|
||||
def _has_any(text: str, phrases: tuple[str, ...]) -> bool:
|
||||
return any(p in text for p in phrases)
|
||||
|
||||
def _detect_operation(q: str) -> str | None:
    """Map the normalized query *q* to a planner operation name.

    Returns the first OPERATION_HINTS key whose phrase list matches, or
    None when no phrase occurs in the query.
    """
    return next(
        (op for op, phrases in OPERATION_HINTS.items() if _has_any(q, phrases)),
        None,
    )
|
||||
|
||||
def _detect_metric(q: str) -> str | None:
    """Map the normalized query *q* to a metric family.

    Returns the first METRIC_HINTS key whose phrase list matches, or None
    when the query mentions no known metric.
    """
    return next(
        (metric for metric, phrases in METRIC_HINTS.items() if _has_any(q, phrases)),
        None,
    )
|
||||
|
||||
def _detect_hardware_filters(q: str) -> tuple[set[str], set[str]]:
    """Extract hardware-class filters from the normalized query *q*.

    Returns (include, exclude) sets of HARDWARE_HINTS keys.  A phrase
    preceded by "non ", "non-" or "not " marks the class as excluded;
    a bare occurrence marks it as included.
    """
    include: set[str] = set()
    exclude: set[str] = set()
    for hardware, phrases in HARDWARE_HINTS.items():
        for phrase in phrases:
            negated = any(
                prefix + phrase in q for prefix in ("non ", "non-", "not ")
            )
            if negated:
                exclude.add(hardware)
            elif phrase in q:
                include.add(hardware)
    return include, exclude
|
||||
|
||||
def _detect_entity(q: str) -> str | None:
    """Classify which Kubernetes entity the normalized query *q* is about."""
    # An explicit titan-XX node name counts as talking about nodes even
    # without the word "node" in the query.
    if TITAN_NODE_RE.search(q):
        return "node"
    keyword_map = (
        ("node", ("node", "nodes", "worker")),
        ("pod", ("pod", "pods")),
        ("namespace", ("namespace", "namespaces")),
    )
    for entity, keywords in keyword_map:
        if any(kw in q for kw in keywords):
            return entity
    return None
|
||||
|
||||
def _metric_entry_score(entry: dict[str, Any], tokens: list[str], *, metric: str | None, op: str | None) -> int:
    """Score how well a metric-catalog *entry* matches the query.

    Token overlap contributes 1 point per distinct token (2 when the token
    also appears in the panel title); detected metric-family phrases add 3
    each; "top"-style queries and node-scoped entries get small boosts.
    """
    hay = _metric_tokens(entry)
    title = (entry.get("panel_title") or "").lower()
    score = 0
    # Distinct-token overlap; title matches weigh double.
    for tok in set(tokens):
        if tok in hay:
            score += 2 if tok in title else 1
    # Favor entries that mention phrases of the detected metric family.
    if metric:
        score += 3 * sum(
            1 for phrase in METRIC_HINTS.get(metric, (metric,)) if phrase in hay
        )
    # "top" queries prefer panels already built around hottest/top semantics.
    if op == "top" and ("hottest" in hay or "top" in hay):
        score += 3
    # Mild preference for node-scoped panels.
    if "node" in hay:
        score += 1
    return score
|
||||
|
||||
def _select_metric_entry(tokens: list[str], *, metric: str | None, op: str | None) -> dict[str, Any] | None:
    """Return the best-scoring metric-catalog entry, or None when nothing scores.

    Ties are broken by catalog order (the first entry reaching the maximum
    score wins), matching the behavior of a stable descending sort.
    """
    best_entry: dict[str, Any] | None = None
    best_score = 0
    for entry in _METRIC_INDEX:
        if not isinstance(entry, dict):
            continue
        score = _metric_entry_score(entry, tokens, metric=metric, op=op)
        # Strict ">" keeps the earliest entry on equal scores and rejects
        # zero-score entries altogether.
        if score > best_score:
            best_entry, best_score = entry, score
    return best_entry
|
||||
|
||||
def _apply_node_filter(expr: str, node_regex: str | None) -> str:
|
||||
if not node_regex:
|
||||
return expr
|
||||
needle = 'node_uname_info{nodename!=""}'
|
||||
replacement = f'node_uname_info{{nodename!=\"\",nodename=~\"{node_regex}\"}}'
|
||||
return expr.replace(needle, replacement)
|
||||
return expr
|
||||
|
||||
def _vm_hottest(metric: str, node_regex: str | None) -> tuple[str, str] | None:
|
||||
expr = _hottest_query(metric, node_regex)
|
||||
res = vm_query(expr)
|
||||
def _format_metric_answer(entry: dict[str, Any], res: dict | None) -> str:
|
||||
series = _vm_value_series(res)
|
||||
panel = entry.get("panel_title") or "Metric"
|
||||
if not series:
|
||||
return None
|
||||
first = series[0]
|
||||
labels = first.get("metric") or {}
|
||||
value = first.get("value") or []
|
||||
val = value[1] if isinstance(value, list) and len(value) > 1 else ""
|
||||
node = labels.get("node") or labels.get("__name__") or ""
|
||||
if not node:
|
||||
return None
|
||||
return (str(node), str(val))
|
||||
|
||||
def _hottest_answer(q: str, *, nodes: list[str] | None) -> str:
    """Answer a "hottest node" question for a single resource metric.

    Detects the metric family from keywords in the normalized query *q*
    (defaulting to CPU when none is mentioned) and asks VictoriaMetrics for
    the single busiest node, optionally restricted to *nodes*.

    nodes semantics: None means "no hardware filter"; an empty list means a
    hardware filter matched nothing, which gets an explicit answer.
    Returns "" when the metrics backend yields no usable result.
    """
    metric = None
    assumed_cpu = False
    # Keyword-driven metric detection; first matching family wins.
    if "cpu" in q:
        metric = "cpu"
    elif "ram" in q or "memory" in q:
        metric = "ram"
    elif "net" in q or "network" in q:
        metric = "net"
    elif "io" in q or "disk" in q or "storage" in q:
        metric = "io"
    if metric is None:
        # No metric named: assume CPU and flag it in the answer suffix.
        metric = "cpu"
        assumed_cpu = True
    if nodes is not None and not nodes:
        return "No nodes match the requested hardware class."

    # Node list becomes an alternation regex for the PromQL nodename filter.
    node_regex = "|".join(nodes) if nodes else None
    metrics = [metric]
    lines: list[str] = []
    for m in metrics:
        picked = _vm_hottest(m, node_regex)
        if not picked:
            continue
        node, val = picked
        # CPU/RAM values are percentages; NET and I/O are byte rates.
        unit = "%" if m in ("cpu", "ram") else "B/s"
        val_str = _humanize_rate(val, unit=unit)
        label = {"cpu": "CPU", "ram": "RAM", "net": "NET", "io": "I/O"}[m]
        lines.append(f"{label}: {node} ({val_str})")
    if not lines:
        # Metrics backend unavailable or returned nothing usable.
        return ""
    label = metric.upper()
    suffix = " (defaulting to CPU)" if assumed_cpu else ""
    # lines holds one "LABEL: node (value)" entry; drop the label prefix.
    return f"Hottest node by {label}: {lines[0].split(': ', 1)[1]}.{suffix}"
|
||||
rendered = vm_render_result(res, limit=5)
|
||||
if not rendered:
|
||||
return ""
|
||||
lines = [line.lstrip("-").strip() for line in rendered.splitlines() if line.strip().startswith("-")]
|
||||
if len(lines) == 1:
|
||||
return f"{panel}: {lines[0]}."
|
||||
return f"{panel}:\n" + "\n".join(f"- {line}" for line in lines)
|
||||
|
||||
def _inventory_filter(
|
||||
inventory: list[dict[str, Any]],
|
||||
*,
|
||||
include_hw: set[str],
|
||||
exclude_hw: set[str],
|
||||
only_workers: bool,
|
||||
only_ready: bool | None,
|
||||
nodes_in_query: list[str],
|
||||
) -> list[dict[str, Any]]:
|
||||
results = inventory
|
||||
if nodes_in_query:
|
||||
results = [node for node in results if node.get("name") in nodes_in_query]
|
||||
if only_workers:
|
||||
results = [node for node in results if node.get("is_worker") is True]
|
||||
if only_ready is True:
|
||||
results = [node for node in results if node.get("ready") is True]
|
||||
if only_ready is False:
|
||||
results = [node for node in results if node.get("ready") is False]
|
||||
if include_hw:
|
||||
results = [node for node in results if _hardware_match(node, include_hw)]
|
||||
if exclude_hw:
|
||||
results = [node for node in results if not _hardware_match(node, exclude_hw)]
|
||||
return results
|
||||
|
||||
def _hardware_match(node: dict[str, Any], filters: set[str]) -> bool:
|
||||
hw = node.get("hardware") or ""
|
||||
arch = node.get("arch") or ""
|
||||
for f in filters:
|
||||
if f == "rpi" and hw in ("rpi4", "rpi5"):
|
||||
return True
|
||||
if f == "arm64" and arch == "arm64":
|
||||
return True
|
||||
if hw == f:
|
||||
return True
|
||||
if f == "amd64" and arch == "amd64":
|
||||
return True
|
||||
return False
|
||||
|
||||
def _node_roles(labels: dict[str, Any]) -> list[str]:
|
||||
roles: list[str] = []
|
||||
@ -495,176 +577,103 @@ def _inventory_sets(inventory: list[dict[str, Any]]) -> dict[str, Any]:
|
||||
|
||||
def structured_answer(prompt: str, *, inventory: list[dict[str, Any]], metrics_summary: str) -> str:
|
||||
q = normalize_query(prompt)
|
||||
if metrics_summary and any(word in q for word in ("postgres", "connection", "connections", "db")):
|
||||
return metrics_summary
|
||||
|
||||
if not inventory:
|
||||
if not q:
|
||||
return ""
|
||||
|
||||
sets = _inventory_sets(inventory)
|
||||
names = sets["names"]
|
||||
ready = sets["ready"]
|
||||
not_ready = sets["not_ready"]
|
||||
groups = sets["groups"]
|
||||
worker_names = sets["worker_names"]
|
||||
worker_ready = sets["worker_ready"]
|
||||
worker_not_ready = sets["worker_not_ready"]
|
||||
expected_workers = sets["expected_workers"]
|
||||
expected_ready = sets["expected_ready"]
|
||||
expected_not_ready = sets["expected_not_ready"]
|
||||
expected_missing = sets["expected_missing"]
|
||||
total = len(names)
|
||||
tokens = _tokens(q)
|
||||
op = _detect_operation(q)
|
||||
metric = _detect_metric(q)
|
||||
entity = _detect_entity(q)
|
||||
include_hw, exclude_hw = _detect_hardware_filters(q)
|
||||
nodes_in_query = _extract_titan_nodes(q)
|
||||
rpi_nodes = set(groups.get("rpi4", [])) | set(groups.get("rpi5", []))
|
||||
non_rpi = set(groups.get("jetson", [])) | set(groups.get("amd64", []))
|
||||
unknown_hw = set(groups.get("arm64-unknown", [])) | set(groups.get("unknown", []))
|
||||
only_workers = "worker" in q or "workers" in q
|
||||
only_ready: bool | None = None
|
||||
if "not ready" in q or "unready" in q or "down" in q or "missing" in q:
|
||||
only_ready = False
|
||||
elif "ready" in q:
|
||||
only_ready = True
|
||||
|
||||
if "hottest" in q or "hot" in q:
|
||||
filter_nodes: list[str] | None = None
|
||||
if "amd64" in q or "x86" in q:
|
||||
filter_nodes = sorted(groups.get("amd64", []))
|
||||
elif "jetson" in q:
|
||||
filter_nodes = sorted(groups.get("jetson", []))
|
||||
elif "raspberry" in q or "rpi" in q:
|
||||
filter_nodes = sorted(rpi_nodes)
|
||||
elif "arm64" in q:
|
||||
filter_nodes = sorted([n for n in names if n not in groups.get("amd64", [])])
|
||||
hottest = _hottest_answer(q, nodes=filter_nodes)
|
||||
if hottest:
|
||||
return hottest
|
||||
return "Unable to determine hottest nodes right now (metrics unavailable)."
|
||||
if entity == "node" and only_ready is not None and op != "count":
|
||||
op = "status"
|
||||
|
||||
if nodes_in_query and ("raspberry" in q or "rpi" in q):
|
||||
parts: list[str] = []
|
||||
for node in nodes_in_query:
|
||||
if node in rpi_nodes:
|
||||
parts.append(f"{node} is a Raspberry Pi node.")
|
||||
elif node in non_rpi:
|
||||
parts.append(f"{node} is not a Raspberry Pi node.")
|
||||
elif node in names:
|
||||
parts.append(f"{node} is in Atlas but hardware is unknown.")
|
||||
else:
|
||||
parts.append(f"{node} is not in the Atlas cluster.")
|
||||
return " ".join(parts)
|
||||
if not op and entity == "node":
|
||||
op = "list" if (include_hw or exclude_hw or nodes_in_query) else "count"
|
||||
|
||||
if nodes_in_query and "jetson" in q:
|
||||
jets = set(groups.get("jetson", []))
|
||||
parts = []
|
||||
for node in nodes_in_query:
|
||||
if node in jets:
|
||||
parts.append(f"{node} is a Jetson node.")
|
||||
elif node in names:
|
||||
parts.append(f"{node} is not a Jetson node.")
|
||||
else:
|
||||
parts.append(f"{node} is not in the Atlas cluster.")
|
||||
return " ".join(parts)
|
||||
if op == "top" and metric is None:
|
||||
metric = "cpu"
|
||||
|
||||
if nodes_in_query and ("is" in q or "part of" in q or "in atlas" in q or "in cluster" in q or "present" in q or "exist" in q):
|
||||
parts: list[str] = []
|
||||
for node in nodes_in_query:
|
||||
if node in names:
|
||||
parts.append(f"Yes. {node} is in the Atlas cluster.")
|
||||
else:
|
||||
parts.append(f"No. {node} is not in the Atlas cluster.")
|
||||
return " ".join(parts)
|
||||
|
||||
if any(term in q for term in ("non-raspberry", "non raspberry", "not raspberry", "non-rpi", "non rpi")):
|
||||
non_rpi_sorted = sorted(non_rpi)
|
||||
if any(word in q for word in ("how many", "count", "number")):
|
||||
return f"Atlas has {len(non_rpi_sorted)} non‑Raspberry Pi nodes."
|
||||
if any(phrase in q for phrase in ("besides jetson", "excluding jetson", "without jetson", "non jetson")):
|
||||
amd = sorted(groups.get("amd64", []))
|
||||
return f"Non‑Raspberry Pi nodes (excluding Jetson): {', '.join(amd)}." if amd else "No non‑Raspberry Pi nodes outside Jetson."
|
||||
return f"Non‑Raspberry Pi nodes: {', '.join(non_rpi_sorted)}." if non_rpi_sorted else "No non‑Raspberry Pi nodes found."
|
||||
|
||||
if "jetson" in q:
|
||||
jets = groups.get("jetson", [])
|
||||
if any(word in q for word in ("how many", "count", "number")):
|
||||
return f"Atlas has {len(jets)} Jetson nodes."
|
||||
return f"Jetson nodes: {', '.join(jets)}." if jets else "No Jetson nodes found."
|
||||
|
||||
if "amd64" in q or "x86" in q:
|
||||
amd = groups.get("amd64", [])
|
||||
if any(word in q for word in ("how many", "count", "number")):
|
||||
return f"Atlas has {len(amd)} amd64 nodes."
|
||||
return f"amd64 nodes: {', '.join(amd)}." if amd else "No amd64 nodes found."
|
||||
|
||||
if "arm64" in q and "node" in q and any(word in q for word in ("how many", "count", "number")):
|
||||
count = sum(1 for node in inventory if node.get("arch") == "arm64")
|
||||
return f"Atlas has {count} arm64 nodes."
|
||||
|
||||
if "rpi4" in q:
|
||||
rpi4 = groups.get("rpi4", [])
|
||||
if any(word in q for word in ("how many", "count", "number")):
|
||||
return f"Atlas has {len(rpi4)} rpi4 nodes."
|
||||
return f"rpi4 nodes: {', '.join(rpi4)}." if rpi4 else "No rpi4 nodes found."
|
||||
|
||||
if "rpi5" in q:
|
||||
rpi5 = groups.get("rpi5", [])
|
||||
if any(word in q for word in ("how many", "count", "number")):
|
||||
return f"Atlas has {len(rpi5)} rpi5 nodes."
|
||||
return f"rpi5 nodes: {', '.join(rpi5)}." if rpi5 else "No rpi5 nodes found."
|
||||
|
||||
if "raspberry" in q or "rpi" in q:
|
||||
rpi = sorted(rpi_nodes)
|
||||
if any(word in q for word in ("how many", "count", "number")):
|
||||
return f"Atlas has {len(rpi)} Raspberry Pi nodes."
|
||||
return f"Raspberry Pi nodes: {', '.join(rpi)}." if rpi else "No Raspberry Pi nodes found."
|
||||
|
||||
if "arm64-unknown" in q or "unknown" in q or "no hardware" in q:
|
||||
unknown = sorted(unknown_hw)
|
||||
return f"Unknown hardware nodes: {', '.join(unknown)}." if unknown else "No unknown hardware labels."
|
||||
|
||||
if ("notready" in q or "not ready" in q or "unready" in q) and ("node" in q or "nodes" in q):
|
||||
return "Not ready nodes: " + (", ".join(not_ready) if not_ready else "none") + "."
|
||||
|
||||
if "worker" in q and ("node" in q or "nodes" in q or "workers" in q):
|
||||
not_ready_query = "not ready" in q or "unready" in q or "down" in q or ("not" in q and "ready" in q)
|
||||
if expected_workers:
|
||||
if "missing" in q:
|
||||
return "Missing worker nodes: " + (", ".join(expected_missing) if expected_missing else "none") + "."
|
||||
if "ready" in q and ("not ready" in q or "vs" in q or "versus" in q):
|
||||
return (
|
||||
f"Expected workers: {len(expected_ready)} ready, "
|
||||
f"{len(expected_not_ready)} not ready (expected {len(expected_workers)})."
|
||||
# Metrics-first when a metric or top operation is requested.
|
||||
if metric or op == "top":
|
||||
entry = _select_metric_entry(tokens, metric=metric, op=op)
|
||||
if entry and isinstance(entry.get("exprs"), list) and entry["exprs"]:
|
||||
expr = entry["exprs"][0]
|
||||
if inventory:
|
||||
scoped = _inventory_filter(
|
||||
inventory,
|
||||
include_hw=include_hw,
|
||||
exclude_hw=exclude_hw,
|
||||
only_workers=only_workers,
|
||||
only_ready=None,
|
||||
nodes_in_query=nodes_in_query,
|
||||
)
|
||||
if any(word in q for word in ("how many", "count", "number")) and ("expect" in q or "expected" in q or "should" in q):
|
||||
if scoped:
|
||||
node_regex = "|".join([n["name"] for n in scoped])
|
||||
expr = _apply_node_filter(expr, node_regex)
|
||||
res = vm_query(expr, timeout=20)
|
||||
answer = _format_metric_answer(entry, res)
|
||||
if answer:
|
||||
return answer
|
||||
if metrics_summary:
|
||||
return metrics_summary
|
||||
|
||||
if entity != "node" or not inventory:
|
||||
if any(word in q for word in METRIC_HINT_WORDS) and not metrics_summary:
|
||||
return "I don't have data to answer that right now."
|
||||
return ""
|
||||
|
||||
expected_workers = expected_worker_nodes_from_metrics()
|
||||
filtered = _inventory_filter(
|
||||
inventory,
|
||||
include_hw=include_hw,
|
||||
exclude_hw=exclude_hw,
|
||||
only_workers=only_workers,
|
||||
only_ready=only_ready if op in ("status", "count") else None,
|
||||
nodes_in_query=nodes_in_query,
|
||||
)
|
||||
names = [node["name"] for node in filtered]
|
||||
|
||||
if op == "status":
|
||||
if "missing" in q and expected_workers:
|
||||
missing = sorted(set(expected_workers) - {n["name"] for n in inventory})
|
||||
return "Missing nodes: " + (", ".join(missing) if missing else "none") + "."
|
||||
if only_ready is False:
|
||||
return "Not ready nodes: " + (", ".join(names) if names else "none") + "."
|
||||
if only_ready is True:
|
||||
return f"Ready nodes ({len(names)}): " + (", ".join(names) if names else "none") + "."
|
||||
|
||||
if op == "count":
|
||||
if expected_workers and ("expected" in q or "should" in q):
|
||||
missing = sorted(set(expected_workers) - {n["name"] for n in inventory})
|
||||
msg = f"Grafana inventory expects {len(expected_workers)} worker nodes."
|
||||
if expected_missing:
|
||||
msg += f" Missing: {', '.join(expected_missing)}."
|
||||
if missing:
|
||||
msg += f" Missing: {', '.join(missing)}."
|
||||
return msg
|
||||
if not_ready_query:
|
||||
if expected_not_ready or expected_missing:
|
||||
detail = []
|
||||
if expected_not_ready:
|
||||
detail.append(f"Not ready: {', '.join(expected_not_ready)}")
|
||||
if expected_missing:
|
||||
detail.append(f"Missing: {', '.join(expected_missing)}")
|
||||
return "Worker nodes needing attention. " + " ".join(detail) + "."
|
||||
return "All expected worker nodes are Ready."
|
||||
if any(word in q for word in ("expected", "expect", "should")):
|
||||
msg = f"Grafana inventory expects {len(expected_workers)} worker nodes."
|
||||
if expected_missing:
|
||||
msg += f" Missing: {', '.join(expected_missing)}."
|
||||
return msg
|
||||
if any(word in q for word in ("how many", "count", "number")):
|
||||
return f"Worker nodes: {len(expected_ready)} ready, {len(expected_not_ready)} not ready (expected {len(expected_workers)})."
|
||||
if "ready" in q:
|
||||
return f"Ready worker nodes ({len(expected_ready)}): {', '.join(expected_ready)}."
|
||||
if not_ready_query:
|
||||
return "Worker nodes not ready: " + (", ".join(worker_not_ready) if worker_not_ready else "none") + "."
|
||||
if any(word in q for word in ("how many", "count", "number")):
|
||||
return f"Worker nodes: {len(worker_ready)} ready, {len(worker_not_ready)} not ready."
|
||||
return "Ready worker nodes ({}): {}.".format(len(worker_ready), ", ".join(worker_ready))
|
||||
if not (include_hw or exclude_hw or nodes_in_query or only_workers):
|
||||
return f"Atlas has {len(names)} nodes."
|
||||
return f"Matching nodes: {len(names)}."
|
||||
|
||||
if any(word in q for word in ("how many", "count", "number")) and "node" in q:
|
||||
return f"Atlas has {total} nodes; {len(ready)} ready, {len(not_ready)} not ready."
|
||||
|
||||
if "node names" in q or ("nodes" in q and "named" in q) or "naming" in q:
|
||||
return "Atlas node names: " + ", ".join(names) + "."
|
||||
|
||||
if "ready" in q and "node" in q:
|
||||
return f"Ready nodes ({len(ready)}): {', '.join(ready)}."
|
||||
if op == "list":
|
||||
if nodes_in_query:
|
||||
parts = []
|
||||
existing = {n["name"] for n in inventory}
|
||||
for node in nodes_in_query:
|
||||
parts.append(f"{node}: {'present' if node in existing else 'not present'}")
|
||||
return "Node presence: " + ", ".join(parts) + "."
|
||||
if not names:
|
||||
return "Matching nodes: none."
|
||||
shown = names[:30]
|
||||
suffix = f", … (+{len(names) - 30} more)" if len(names) > 30 else ""
|
||||
return "Matching nodes: " + ", ".join(shown) + suffix + "."
|
||||
|
||||
return ""
|
||||
|
||||
@ -727,25 +736,6 @@ def metrics_query_context(prompt: str, *, allow_tools: bool) -> tuple[str, str]:
|
||||
fallback = _metrics_fallback_summary(panel, summary)
|
||||
return context, fallback
|
||||
|
||||
def jetson_nodes_from_kb() -> list[str]:
    """Scan KB runbook bodies for the first line naming Jetson nodes.

    Returns the titan-XX names extracted from the first runbook line that
    mentions "jetson" and contains node names; [] when none is found.
    """
    runbook_docs = (doc for doc in KB.get("runbooks", []) if isinstance(doc, dict))
    for doc in runbook_docs:
        body = str(doc.get("body") or "")
        for line in body.splitlines():
            if "jetson" not in line.lower():
                continue
            names = _extract_titan_nodes(line)
            if names:
                return names
    return []
|
||||
|
||||
def jetson_nodes_summary(cluster_name: str) -> str:
    """One-line summary of the cluster's Jetson nodes, "" when KB has none."""
    names = jetson_nodes_from_kb()
    if not names:
        return ""
    return f"{cluster_name} has {len(names)} Jetson nodes: {', '.join(names)}."
|
||||
|
||||
def catalog_hints(query: str) -> tuple[str, list[tuple[str, str]]]:
|
||||
q = (query or "").strip()
|
||||
if not q or not KB.get("catalog"):
|
||||
@ -953,22 +943,16 @@ def _parse_metric_lines(summary: str) -> dict[str, str]:
|
||||
def _metrics_fallback_summary(panel: str, summary: str) -> str:
|
||||
parsed = _parse_metric_lines(summary)
|
||||
panel_l = (panel or "").lower()
|
||||
if panel_l.startswith("postgres connections"):
|
||||
used = parsed.get("conn=used")
|
||||
maxv = parsed.get("conn=max")
|
||||
if used and maxv:
|
||||
try:
|
||||
used_i = int(float(used))
|
||||
max_i = int(float(maxv))
|
||||
except ValueError:
|
||||
return f"Postgres connections: {summary}"
|
||||
free = max_i - used_i
|
||||
return f"Postgres connections: {used_i}/{max_i} used ({free} free)."
|
||||
if panel_l.startswith("postgres hottest"):
|
||||
if parsed:
|
||||
label, value = next(iter(parsed.items()))
|
||||
return f"Most Postgres connections: {label} = {value}."
|
||||
items = list(parsed.items())
|
||||
if len(items) == 1:
|
||||
label, value = items[0]
|
||||
return f"{panel}: {label} = {value}."
|
||||
compact = "; ".join(f"{k}={v}" for k, v in items)
|
||||
return f"{panel}: {compact}."
|
||||
if panel_l:
|
||||
return f"{panel}: {summary}"
|
||||
return summary
|
||||
|
||||
def _node_ready_status(node: dict) -> bool | None:
|
||||
conditions = node.get("status", {}).get("conditions") or []
|
||||
@ -1075,93 +1059,6 @@ def vm_cluster_snapshot() -> str:
|
||||
parts.append(pr)
|
||||
return "\n".join(parts).strip()
|
||||
|
||||
def nodes_summary(cluster_name: str) -> str:
    """One-line Ready/NotReady node-count summary for the cluster.

    Prefers counts from the cached Ariadne state; falls back to a live
    Kubernetes node list.  Returns "" when neither source yields usable
    data (API failure, empty node list, or malformed state).
    """
    state = _ariadne_state()
    if state:
        nodes = state.get("nodes") if isinstance(state.get("nodes"), dict) else {}
        total = nodes.get("total")
        ready = nodes.get("ready")
        not_ready = nodes.get("not_ready")
        if isinstance(total, int) and isinstance(ready, int):
            # Derive not_ready when the state omits it or has a bad type.
            not_ready = not_ready if isinstance(not_ready, int) else max(total - ready, 0)
            if not_ready:
                return f"{cluster_name} cluster has {total} nodes: {ready} Ready, {not_ready} NotReady."
            return f"{cluster_name} cluster has {total} nodes, all Ready."
    # Fallback: best-effort direct query against the Kubernetes API.
    try:
        data = k8s_get("/api/v1/nodes?limit=500")
    except Exception:
        return ""
    items = data.get("items") or []
    if not isinstance(items, list) or not items:
        return ""
    total = len(items)
    ready = 0
    # Count nodes whose Ready condition reports status "True".
    for node in items:
        conditions = node.get("status", {}).get("conditions") or []
        for cond in conditions if isinstance(conditions, list) else []:
            if cond.get("type") == "Ready":
                if cond.get("status") == "True":
                    ready += 1
                # One Ready condition per node; stop scanning either way.
                break
    not_ready = max(total - ready, 0)
    if not_ready:
        return f"{cluster_name} cluster has {total} nodes: {ready} Ready, {not_ready} NotReady."
    return f"{cluster_name} cluster has {total} nodes, all Ready."
|
||||
|
||||
def _format_node_names(cluster_name: str, names: list[str]) -> str:
    """Format a sorted node-name list, truncating after 30 entries; "" if empty."""
    if not names:
        return ""
    if len(names) <= 30:
        return f"{cluster_name} node names: {', '.join(names)}."
    shown = ", ".join(names[:30])
    return f"{cluster_name} node names: {shown}, … (+{len(names) - 30} more)."


def nodes_names_summary(cluster_name: str) -> str:
    """One-line listing of the cluster's node names.

    Prefers names from the cached Ariadne state; falls back to a live
    Kubernetes node list.  Lists up to 30 names and summarizes the rest.
    Returns "" when neither source yields any names.

    Fix vs. previous version: the ≤30/truncation formatting was duplicated
    in both data paths (extracted into _format_node_names), and a cached
    names list containing only falsy entries produced the malformed message
    "X node names: ." instead of falling through to the live API.
    """
    state = _ariadne_state()
    if state:
        nodes = state.get("nodes") if isinstance(state.get("nodes"), dict) else {}
        names = nodes.get("names")
        if isinstance(names, list) and names:
            cleaned = sorted({str(n) for n in names if n})
            formatted = _format_node_names(cluster_name, cleaned)
            if formatted:
                return formatted
    # Fallback: best-effort direct query against the Kubernetes API.
    try:
        data = k8s_get("/api/v1/nodes?limit=500")
    except Exception:
        return ""
    items = data.get("items") or []
    if not isinstance(items, list) or not items:
        return ""
    collected = []
    for node in items:
        name = (node.get("metadata") or {}).get("name") or ""
        if name:
            collected.append(name)
    return _format_node_names(cluster_name, sorted(set(collected)))
|
||||
|
||||
|
||||
def nodes_arch_summary(cluster_name: str, arch: str) -> str:
    """Count cluster nodes labeled with the given CPU architecture.

    *arch* aliases (aarch64 -> arm64, x86_64/x86-64 -> amd64) are
    normalized before comparing against the kubernetes.io/arch label.
    Returns "" when the node list cannot be fetched or is empty.
    """
    try:
        data = k8s_get("/api/v1/nodes?limit=500")
    except Exception:
        return ""
    items = data.get("items") or []
    if not isinstance(items, list) or not items:
        return ""
    # Canonicalize common architecture spellings; unknown values pass through.
    aliases = {
        "aarch64": "arm64",
        "arm64": "arm64",
        "x86_64": "amd64",
        "x86-64": "amd64",
        "amd64": "amd64",
    }
    normalized = (arch or "").strip().lower()
    arch_label = aliases.get(normalized, normalized)
    total = sum(
        1
        for node in items
        if (((node.get("metadata") or {}).get("labels") or {}).get("kubernetes.io/arch") == arch_label)
    )
    return f"{cluster_name} cluster has {total} {arch_label} nodes."
|
||||
|
||||
def _strip_code_fence(text: str) -> str:
|
||||
cleaned = (text or "").strip()
|
||||
match = CODE_FENCE_RE.match(cleaned)
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user