atlasbot: replace targeted handlers with generic planner

Brad Stein 2026-01-26 22:38:18 -03:00
parent 6c413d4a50
commit 37a203509b
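In rough terms, the new planner classifies a normalized question along independent axes (operation, metric, entity, hardware filters) using the hint tables added below, then routes to either a VictoriaMetrics query or an inventory filter. A minimal, self-contained sketch of the classification step, using abbreviated copies of the hint tables (not the full module):

# Sketch only: abbreviated hint tables and the substring matching used by the
# _has_any/_detect_* helpers introduced in this commit.
OPERATION_HINTS = {
    "count": ("how many", "count", "number", "total"),
    "top": ("top", "hottest", "highest", "most"),
}
METRIC_HINTS = {
    "cpu": ("cpu",),
    "ram": ("ram", "memory", "mem"),
}

def _has_any(text: str, phrases: tuple[str, ...]) -> bool:
    return any(p in text for p in phrases)

def detect(q: str) -> tuple[str | None, str | None]:
    # First hint table entry whose phrase list matches the question wins.
    op = next((o for o, ps in OPERATION_HINTS.items() if _has_any(q, ps)), None)
    metric = next((m for m, ps in METRIC_HINTS.items() if _has_any(q, ps)), None)
    return op, metric

print(detect("which node has the hottest cpu"))    # ('top', 'cpu')
print(detect("how many nodes are low on memory"))  # ('count', 'ram')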


@@ -95,11 +95,29 @@ CODE_FENCE_RE = re.compile(r"^```(?:json)?\s*(.*?)\s*```$", re.DOTALL)
TITAN_NODE_RE = re.compile(r"\btitan-[0-9a-z]{2}\b", re.IGNORECASE)
TITAN_RANGE_RE = re.compile(r"\btitan-([0-9a-z]{2})/([0-9a-z]{2})\b", re.IGNORECASE)
_DASH_CHARS = "\u2010\u2011\u2012\u2013\u2014\u2015\u2212\uFE63\uFF0D"
-HOTTEST_QUERIES = {
-    "cpu": "label_replace(topk(1, avg by (node) (((1 - avg by (instance) (rate(node_cpu_seconds_total{mode=\"idle\"}[5m]))) * 100) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\"))), \"__name__\", \"$1\", \"node\", \"(.*)\")",
-    "ram": "label_replace(topk(1, avg by (node) ((avg by (instance) ((node_memory_MemTotal_bytes - node_memory_MemAvailable_bytes) / node_memory_MemTotal_bytes * 100)) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\"))), \"__name__\", \"$1\", \"node\", \"(.*)\")",
-    "net": "label_replace(topk(1, avg by (node) ((sum by (instance) (rate(node_network_receive_bytes_total{device!~\"lo\"}[5m]) + rate(node_network_transmit_bytes_total{device!~\"lo\"}[5m]))) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\"))), \"__name__\", \"$1\", \"node\", \"(.*)\")",
-    "io": "label_replace(topk(1, avg by (node) ((sum by (instance) (rate(node_disk_read_bytes_total[5m]) + rate(node_disk_written_bytes_total[5m]))) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\"))), \"__name__\", \"$1\", \"node\", \"(.*)\")",
-}
+OPERATION_HINTS = {
+    "count": ("how many", "count", "number", "total"),
+    "list": ("list", "which", "what are", "show", "names"),
+    "top": ("top", "hottest", "highest", "most", "largest", "max", "maximum"),
+    "status": ("ready", "not ready", "unready", "down", "missing", "status"),
+}
+METRIC_HINTS = {
+    "cpu": ("cpu",),
+    "ram": ("ram", "memory", "mem"),
+    "net": ("net", "network", "bandwidth", "throughput"),
+    "io": ("io", "disk", "storage"),
+    "connections": ("connections", "conn", "postgres", "database", "db"),
+}
+HARDWARE_HINTS = {
+    "amd64": ("amd64", "x86", "x86_64", "x86-64"),
+    "jetson": ("jetson",),
+    "rpi4": ("rpi4",),
+    "rpi5": ("rpi5",),
+    "rpi": ("rpi", "raspberry"),
+    "arm64": ("arm64", "aarch64"),
+}
def normalize_query(text: str) -> str:
@@ -312,63 +330,127 @@ def _humanize_rate(value: str, *, unit: str) -> str:
return f"{val / 1024:.2f} KB/s" return f"{val / 1024:.2f} KB/s"
return f"{val:.2f} B/s" return f"{val:.2f} B/s"
-def _hottest_query(metric: str, node_regex: str | None) -> str:
-    expr = HOTTEST_QUERIES[metric]
-    if node_regex:
-        needle = 'node_uname_info{nodename!=""}'
-        replacement = f'node_uname_info{{nodename!=\"\",nodename=~\"{node_regex}\"}}'
-        return expr.replace(needle, replacement)
-    return expr
-
-def _vm_hottest(metric: str, node_regex: str | None) -> tuple[str, str] | None:
-    expr = _hottest_query(metric, node_regex)
-    res = vm_query(expr)
-    series = _vm_value_series(res)
-    if not series:
-        return None
-    first = series[0]
-    labels = first.get("metric") or {}
-    value = first.get("value") or []
-    val = value[1] if isinstance(value, list) and len(value) > 1 else ""
-    node = labels.get("node") or labels.get("__name__") or ""
-    if not node:
-        return None
-    return (str(node), str(val))
-
-def _hottest_answer(q: str, *, nodes: list[str] | None) -> str:
-    metric = None
-    assumed_cpu = False
-    if "cpu" in q:
-        metric = "cpu"
-    elif "ram" in q or "memory" in q:
-        metric = "ram"
-    elif "net" in q or "network" in q:
-        metric = "net"
-    elif "io" in q or "disk" in q or "storage" in q:
-        metric = "io"
-    if metric is None:
-        metric = "cpu"
-        assumed_cpu = True
-    if nodes is not None and not nodes:
-        return "No nodes match the requested hardware class."
-    node_regex = "|".join(nodes) if nodes else None
-    metrics = [metric]
-    lines: list[str] = []
-    for m in metrics:
-        picked = _vm_hottest(m, node_regex)
-        if not picked:
-            continue
-        node, val = picked
-        unit = "%" if m in ("cpu", "ram") else "B/s"
-        val_str = _humanize_rate(val, unit=unit)
-        label = {"cpu": "CPU", "ram": "RAM", "net": "NET", "io": "I/O"}[m]
-        lines.append(f"{label}: {node} ({val_str})")
-    if not lines:
-        return ""
-    label = metric.upper()
-    suffix = " (defaulting to CPU)" if assumed_cpu else ""
-    return f"Hottest node by {label}: {lines[0].split(': ', 1)[1]}.{suffix}"
+def _has_any(text: str, phrases: tuple[str, ...]) -> bool:
+    return any(p in text for p in phrases)
+
+def _detect_operation(q: str) -> str | None:
+    for op, phrases in OPERATION_HINTS.items():
+        if _has_any(q, phrases):
+            return op
+    return None
+
+def _detect_metric(q: str) -> str | None:
+    for metric, phrases in METRIC_HINTS.items():
+        if _has_any(q, phrases):
+            return metric
+    return None
+
+def _detect_hardware_filters(q: str) -> tuple[set[str], set[str]]:
+    include: set[str] = set()
+    exclude: set[str] = set()
+    for hardware, phrases in HARDWARE_HINTS.items():
+        for phrase in phrases:
+            if f"non {phrase}" in q or f"non-{phrase}" in q or f"not {phrase}" in q:
+                exclude.add(hardware)
+            elif phrase in q:
+                include.add(hardware)
+    return include, exclude
+
+def _detect_entity(q: str) -> str | None:
+    if "node" in q or "nodes" in q or "worker" in q or TITAN_NODE_RE.search(q):
+        return "node"
+    if "pod" in q or "pods" in q:
+        return "pod"
+    if "namespace" in q or "namespaces" in q:
+        return "namespace"
+    return None
+
+def _metric_entry_score(entry: dict[str, Any], tokens: list[str], *, metric: str | None, op: str | None) -> int:
+    hay = _metric_tokens(entry)
+    score = 0
+    for t in set(tokens):
+        if t in hay:
+            score += 2 if t in (entry.get("panel_title") or "").lower() else 1
+    if metric:
+        for phrase in METRIC_HINTS.get(metric, (metric,)):
+            if phrase in hay:
+                score += 3
+    if op == "top" and ("hottest" in hay or "top" in hay):
+        score += 3
+    if "node" in hay:
+        score += 1
+    return score
+
+def _select_metric_entry(tokens: list[str], *, metric: str | None, op: str | None) -> dict[str, Any] | None:
+    scored: list[tuple[int, dict[str, Any]]] = []
+    for entry in _METRIC_INDEX:
+        if not isinstance(entry, dict):
+            continue
+        score = _metric_entry_score(entry, tokens, metric=metric, op=op)
+        if score:
+            scored.append((score, entry))
+    if not scored:
+        return None
+    scored.sort(key=lambda item: item[0], reverse=True)
+    return scored[0][1]
+
+def _apply_node_filter(expr: str, node_regex: str | None) -> str:
+    if not node_regex:
+        return expr
+    needle = 'node_uname_info{nodename!=""}'
+    replacement = f'node_uname_info{{nodename!=\"\",nodename=~\"{node_regex}\"}}'
+    return expr.replace(needle, replacement)
+
+def _format_metric_answer(entry: dict[str, Any], res: dict | None) -> str:
+    series = _vm_value_series(res)
+    panel = entry.get("panel_title") or "Metric"
+    if not series:
+        return ""
+    rendered = vm_render_result(res, limit=5)
+    if not rendered:
+        return ""
+    lines = [line.lstrip("-").strip() for line in rendered.splitlines() if line.strip().startswith("-")]
+    if len(lines) == 1:
+        return f"{panel}: {lines[0]}."
+    return f"{panel}:\n" + "\n".join(f"- {line}" for line in lines)
+
+def _inventory_filter(
+    inventory: list[dict[str, Any]],
+    *,
+    include_hw: set[str],
+    exclude_hw: set[str],
+    only_workers: bool,
+    only_ready: bool | None,
+    nodes_in_query: list[str],
+) -> list[dict[str, Any]]:
+    results = inventory
+    if nodes_in_query:
+        results = [node for node in results if node.get("name") in nodes_in_query]
+    if only_workers:
+        results = [node for node in results if node.get("is_worker") is True]
+    if only_ready is True:
+        results = [node for node in results if node.get("ready") is True]
+    if only_ready is False:
+        results = [node for node in results if node.get("ready") is False]
+    if include_hw:
+        results = [node for node in results if _hardware_match(node, include_hw)]
+    if exclude_hw:
+        results = [node for node in results if not _hardware_match(node, exclude_hw)]
+    return results
+
+def _hardware_match(node: dict[str, Any], filters: set[str]) -> bool:
+    hw = node.get("hardware") or ""
+    arch = node.get("arch") or ""
+    for f in filters:
+        if f == "rpi" and hw in ("rpi4", "rpi5"):
+            return True
+        if f == "arm64" and arch == "arm64":
+            return True
+        if hw == f:
+            return True
+        if f == "amd64" and arch == "amd64":
+            return True
+    return False
def _node_roles(labels: dict[str, Any]) -> list[str]:
    roles: list[str] = []
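A small worked example of the node scoping added in this hunk: _apply_node_filter rewrites the node_uname_info{nodename!=""} selector so a panel expression only matches a scoped set of nodes. The expression and node names below are invented for illustration:

# Illustration of the node scoping used above: the selector gains a nodename
# regex built from the scoped inventory names.
expr = 'avg by (node) (up * on(instance) group_left(node) node_uname_info{nodename!=""})'
node_regex = "titan-0a|titan-0b"  # "|".join of the scoped node names (invented)
needle = 'node_uname_info{nodename!=""}'
replacement = f'node_uname_info{{nodename!="",nodename=~"{node_regex}"}}'
print(expr.replace(needle, replacement))
# -> avg by (node) (up * on(instance) group_left(node) node_uname_info{nodename!="",nodename=~"titan-0a|titan-0b"})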
@@ -495,176 +577,103 @@ def _inventory_sets(inventory: list[dict[str, Any]]) -> dict[str, Any]:
def structured_answer(prompt: str, *, inventory: list[dict[str, Any]], metrics_summary: str) -> str:
    q = normalize_query(prompt)
-    if metrics_summary and any(word in q for word in ("postgres", "connection", "connections", "db")):
-        return metrics_summary
-    if not inventory:
-        return ""
-    sets = _inventory_sets(inventory)
-    names = sets["names"]
-    ready = sets["ready"]
-    not_ready = sets["not_ready"]
-    groups = sets["groups"]
-    worker_names = sets["worker_names"]
-    worker_ready = sets["worker_ready"]
-    worker_not_ready = sets["worker_not_ready"]
-    expected_workers = sets["expected_workers"]
-    expected_ready = sets["expected_ready"]
-    expected_not_ready = sets["expected_not_ready"]
-    expected_missing = sets["expected_missing"]
-    total = len(names)
+    if not q:
+        return ""
+    tokens = _tokens(q)
+    op = _detect_operation(q)
+    metric = _detect_metric(q)
+    entity = _detect_entity(q)
+    include_hw, exclude_hw = _detect_hardware_filters(q)
    nodes_in_query = _extract_titan_nodes(q)
-    rpi_nodes = set(groups.get("rpi4", [])) | set(groups.get("rpi5", []))
-    non_rpi = set(groups.get("jetson", [])) | set(groups.get("amd64", []))
-    unknown_hw = set(groups.get("arm64-unknown", [])) | set(groups.get("unknown", []))
-    if "hottest" in q or "hot" in q:
-        filter_nodes: list[str] | None = None
-        if "amd64" in q or "x86" in q:
-            filter_nodes = sorted(groups.get("amd64", []))
-        elif "jetson" in q:
-            filter_nodes = sorted(groups.get("jetson", []))
-        elif "raspberry" in q or "rpi" in q:
-            filter_nodes = sorted(rpi_nodes)
-        elif "arm64" in q:
-            filter_nodes = sorted([n for n in names if n not in groups.get("amd64", [])])
-        hottest = _hottest_answer(q, nodes=filter_nodes)
-        if hottest:
-            return hottest
-        return "Unable to determine hottest nodes right now (metrics unavailable)."
-    if nodes_in_query and ("raspberry" in q or "rpi" in q):
-        parts: list[str] = []
-        for node in nodes_in_query:
-            if node in rpi_nodes:
-                parts.append(f"{node} is a Raspberry Pi node.")
-            elif node in non_rpi:
-                parts.append(f"{node} is not a Raspberry Pi node.")
-            elif node in names:
-                parts.append(f"{node} is in Atlas but hardware is unknown.")
-            else:
-                parts.append(f"{node} is not in the Atlas cluster.")
-        return " ".join(parts)
-    if nodes_in_query and "jetson" in q:
-        jets = set(groups.get("jetson", []))
-        parts = []
-        for node in nodes_in_query:
-            if node in jets:
-                parts.append(f"{node} is a Jetson node.")
-            elif node in names:
-                parts.append(f"{node} is not a Jetson node.")
-            else:
-                parts.append(f"{node} is not in the Atlas cluster.")
-        return " ".join(parts)
-    if nodes_in_query and ("is" in q or "part of" in q or "in atlas" in q or "in cluster" in q or "present" in q or "exist" in q):
-        parts: list[str] = []
-        for node in nodes_in_query:
-            if node in names:
-                parts.append(f"Yes. {node} is in the Atlas cluster.")
-            else:
-                parts.append(f"No. {node} is not in the Atlas cluster.")
-        return " ".join(parts)
-    if any(term in q for term in ("non-raspberry", "non raspberry", "not raspberry", "non-rpi", "non rpi")):
-        non_rpi_sorted = sorted(non_rpi)
-        if any(word in q for word in ("how many", "count", "number")):
-            return f"Atlas has {len(non_rpi_sorted)} non-Raspberry Pi nodes."
-        if any(phrase in q for phrase in ("besides jetson", "excluding jetson", "without jetson", "non jetson")):
-            amd = sorted(groups.get("amd64", []))
-            return f"Non-Raspberry Pi nodes (excluding Jetson): {', '.join(amd)}." if amd else "No non-Raspberry Pi nodes outside Jetson."
-        return f"Non-Raspberry Pi nodes: {', '.join(non_rpi_sorted)}." if non_rpi_sorted else "No non-Raspberry Pi nodes found."
-    if "jetson" in q:
-        jets = groups.get("jetson", [])
-        if any(word in q for word in ("how many", "count", "number")):
-            return f"Atlas has {len(jets)} Jetson nodes."
-        return f"Jetson nodes: {', '.join(jets)}." if jets else "No Jetson nodes found."
-    if "amd64" in q or "x86" in q:
-        amd = groups.get("amd64", [])
-        if any(word in q for word in ("how many", "count", "number")):
-            return f"Atlas has {len(amd)} amd64 nodes."
-        return f"amd64 nodes: {', '.join(amd)}." if amd else "No amd64 nodes found."
-    if "arm64" in q and "node" in q and any(word in q for word in ("how many", "count", "number")):
-        count = sum(1 for node in inventory if node.get("arch") == "arm64")
-        return f"Atlas has {count} arm64 nodes."
-    if "rpi4" in q:
-        rpi4 = groups.get("rpi4", [])
-        if any(word in q for word in ("how many", "count", "number")):
-            return f"Atlas has {len(rpi4)} rpi4 nodes."
-        return f"rpi4 nodes: {', '.join(rpi4)}." if rpi4 else "No rpi4 nodes found."
-    if "rpi5" in q:
-        rpi5 = groups.get("rpi5", [])
-        if any(word in q for word in ("how many", "count", "number")):
-            return f"Atlas has {len(rpi5)} rpi5 nodes."
-        return f"rpi5 nodes: {', '.join(rpi5)}." if rpi5 else "No rpi5 nodes found."
-    if "raspberry" in q or "rpi" in q:
-        rpi = sorted(rpi_nodes)
-        if any(word in q for word in ("how many", "count", "number")):
-            return f"Atlas has {len(rpi)} Raspberry Pi nodes."
-        return f"Raspberry Pi nodes: {', '.join(rpi)}." if rpi else "No Raspberry Pi nodes found."
-    if "arm64-unknown" in q or "unknown" in q or "no hardware" in q:
-        unknown = sorted(unknown_hw)
-        return f"Unknown hardware nodes: {', '.join(unknown)}." if unknown else "No unknown hardware labels."
-    if ("notready" in q or "not ready" in q or "unready" in q) and ("node" in q or "nodes" in q):
-        return "Not ready nodes: " + (", ".join(not_ready) if not_ready else "none") + "."
-    if "worker" in q and ("node" in q or "nodes" in q or "workers" in q):
-        not_ready_query = "not ready" in q or "unready" in q or "down" in q or ("not" in q and "ready" in q)
-        if expected_workers:
-            if "missing" in q:
-                return "Missing worker nodes: " + (", ".join(expected_missing) if expected_missing else "none") + "."
-            if "ready" in q and ("not ready" in q or "vs" in q or "versus" in q):
-                return (
-                    f"Expected workers: {len(expected_ready)} ready, "
-                    f"{len(expected_not_ready)} not ready (expected {len(expected_workers)})."
-                )
-            if any(word in q for word in ("how many", "count", "number")) and ("expect" in q or "expected" in q or "should" in q):
-                msg = f"Grafana inventory expects {len(expected_workers)} worker nodes."
-                if expected_missing:
-                    msg += f" Missing: {', '.join(expected_missing)}."
-                return msg
-            if not_ready_query:
-                if expected_not_ready or expected_missing:
-                    detail = []
-                    if expected_not_ready:
-                        detail.append(f"Not ready: {', '.join(expected_not_ready)}")
-                    if expected_missing:
-                        detail.append(f"Missing: {', '.join(expected_missing)}")
-                    return "Worker nodes needing attention. " + " ".join(detail) + "."
-                return "All expected worker nodes are Ready."
-            if any(word in q for word in ("expected", "expect", "should")):
-                msg = f"Grafana inventory expects {len(expected_workers)} worker nodes."
-                if expected_missing:
-                    msg += f" Missing: {', '.join(expected_missing)}."
-                return msg
-            if any(word in q for word in ("how many", "count", "number")):
-                return f"Worker nodes: {len(expected_ready)} ready, {len(expected_not_ready)} not ready (expected {len(expected_workers)})."
-            if "ready" in q:
-                return f"Ready worker nodes ({len(expected_ready)}): {', '.join(expected_ready)}."
-        if not_ready_query:
-            return "Worker nodes not ready: " + (", ".join(worker_not_ready) if worker_not_ready else "none") + "."
-        if any(word in q for word in ("how many", "count", "number")):
-            return f"Worker nodes: {len(worker_ready)} ready, {len(worker_not_ready)} not ready."
-        return "Ready worker nodes ({}): {}.".format(len(worker_ready), ", ".join(worker_ready))
-    if any(word in q for word in ("how many", "count", "number")) and "node" in q:
-        return f"Atlas has {total} nodes; {len(ready)} ready, {len(not_ready)} not ready."
-    if "node names" in q or ("nodes" in q and "named" in q) or "naming" in q:
-        return "Atlas node names: " + ", ".join(names) + "."
-    if "ready" in q and "node" in q:
-        return f"Ready nodes ({len(ready)}): {', '.join(ready)}."
+    only_workers = "worker" in q or "workers" in q
+    only_ready: bool | None = None
+    if "not ready" in q or "unready" in q or "down" in q or "missing" in q:
+        only_ready = False
+    elif "ready" in q:
+        only_ready = True
+    if entity == "node" and only_ready is not None and op != "count":
+        op = "status"
+    if not op and entity == "node":
+        op = "list" if (include_hw or exclude_hw or nodes_in_query) else "count"
+    if op == "top" and metric is None:
+        metric = "cpu"
+    # Metrics-first when a metric or top operation is requested.
+    if metric or op == "top":
+        entry = _select_metric_entry(tokens, metric=metric, op=op)
+        if entry and isinstance(entry.get("exprs"), list) and entry["exprs"]:
+            expr = entry["exprs"][0]
+            if inventory:
+                scoped = _inventory_filter(
+                    inventory,
+                    include_hw=include_hw,
+                    exclude_hw=exclude_hw,
+                    only_workers=only_workers,
+                    only_ready=None,
+                    nodes_in_query=nodes_in_query,
+                )
+                if scoped:
+                    node_regex = "|".join([n["name"] for n in scoped])
+                    expr = _apply_node_filter(expr, node_regex)
+            res = vm_query(expr, timeout=20)
+            answer = _format_metric_answer(entry, res)
+            if answer:
+                return answer
+        if metrics_summary:
+            return metrics_summary
+    if entity != "node" or not inventory:
+        if any(word in q for word in METRIC_HINT_WORDS) and not metrics_summary:
+            return "I don't have data to answer that right now."
+        return ""
+    expected_workers = expected_worker_nodes_from_metrics()
+    filtered = _inventory_filter(
+        inventory,
+        include_hw=include_hw,
+        exclude_hw=exclude_hw,
+        only_workers=only_workers,
+        only_ready=only_ready if op in ("status", "count") else None,
+        nodes_in_query=nodes_in_query,
+    )
+    names = [node["name"] for node in filtered]
+    if op == "status":
+        if "missing" in q and expected_workers:
+            missing = sorted(set(expected_workers) - {n["name"] for n in inventory})
+            return "Missing nodes: " + (", ".join(missing) if missing else "none") + "."
+        if only_ready is False:
+            return "Not ready nodes: " + (", ".join(names) if names else "none") + "."
+        if only_ready is True:
+            return f"Ready nodes ({len(names)}): " + (", ".join(names) if names else "none") + "."
+    if op == "count":
+        if expected_workers and ("expected" in q or "should" in q):
+            missing = sorted(set(expected_workers) - {n["name"] for n in inventory})
+            msg = f"Grafana inventory expects {len(expected_workers)} worker nodes."
+            if missing:
+                msg += f" Missing: {', '.join(missing)}."
+            return msg
+        if not (include_hw or exclude_hw or nodes_in_query or only_workers):
+            return f"Atlas has {len(names)} nodes."
+        return f"Matching nodes: {len(names)}."
+    if op == "list":
+        if nodes_in_query:
+            parts = []
+            existing = {n["name"] for n in inventory}
+            for node in nodes_in_query:
+                parts.append(f"{node}: {'present' if node in existing else 'not present'}")
+            return "Node presence: " + ", ".join(parts) + "."
+        if not names:
+            return "Matching nodes: none."
+        shown = names[:30]
+        suffix = f", … (+{len(names) - 30} more)" if len(names) > 30 else ""
+        return "Matching nodes: " + ", ".join(shown) + suffix + "."
return "" return ""
@@ -727,25 +736,6 @@ def metrics_query_context(prompt: str, *, allow_tools: bool) -> tuple[str, str]:
    fallback = _metrics_fallback_summary(panel, summary)
    return context, fallback
-def jetson_nodes_from_kb() -> list[str]:
-    for doc in KB.get("runbooks", []):
-        if not isinstance(doc, dict):
-            continue
-        body = str(doc.get("body") or "")
-        for line in body.splitlines():
-            if "jetson" not in line.lower():
-                continue
-            names = _extract_titan_nodes(line)
-            if names:
-                return names
-    return []
-
-def jetson_nodes_summary(cluster_name: str) -> str:
-    names = jetson_nodes_from_kb()
-    if names:
-        return f"{cluster_name} has {len(names)} Jetson nodes: {', '.join(names)}."
-    return ""
def catalog_hints(query: str) -> tuple[str, list[tuple[str, str]]]:
    q = (query or "").strip()
    if not q or not KB.get("catalog"):
@@ -953,22 +943,16 @@ def _parse_metric_lines(summary: str) -> dict[str, str]:
def _metrics_fallback_summary(panel: str, summary: str) -> str:
    parsed = _parse_metric_lines(summary)
    panel_l = (panel or "").lower()
-    if panel_l.startswith("postgres connections"):
-        used = parsed.get("conn=used")
-        maxv = parsed.get("conn=max")
-        if used and maxv:
-            try:
-                used_i = int(float(used))
-                max_i = int(float(maxv))
-            except ValueError:
-                return f"Postgres connections: {summary}"
-            free = max_i - used_i
-            return f"Postgres connections: {used_i}/{max_i} used ({free} free)."
-    if panel_l.startswith("postgres hottest"):
-        if parsed:
-            label, value = next(iter(parsed.items()))
-            return f"Most Postgres connections: {label} = {value}."
-    return f"{panel}: {summary}"
+    if parsed:
+        items = list(parsed.items())
+        if len(items) == 1:
+            label, value = items[0]
+            return f"{panel}: {label} = {value}."
+        compact = "; ".join(f"{k}={v}" for k, v in items)
+        return f"{panel}: {compact}."
+    if panel_l:
+        return f"{panel}: {summary}"
+    return summary
def _node_ready_status(node: dict) -> bool | None:
    conditions = node.get("status", {}).get("conditions") or []
@@ -1075,93 +1059,6 @@ def vm_cluster_snapshot() -> str:
        parts.append(pr)
    return "\n".join(parts).strip()
-def nodes_summary(cluster_name: str) -> str:
-    state = _ariadne_state()
-    if state:
-        nodes = state.get("nodes") if isinstance(state.get("nodes"), dict) else {}
-        total = nodes.get("total")
-        ready = nodes.get("ready")
-        not_ready = nodes.get("not_ready")
-        if isinstance(total, int) and isinstance(ready, int):
-            not_ready = not_ready if isinstance(not_ready, int) else max(total - ready, 0)
-            if not_ready:
-                return f"{cluster_name} cluster has {total} nodes: {ready} Ready, {not_ready} NotReady."
-            return f"{cluster_name} cluster has {total} nodes, all Ready."
-    try:
-        data = k8s_get("/api/v1/nodes?limit=500")
-    except Exception:
-        return ""
-    items = data.get("items") or []
-    if not isinstance(items, list) or not items:
-        return ""
-    total = len(items)
-    ready = 0
-    for node in items:
-        conditions = node.get("status", {}).get("conditions") or []
-        for cond in conditions if isinstance(conditions, list) else []:
-            if cond.get("type") == "Ready":
-                if cond.get("status") == "True":
-                    ready += 1
-                break
-    not_ready = max(total - ready, 0)
-    if not_ready:
-        return f"{cluster_name} cluster has {total} nodes: {ready} Ready, {not_ready} NotReady."
-    return f"{cluster_name} cluster has {total} nodes, all Ready."
-
-def nodes_names_summary(cluster_name: str) -> str:
-    state = _ariadne_state()
-    if state:
-        nodes = state.get("nodes") if isinstance(state.get("nodes"), dict) else {}
-        names = nodes.get("names")
-        if isinstance(names, list) and names:
-            cleaned = sorted({str(n) for n in names if n})
-            if len(cleaned) <= 30:
-                return f"{cluster_name} node names: {', '.join(cleaned)}."
-            shown = ", ".join(cleaned[:30])
-            return f"{cluster_name} node names: {shown}, … (+{len(cleaned) - 30} more)."
-    try:
-        data = k8s_get("/api/v1/nodes?limit=500")
-    except Exception:
-        return ""
-    items = data.get("items") or []
-    if not isinstance(items, list) or not items:
-        return ""
-    names = []
-    for node in items:
-        name = (node.get("metadata") or {}).get("name") or ""
-        if name:
-            names.append(name)
-    names = sorted(set(names))
-    if not names:
-        return ""
-    if len(names) <= 30:
-        return f"{cluster_name} node names: {', '.join(names)}."
-    shown = ", ".join(names[:30])
-    return f"{cluster_name} node names: {shown}, … (+{len(names) - 30} more)."
-
-def nodes_arch_summary(cluster_name: str, arch: str) -> str:
-    try:
-        data = k8s_get("/api/v1/nodes?limit=500")
-    except Exception:
-        return ""
-    items = data.get("items") or []
-    if not isinstance(items, list) or not items:
-        return ""
-    normalized = (arch or "").strip().lower()
-    if normalized in ("aarch64", "arm64"):
-        arch_label = "arm64"
-    elif normalized in ("x86_64", "x86-64", "amd64"):
-        arch_label = "amd64"
-    else:
-        arch_label = normalized
-    total = 0
-    for node in items:
-        labels = (node.get("metadata") or {}).get("labels") or {}
-        if labels.get("kubernetes.io/arch") == arch_label:
-            total += 1
-    return f"{cluster_name} cluster has {total} {arch_label} nodes."
def _strip_code_fence(text: str) -> str:
    cleaned = (text or "").strip()
    match = CODE_FENCE_RE.match(cleaned)