atlasbot: replace targeted handlers with generic planner
This commit is contained in:
parent
6c413d4a50
commit
37a203509b
@ -95,11 +95,29 @@ CODE_FENCE_RE = re.compile(r"^```(?:json)?\s*(.*?)\s*```$", re.DOTALL)
|
|||||||
TITAN_NODE_RE = re.compile(r"\btitan-[0-9a-z]{2}\b", re.IGNORECASE)
|
TITAN_NODE_RE = re.compile(r"\btitan-[0-9a-z]{2}\b", re.IGNORECASE)
|
||||||
TITAN_RANGE_RE = re.compile(r"\btitan-([0-9a-z]{2})/([0-9a-z]{2})\b", re.IGNORECASE)
|
TITAN_RANGE_RE = re.compile(r"\btitan-([0-9a-z]{2})/([0-9a-z]{2})\b", re.IGNORECASE)
|
||||||
_DASH_CHARS = "\u2010\u2011\u2012\u2013\u2014\u2015\u2212\uFE63\uFF0D"
|
_DASH_CHARS = "\u2010\u2011\u2012\u2013\u2014\u2015\u2212\uFE63\uFF0D"
|
||||||
HOTTEST_QUERIES = {
|
|
||||||
"cpu": "label_replace(topk(1, avg by (node) (((1 - avg by (instance) (rate(node_cpu_seconds_total{mode=\"idle\"}[5m]))) * 100) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\"))), \"__name__\", \"$1\", \"node\", \"(.*)\")",
|
OPERATION_HINTS = {
|
||||||
"ram": "label_replace(topk(1, avg by (node) ((avg by (instance) ((node_memory_MemTotal_bytes - node_memory_MemAvailable_bytes) / node_memory_MemTotal_bytes * 100)) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\"))), \"__name__\", \"$1\", \"node\", \"(.*)\")",
|
"count": ("how many", "count", "number", "total"),
|
||||||
"net": "label_replace(topk(1, avg by (node) ((sum by (instance) (rate(node_network_receive_bytes_total{device!~\"lo\"}[5m]) + rate(node_network_transmit_bytes_total{device!~\"lo\"}[5m]))) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\"))), \"__name__\", \"$1\", \"node\", \"(.*)\")",
|
"list": ("list", "which", "what are", "show", "names"),
|
||||||
"io": "label_replace(topk(1, avg by (node) ((sum by (instance) (rate(node_disk_read_bytes_total[5m]) + rate(node_disk_written_bytes_total[5m]))) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\"))), \"__name__\", \"$1\", \"node\", \"(.*)\")",
|
"top": ("top", "hottest", "highest", "most", "largest", "max", "maximum"),
|
||||||
|
"status": ("ready", "not ready", "unready", "down", "missing", "status"),
|
||||||
|
}
|
||||||
|
|
||||||
|
METRIC_HINTS = {
|
||||||
|
"cpu": ("cpu",),
|
||||||
|
"ram": ("ram", "memory", "mem"),
|
||||||
|
"net": ("net", "network", "bandwidth", "throughput"),
|
||||||
|
"io": ("io", "disk", "storage"),
|
||||||
|
"connections": ("connections", "conn", "postgres", "database", "db"),
|
||||||
|
}
|
||||||
|
|
||||||
|
HARDWARE_HINTS = {
|
||||||
|
"amd64": ("amd64", "x86", "x86_64", "x86-64"),
|
||||||
|
"jetson": ("jetson",),
|
||||||
|
"rpi4": ("rpi4",),
|
||||||
|
"rpi5": ("rpi5",),
|
||||||
|
"rpi": ("rpi", "raspberry"),
|
||||||
|
"arm64": ("arm64", "aarch64"),
|
||||||
}
|
}
|
||||||
|
|
||||||
def normalize_query(text: str) -> str:
|
def normalize_query(text: str) -> str:
|
||||||
@ -312,63 +330,127 @@ def _humanize_rate(value: str, *, unit: str) -> str:
|
|||||||
return f"{val / 1024:.2f} KB/s"
|
return f"{val / 1024:.2f} KB/s"
|
||||||
return f"{val:.2f} B/s"
|
return f"{val:.2f} B/s"
|
||||||
|
|
||||||
def _hottest_query(metric: str, node_regex: str | None) -> str:
|
def _has_any(text: str, phrases: tuple[str, ...]) -> bool:
|
||||||
expr = HOTTEST_QUERIES[metric]
|
return any(p in text for p in phrases)
|
||||||
if node_regex:
|
|
||||||
|
def _detect_operation(q: str) -> str | None:
|
||||||
|
for op, phrases in OPERATION_HINTS.items():
|
||||||
|
if _has_any(q, phrases):
|
||||||
|
return op
|
||||||
|
return None
|
||||||
|
|
||||||
|
def _detect_metric(q: str) -> str | None:
|
||||||
|
for metric, phrases in METRIC_HINTS.items():
|
||||||
|
if _has_any(q, phrases):
|
||||||
|
return metric
|
||||||
|
return None
|
||||||
|
|
||||||
|
def _detect_hardware_filters(q: str) -> tuple[set[str], set[str]]:
|
||||||
|
include: set[str] = set()
|
||||||
|
exclude: set[str] = set()
|
||||||
|
for hardware, phrases in HARDWARE_HINTS.items():
|
||||||
|
for phrase in phrases:
|
||||||
|
if f"non {phrase}" in q or f"non-{phrase}" in q or f"not {phrase}" in q:
|
||||||
|
exclude.add(hardware)
|
||||||
|
elif phrase in q:
|
||||||
|
include.add(hardware)
|
||||||
|
return include, exclude
|
||||||
|
|
||||||
|
def _detect_entity(q: str) -> str | None:
|
||||||
|
if "node" in q or "nodes" in q or "worker" in q or TITAN_NODE_RE.search(q):
|
||||||
|
return "node"
|
||||||
|
if "pod" in q or "pods" in q:
|
||||||
|
return "pod"
|
||||||
|
if "namespace" in q or "namespaces" in q:
|
||||||
|
return "namespace"
|
||||||
|
return None
|
||||||
|
|
||||||
|
def _metric_entry_score(entry: dict[str, Any], tokens: list[str], *, metric: str | None, op: str | None) -> int:
|
||||||
|
hay = _metric_tokens(entry)
|
||||||
|
score = 0
|
||||||
|
for t in set(tokens):
|
||||||
|
if t in hay:
|
||||||
|
score += 2 if t in (entry.get("panel_title") or "").lower() else 1
|
||||||
|
if metric:
|
||||||
|
for phrase in METRIC_HINTS.get(metric, (metric,)):
|
||||||
|
if phrase in hay:
|
||||||
|
score += 3
|
||||||
|
if op == "top" and ("hottest" in hay or "top" in hay):
|
||||||
|
score += 3
|
||||||
|
if "node" in hay:
|
||||||
|
score += 1
|
||||||
|
return score
|
||||||
|
|
||||||
|
def _select_metric_entry(tokens: list[str], *, metric: str | None, op: str | None) -> dict[str, Any] | None:
|
||||||
|
scored: list[tuple[int, dict[str, Any]]] = []
|
||||||
|
for entry in _METRIC_INDEX:
|
||||||
|
if not isinstance(entry, dict):
|
||||||
|
continue
|
||||||
|
score = _metric_entry_score(entry, tokens, metric=metric, op=op)
|
||||||
|
if score:
|
||||||
|
scored.append((score, entry))
|
||||||
|
if not scored:
|
||||||
|
return None
|
||||||
|
scored.sort(key=lambda item: item[0], reverse=True)
|
||||||
|
return scored[0][1]
|
||||||
|
|
||||||
|
def _apply_node_filter(expr: str, node_regex: str | None) -> str:
|
||||||
|
if not node_regex:
|
||||||
|
return expr
|
||||||
needle = 'node_uname_info{nodename!=""}'
|
needle = 'node_uname_info{nodename!=""}'
|
||||||
replacement = f'node_uname_info{{nodename!=\"\",nodename=~\"{node_regex}\"}}'
|
replacement = f'node_uname_info{{nodename!=\"\",nodename=~\"{node_regex}\"}}'
|
||||||
return expr.replace(needle, replacement)
|
return expr.replace(needle, replacement)
|
||||||
return expr
|
|
||||||
|
|
||||||
def _vm_hottest(metric: str, node_regex: str | None) -> tuple[str, str] | None:
|
def _format_metric_answer(entry: dict[str, Any], res: dict | None) -> str:
|
||||||
expr = _hottest_query(metric, node_regex)
|
|
||||||
res = vm_query(expr)
|
|
||||||
series = _vm_value_series(res)
|
series = _vm_value_series(res)
|
||||||
|
panel = entry.get("panel_title") or "Metric"
|
||||||
if not series:
|
if not series:
|
||||||
return None
|
|
||||||
first = series[0]
|
|
||||||
labels = first.get("metric") or {}
|
|
||||||
value = first.get("value") or []
|
|
||||||
val = value[1] if isinstance(value, list) and len(value) > 1 else ""
|
|
||||||
node = labels.get("node") or labels.get("__name__") or ""
|
|
||||||
if not node:
|
|
||||||
return None
|
|
||||||
return (str(node), str(val))
|
|
||||||
|
|
||||||
def _hottest_answer(q: str, *, nodes: list[str] | None) -> str:
|
|
||||||
metric = None
|
|
||||||
assumed_cpu = False
|
|
||||||
if "cpu" in q:
|
|
||||||
metric = "cpu"
|
|
||||||
elif "ram" in q or "memory" in q:
|
|
||||||
metric = "ram"
|
|
||||||
elif "net" in q or "network" in q:
|
|
||||||
metric = "net"
|
|
||||||
elif "io" in q or "disk" in q or "storage" in q:
|
|
||||||
metric = "io"
|
|
||||||
if metric is None:
|
|
||||||
metric = "cpu"
|
|
||||||
assumed_cpu = True
|
|
||||||
if nodes is not None and not nodes:
|
|
||||||
return "No nodes match the requested hardware class."
|
|
||||||
|
|
||||||
node_regex = "|".join(nodes) if nodes else None
|
|
||||||
metrics = [metric]
|
|
||||||
lines: list[str] = []
|
|
||||||
for m in metrics:
|
|
||||||
picked = _vm_hottest(m, node_regex)
|
|
||||||
if not picked:
|
|
||||||
continue
|
|
||||||
node, val = picked
|
|
||||||
unit = "%" if m in ("cpu", "ram") else "B/s"
|
|
||||||
val_str = _humanize_rate(val, unit=unit)
|
|
||||||
label = {"cpu": "CPU", "ram": "RAM", "net": "NET", "io": "I/O"}[m]
|
|
||||||
lines.append(f"{label}: {node} ({val_str})")
|
|
||||||
if not lines:
|
|
||||||
return ""
|
return ""
|
||||||
label = metric.upper()
|
rendered = vm_render_result(res, limit=5)
|
||||||
suffix = " (defaulting to CPU)" if assumed_cpu else ""
|
if not rendered:
|
||||||
return f"Hottest node by {label}: {lines[0].split(': ', 1)[1]}.{suffix}"
|
return ""
|
||||||
|
lines = [line.lstrip("-").strip() for line in rendered.splitlines() if line.strip().startswith("-")]
|
||||||
|
if len(lines) == 1:
|
||||||
|
return f"{panel}: {lines[0]}."
|
||||||
|
return f"{panel}:\n" + "\n".join(f"- {line}" for line in lines)
|
||||||
|
|
||||||
|
def _inventory_filter(
|
||||||
|
inventory: list[dict[str, Any]],
|
||||||
|
*,
|
||||||
|
include_hw: set[str],
|
||||||
|
exclude_hw: set[str],
|
||||||
|
only_workers: bool,
|
||||||
|
only_ready: bool | None,
|
||||||
|
nodes_in_query: list[str],
|
||||||
|
) -> list[dict[str, Any]]:
|
||||||
|
results = inventory
|
||||||
|
if nodes_in_query:
|
||||||
|
results = [node for node in results if node.get("name") in nodes_in_query]
|
||||||
|
if only_workers:
|
||||||
|
results = [node for node in results if node.get("is_worker") is True]
|
||||||
|
if only_ready is True:
|
||||||
|
results = [node for node in results if node.get("ready") is True]
|
||||||
|
if only_ready is False:
|
||||||
|
results = [node for node in results if node.get("ready") is False]
|
||||||
|
if include_hw:
|
||||||
|
results = [node for node in results if _hardware_match(node, include_hw)]
|
||||||
|
if exclude_hw:
|
||||||
|
results = [node for node in results if not _hardware_match(node, exclude_hw)]
|
||||||
|
return results
|
||||||
|
|
||||||
|
def _hardware_match(node: dict[str, Any], filters: set[str]) -> bool:
|
||||||
|
hw = node.get("hardware") or ""
|
||||||
|
arch = node.get("arch") or ""
|
||||||
|
for f in filters:
|
||||||
|
if f == "rpi" and hw in ("rpi4", "rpi5"):
|
||||||
|
return True
|
||||||
|
if f == "arm64" and arch == "arm64":
|
||||||
|
return True
|
||||||
|
if hw == f:
|
||||||
|
return True
|
||||||
|
if f == "amd64" and arch == "amd64":
|
||||||
|
return True
|
||||||
|
return False
|
||||||
|
|
||||||
def _node_roles(labels: dict[str, Any]) -> list[str]:
|
def _node_roles(labels: dict[str, Any]) -> list[str]:
|
||||||
roles: list[str] = []
|
roles: list[str] = []
|
||||||
@ -495,176 +577,103 @@ def _inventory_sets(inventory: list[dict[str, Any]]) -> dict[str, Any]:
|
|||||||
|
|
||||||
def structured_answer(prompt: str, *, inventory: list[dict[str, Any]], metrics_summary: str) -> str:
|
def structured_answer(prompt: str, *, inventory: list[dict[str, Any]], metrics_summary: str) -> str:
|
||||||
q = normalize_query(prompt)
|
q = normalize_query(prompt)
|
||||||
if metrics_summary and any(word in q for word in ("postgres", "connection", "connections", "db")):
|
if not q:
|
||||||
return metrics_summary
|
|
||||||
|
|
||||||
if not inventory:
|
|
||||||
return ""
|
return ""
|
||||||
|
|
||||||
sets = _inventory_sets(inventory)
|
tokens = _tokens(q)
|
||||||
names = sets["names"]
|
op = _detect_operation(q)
|
||||||
ready = sets["ready"]
|
metric = _detect_metric(q)
|
||||||
not_ready = sets["not_ready"]
|
entity = _detect_entity(q)
|
||||||
groups = sets["groups"]
|
include_hw, exclude_hw = _detect_hardware_filters(q)
|
||||||
worker_names = sets["worker_names"]
|
|
||||||
worker_ready = sets["worker_ready"]
|
|
||||||
worker_not_ready = sets["worker_not_ready"]
|
|
||||||
expected_workers = sets["expected_workers"]
|
|
||||||
expected_ready = sets["expected_ready"]
|
|
||||||
expected_not_ready = sets["expected_not_ready"]
|
|
||||||
expected_missing = sets["expected_missing"]
|
|
||||||
total = len(names)
|
|
||||||
nodes_in_query = _extract_titan_nodes(q)
|
nodes_in_query = _extract_titan_nodes(q)
|
||||||
rpi_nodes = set(groups.get("rpi4", [])) | set(groups.get("rpi5", []))
|
only_workers = "worker" in q or "workers" in q
|
||||||
non_rpi = set(groups.get("jetson", [])) | set(groups.get("amd64", []))
|
only_ready: bool | None = None
|
||||||
unknown_hw = set(groups.get("arm64-unknown", [])) | set(groups.get("unknown", []))
|
if "not ready" in q or "unready" in q or "down" in q or "missing" in q:
|
||||||
|
only_ready = False
|
||||||
|
elif "ready" in q:
|
||||||
|
only_ready = True
|
||||||
|
|
||||||
if "hottest" in q or "hot" in q:
|
if entity == "node" and only_ready is not None and op != "count":
|
||||||
filter_nodes: list[str] | None = None
|
op = "status"
|
||||||
if "amd64" in q or "x86" in q:
|
|
||||||
filter_nodes = sorted(groups.get("amd64", []))
|
|
||||||
elif "jetson" in q:
|
|
||||||
filter_nodes = sorted(groups.get("jetson", []))
|
|
||||||
elif "raspberry" in q or "rpi" in q:
|
|
||||||
filter_nodes = sorted(rpi_nodes)
|
|
||||||
elif "arm64" in q:
|
|
||||||
filter_nodes = sorted([n for n in names if n not in groups.get("amd64", [])])
|
|
||||||
hottest = _hottest_answer(q, nodes=filter_nodes)
|
|
||||||
if hottest:
|
|
||||||
return hottest
|
|
||||||
return "Unable to determine hottest nodes right now (metrics unavailable)."
|
|
||||||
|
|
||||||
if nodes_in_query and ("raspberry" in q or "rpi" in q):
|
if not op and entity == "node":
|
||||||
parts: list[str] = []
|
op = "list" if (include_hw or exclude_hw or nodes_in_query) else "count"
|
||||||
for node in nodes_in_query:
|
|
||||||
if node in rpi_nodes:
|
|
||||||
parts.append(f"{node} is a Raspberry Pi node.")
|
|
||||||
elif node in non_rpi:
|
|
||||||
parts.append(f"{node} is not a Raspberry Pi node.")
|
|
||||||
elif node in names:
|
|
||||||
parts.append(f"{node} is in Atlas but hardware is unknown.")
|
|
||||||
else:
|
|
||||||
parts.append(f"{node} is not in the Atlas cluster.")
|
|
||||||
return " ".join(parts)
|
|
||||||
|
|
||||||
if nodes_in_query and "jetson" in q:
|
if op == "top" and metric is None:
|
||||||
jets = set(groups.get("jetson", []))
|
metric = "cpu"
|
||||||
parts = []
|
|
||||||
for node in nodes_in_query:
|
|
||||||
if node in jets:
|
|
||||||
parts.append(f"{node} is a Jetson node.")
|
|
||||||
elif node in names:
|
|
||||||
parts.append(f"{node} is not a Jetson node.")
|
|
||||||
else:
|
|
||||||
parts.append(f"{node} is not in the Atlas cluster.")
|
|
||||||
return " ".join(parts)
|
|
||||||
|
|
||||||
if nodes_in_query and ("is" in q or "part of" in q or "in atlas" in q or "in cluster" in q or "present" in q or "exist" in q):
|
# Metrics-first when a metric or top operation is requested.
|
||||||
parts: list[str] = []
|
if metric or op == "top":
|
||||||
for node in nodes_in_query:
|
entry = _select_metric_entry(tokens, metric=metric, op=op)
|
||||||
if node in names:
|
if entry and isinstance(entry.get("exprs"), list) and entry["exprs"]:
|
||||||
parts.append(f"Yes. {node} is in the Atlas cluster.")
|
expr = entry["exprs"][0]
|
||||||
else:
|
if inventory:
|
||||||
parts.append(f"No. {node} is not in the Atlas cluster.")
|
scoped = _inventory_filter(
|
||||||
return " ".join(parts)
|
inventory,
|
||||||
|
include_hw=include_hw,
|
||||||
if any(term in q for term in ("non-raspberry", "non raspberry", "not raspberry", "non-rpi", "non rpi")):
|
exclude_hw=exclude_hw,
|
||||||
non_rpi_sorted = sorted(non_rpi)
|
only_workers=only_workers,
|
||||||
if any(word in q for word in ("how many", "count", "number")):
|
only_ready=None,
|
||||||
return f"Atlas has {len(non_rpi_sorted)} non‑Raspberry Pi nodes."
|
nodes_in_query=nodes_in_query,
|
||||||
if any(phrase in q for phrase in ("besides jetson", "excluding jetson", "without jetson", "non jetson")):
|
|
||||||
amd = sorted(groups.get("amd64", []))
|
|
||||||
return f"Non‑Raspberry Pi nodes (excluding Jetson): {', '.join(amd)}." if amd else "No non‑Raspberry Pi nodes outside Jetson."
|
|
||||||
return f"Non‑Raspberry Pi nodes: {', '.join(non_rpi_sorted)}." if non_rpi_sorted else "No non‑Raspberry Pi nodes found."
|
|
||||||
|
|
||||||
if "jetson" in q:
|
|
||||||
jets = groups.get("jetson", [])
|
|
||||||
if any(word in q for word in ("how many", "count", "number")):
|
|
||||||
return f"Atlas has {len(jets)} Jetson nodes."
|
|
||||||
return f"Jetson nodes: {', '.join(jets)}." if jets else "No Jetson nodes found."
|
|
||||||
|
|
||||||
if "amd64" in q or "x86" in q:
|
|
||||||
amd = groups.get("amd64", [])
|
|
||||||
if any(word in q for word in ("how many", "count", "number")):
|
|
||||||
return f"Atlas has {len(amd)} amd64 nodes."
|
|
||||||
return f"amd64 nodes: {', '.join(amd)}." if amd else "No amd64 nodes found."
|
|
||||||
|
|
||||||
if "arm64" in q and "node" in q and any(word in q for word in ("how many", "count", "number")):
|
|
||||||
count = sum(1 for node in inventory if node.get("arch") == "arm64")
|
|
||||||
return f"Atlas has {count} arm64 nodes."
|
|
||||||
|
|
||||||
if "rpi4" in q:
|
|
||||||
rpi4 = groups.get("rpi4", [])
|
|
||||||
if any(word in q for word in ("how many", "count", "number")):
|
|
||||||
return f"Atlas has {len(rpi4)} rpi4 nodes."
|
|
||||||
return f"rpi4 nodes: {', '.join(rpi4)}." if rpi4 else "No rpi4 nodes found."
|
|
||||||
|
|
||||||
if "rpi5" in q:
|
|
||||||
rpi5 = groups.get("rpi5", [])
|
|
||||||
if any(word in q for word in ("how many", "count", "number")):
|
|
||||||
return f"Atlas has {len(rpi5)} rpi5 nodes."
|
|
||||||
return f"rpi5 nodes: {', '.join(rpi5)}." if rpi5 else "No rpi5 nodes found."
|
|
||||||
|
|
||||||
if "raspberry" in q or "rpi" in q:
|
|
||||||
rpi = sorted(rpi_nodes)
|
|
||||||
if any(word in q for word in ("how many", "count", "number")):
|
|
||||||
return f"Atlas has {len(rpi)} Raspberry Pi nodes."
|
|
||||||
return f"Raspberry Pi nodes: {', '.join(rpi)}." if rpi else "No Raspberry Pi nodes found."
|
|
||||||
|
|
||||||
if "arm64-unknown" in q or "unknown" in q or "no hardware" in q:
|
|
||||||
unknown = sorted(unknown_hw)
|
|
||||||
return f"Unknown hardware nodes: {', '.join(unknown)}." if unknown else "No unknown hardware labels."
|
|
||||||
|
|
||||||
if ("notready" in q or "not ready" in q or "unready" in q) and ("node" in q or "nodes" in q):
|
|
||||||
return "Not ready nodes: " + (", ".join(not_ready) if not_ready else "none") + "."
|
|
||||||
|
|
||||||
if "worker" in q and ("node" in q or "nodes" in q or "workers" in q):
|
|
||||||
not_ready_query = "not ready" in q or "unready" in q or "down" in q or ("not" in q and "ready" in q)
|
|
||||||
if expected_workers:
|
|
||||||
if "missing" in q:
|
|
||||||
return "Missing worker nodes: " + (", ".join(expected_missing) if expected_missing else "none") + "."
|
|
||||||
if "ready" in q and ("not ready" in q or "vs" in q or "versus" in q):
|
|
||||||
return (
|
|
||||||
f"Expected workers: {len(expected_ready)} ready, "
|
|
||||||
f"{len(expected_not_ready)} not ready (expected {len(expected_workers)})."
|
|
||||||
)
|
)
|
||||||
if any(word in q for word in ("how many", "count", "number")) and ("expect" in q or "expected" in q or "should" in q):
|
if scoped:
|
||||||
|
node_regex = "|".join([n["name"] for n in scoped])
|
||||||
|
expr = _apply_node_filter(expr, node_regex)
|
||||||
|
res = vm_query(expr, timeout=20)
|
||||||
|
answer = _format_metric_answer(entry, res)
|
||||||
|
if answer:
|
||||||
|
return answer
|
||||||
|
if metrics_summary:
|
||||||
|
return metrics_summary
|
||||||
|
|
||||||
|
if entity != "node" or not inventory:
|
||||||
|
if any(word in q for word in METRIC_HINT_WORDS) and not metrics_summary:
|
||||||
|
return "I don't have data to answer that right now."
|
||||||
|
return ""
|
||||||
|
|
||||||
|
expected_workers = expected_worker_nodes_from_metrics()
|
||||||
|
filtered = _inventory_filter(
|
||||||
|
inventory,
|
||||||
|
include_hw=include_hw,
|
||||||
|
exclude_hw=exclude_hw,
|
||||||
|
only_workers=only_workers,
|
||||||
|
only_ready=only_ready if op in ("status", "count") else None,
|
||||||
|
nodes_in_query=nodes_in_query,
|
||||||
|
)
|
||||||
|
names = [node["name"] for node in filtered]
|
||||||
|
|
||||||
|
if op == "status":
|
||||||
|
if "missing" in q and expected_workers:
|
||||||
|
missing = sorted(set(expected_workers) - {n["name"] for n in inventory})
|
||||||
|
return "Missing nodes: " + (", ".join(missing) if missing else "none") + "."
|
||||||
|
if only_ready is False:
|
||||||
|
return "Not ready nodes: " + (", ".join(names) if names else "none") + "."
|
||||||
|
if only_ready is True:
|
||||||
|
return f"Ready nodes ({len(names)}): " + (", ".join(names) if names else "none") + "."
|
||||||
|
|
||||||
|
if op == "count":
|
||||||
|
if expected_workers and ("expected" in q or "should" in q):
|
||||||
|
missing = sorted(set(expected_workers) - {n["name"] for n in inventory})
|
||||||
msg = f"Grafana inventory expects {len(expected_workers)} worker nodes."
|
msg = f"Grafana inventory expects {len(expected_workers)} worker nodes."
|
||||||
if expected_missing:
|
if missing:
|
||||||
msg += f" Missing: {', '.join(expected_missing)}."
|
msg += f" Missing: {', '.join(missing)}."
|
||||||
return msg
|
return msg
|
||||||
if not_ready_query:
|
if not (include_hw or exclude_hw or nodes_in_query or only_workers):
|
||||||
if expected_not_ready or expected_missing:
|
return f"Atlas has {len(names)} nodes."
|
||||||
detail = []
|
return f"Matching nodes: {len(names)}."
|
||||||
if expected_not_ready:
|
|
||||||
detail.append(f"Not ready: {', '.join(expected_not_ready)}")
|
|
||||||
if expected_missing:
|
|
||||||
detail.append(f"Missing: {', '.join(expected_missing)}")
|
|
||||||
return "Worker nodes needing attention. " + " ".join(detail) + "."
|
|
||||||
return "All expected worker nodes are Ready."
|
|
||||||
if any(word in q for word in ("expected", "expect", "should")):
|
|
||||||
msg = f"Grafana inventory expects {len(expected_workers)} worker nodes."
|
|
||||||
if expected_missing:
|
|
||||||
msg += f" Missing: {', '.join(expected_missing)}."
|
|
||||||
return msg
|
|
||||||
if any(word in q for word in ("how many", "count", "number")):
|
|
||||||
return f"Worker nodes: {len(expected_ready)} ready, {len(expected_not_ready)} not ready (expected {len(expected_workers)})."
|
|
||||||
if "ready" in q:
|
|
||||||
return f"Ready worker nodes ({len(expected_ready)}): {', '.join(expected_ready)}."
|
|
||||||
if not_ready_query:
|
|
||||||
return "Worker nodes not ready: " + (", ".join(worker_not_ready) if worker_not_ready else "none") + "."
|
|
||||||
if any(word in q for word in ("how many", "count", "number")):
|
|
||||||
return f"Worker nodes: {len(worker_ready)} ready, {len(worker_not_ready)} not ready."
|
|
||||||
return "Ready worker nodes ({}): {}.".format(len(worker_ready), ", ".join(worker_ready))
|
|
||||||
|
|
||||||
if any(word in q for word in ("how many", "count", "number")) and "node" in q:
|
if op == "list":
|
||||||
return f"Atlas has {total} nodes; {len(ready)} ready, {len(not_ready)} not ready."
|
if nodes_in_query:
|
||||||
|
parts = []
|
||||||
if "node names" in q or ("nodes" in q and "named" in q) or "naming" in q:
|
existing = {n["name"] for n in inventory}
|
||||||
return "Atlas node names: " + ", ".join(names) + "."
|
for node in nodes_in_query:
|
||||||
|
parts.append(f"{node}: {'present' if node in existing else 'not present'}")
|
||||||
if "ready" in q and "node" in q:
|
return "Node presence: " + ", ".join(parts) + "."
|
||||||
return f"Ready nodes ({len(ready)}): {', '.join(ready)}."
|
if not names:
|
||||||
|
return "Matching nodes: none."
|
||||||
|
shown = names[:30]
|
||||||
|
suffix = f", … (+{len(names) - 30} more)" if len(names) > 30 else ""
|
||||||
|
return "Matching nodes: " + ", ".join(shown) + suffix + "."
|
||||||
|
|
||||||
return ""
|
return ""
|
||||||
|
|
||||||
@ -727,25 +736,6 @@ def metrics_query_context(prompt: str, *, allow_tools: bool) -> tuple[str, str]:
|
|||||||
fallback = _metrics_fallback_summary(panel, summary)
|
fallback = _metrics_fallback_summary(panel, summary)
|
||||||
return context, fallback
|
return context, fallback
|
||||||
|
|
||||||
def jetson_nodes_from_kb() -> list[str]:
|
|
||||||
for doc in KB.get("runbooks", []):
|
|
||||||
if not isinstance(doc, dict):
|
|
||||||
continue
|
|
||||||
body = str(doc.get("body") or "")
|
|
||||||
for line in body.splitlines():
|
|
||||||
if "jetson" not in line.lower():
|
|
||||||
continue
|
|
||||||
names = _extract_titan_nodes(line)
|
|
||||||
if names:
|
|
||||||
return names
|
|
||||||
return []
|
|
||||||
|
|
||||||
def jetson_nodes_summary(cluster_name: str) -> str:
|
|
||||||
names = jetson_nodes_from_kb()
|
|
||||||
if names:
|
|
||||||
return f"{cluster_name} has {len(names)} Jetson nodes: {', '.join(names)}."
|
|
||||||
return ""
|
|
||||||
|
|
||||||
def catalog_hints(query: str) -> tuple[str, list[tuple[str, str]]]:
|
def catalog_hints(query: str) -> tuple[str, list[tuple[str, str]]]:
|
||||||
q = (query or "").strip()
|
q = (query or "").strip()
|
||||||
if not q or not KB.get("catalog"):
|
if not q or not KB.get("catalog"):
|
||||||
@ -953,22 +943,16 @@ def _parse_metric_lines(summary: str) -> dict[str, str]:
|
|||||||
def _metrics_fallback_summary(panel: str, summary: str) -> str:
|
def _metrics_fallback_summary(panel: str, summary: str) -> str:
|
||||||
parsed = _parse_metric_lines(summary)
|
parsed = _parse_metric_lines(summary)
|
||||||
panel_l = (panel or "").lower()
|
panel_l = (panel or "").lower()
|
||||||
if panel_l.startswith("postgres connections"):
|
|
||||||
used = parsed.get("conn=used")
|
|
||||||
maxv = parsed.get("conn=max")
|
|
||||||
if used and maxv:
|
|
||||||
try:
|
|
||||||
used_i = int(float(used))
|
|
||||||
max_i = int(float(maxv))
|
|
||||||
except ValueError:
|
|
||||||
return f"Postgres connections: {summary}"
|
|
||||||
free = max_i - used_i
|
|
||||||
return f"Postgres connections: {used_i}/{max_i} used ({free} free)."
|
|
||||||
if panel_l.startswith("postgres hottest"):
|
|
||||||
if parsed:
|
if parsed:
|
||||||
label, value = next(iter(parsed.items()))
|
items = list(parsed.items())
|
||||||
return f"Most Postgres connections: {label} = {value}."
|
if len(items) == 1:
|
||||||
|
label, value = items[0]
|
||||||
|
return f"{panel}: {label} = {value}."
|
||||||
|
compact = "; ".join(f"{k}={v}" for k, v in items)
|
||||||
|
return f"{panel}: {compact}."
|
||||||
|
if panel_l:
|
||||||
return f"{panel}: {summary}"
|
return f"{panel}: {summary}"
|
||||||
|
return summary
|
||||||
|
|
||||||
def _node_ready_status(node: dict) -> bool | None:
|
def _node_ready_status(node: dict) -> bool | None:
|
||||||
conditions = node.get("status", {}).get("conditions") or []
|
conditions = node.get("status", {}).get("conditions") or []
|
||||||
@ -1075,93 +1059,6 @@ def vm_cluster_snapshot() -> str:
|
|||||||
parts.append(pr)
|
parts.append(pr)
|
||||||
return "\n".join(parts).strip()
|
return "\n".join(parts).strip()
|
||||||
|
|
||||||
def nodes_summary(cluster_name: str) -> str:
|
|
||||||
state = _ariadne_state()
|
|
||||||
if state:
|
|
||||||
nodes = state.get("nodes") if isinstance(state.get("nodes"), dict) else {}
|
|
||||||
total = nodes.get("total")
|
|
||||||
ready = nodes.get("ready")
|
|
||||||
not_ready = nodes.get("not_ready")
|
|
||||||
if isinstance(total, int) and isinstance(ready, int):
|
|
||||||
not_ready = not_ready if isinstance(not_ready, int) else max(total - ready, 0)
|
|
||||||
if not_ready:
|
|
||||||
return f"{cluster_name} cluster has {total} nodes: {ready} Ready, {not_ready} NotReady."
|
|
||||||
return f"{cluster_name} cluster has {total} nodes, all Ready."
|
|
||||||
try:
|
|
||||||
data = k8s_get("/api/v1/nodes?limit=500")
|
|
||||||
except Exception:
|
|
||||||
return ""
|
|
||||||
items = data.get("items") or []
|
|
||||||
if not isinstance(items, list) or not items:
|
|
||||||
return ""
|
|
||||||
total = len(items)
|
|
||||||
ready = 0
|
|
||||||
for node in items:
|
|
||||||
conditions = node.get("status", {}).get("conditions") or []
|
|
||||||
for cond in conditions if isinstance(conditions, list) else []:
|
|
||||||
if cond.get("type") == "Ready":
|
|
||||||
if cond.get("status") == "True":
|
|
||||||
ready += 1
|
|
||||||
break
|
|
||||||
not_ready = max(total - ready, 0)
|
|
||||||
if not_ready:
|
|
||||||
return f"{cluster_name} cluster has {total} nodes: {ready} Ready, {not_ready} NotReady."
|
|
||||||
return f"{cluster_name} cluster has {total} nodes, all Ready."
|
|
||||||
|
|
||||||
def nodes_names_summary(cluster_name: str) -> str:
|
|
||||||
state = _ariadne_state()
|
|
||||||
if state:
|
|
||||||
nodes = state.get("nodes") if isinstance(state.get("nodes"), dict) else {}
|
|
||||||
names = nodes.get("names")
|
|
||||||
if isinstance(names, list) and names:
|
|
||||||
cleaned = sorted({str(n) for n in names if n})
|
|
||||||
if len(cleaned) <= 30:
|
|
||||||
return f"{cluster_name} node names: {', '.join(cleaned)}."
|
|
||||||
shown = ", ".join(cleaned[:30])
|
|
||||||
return f"{cluster_name} node names: {shown}, … (+{len(cleaned) - 30} more)."
|
|
||||||
try:
|
|
||||||
data = k8s_get("/api/v1/nodes?limit=500")
|
|
||||||
except Exception:
|
|
||||||
return ""
|
|
||||||
items = data.get("items") or []
|
|
||||||
if not isinstance(items, list) or not items:
|
|
||||||
return ""
|
|
||||||
names = []
|
|
||||||
for node in items:
|
|
||||||
name = (node.get("metadata") or {}).get("name") or ""
|
|
||||||
if name:
|
|
||||||
names.append(name)
|
|
||||||
names = sorted(set(names))
|
|
||||||
if not names:
|
|
||||||
return ""
|
|
||||||
if len(names) <= 30:
|
|
||||||
return f"{cluster_name} node names: {', '.join(names)}."
|
|
||||||
shown = ", ".join(names[:30])
|
|
||||||
return f"{cluster_name} node names: {shown}, … (+{len(names) - 30} more)."
|
|
||||||
|
|
||||||
|
|
||||||
def nodes_arch_summary(cluster_name: str, arch: str) -> str:
|
|
||||||
try:
|
|
||||||
data = k8s_get("/api/v1/nodes?limit=500")
|
|
||||||
except Exception:
|
|
||||||
return ""
|
|
||||||
items = data.get("items") or []
|
|
||||||
if not isinstance(items, list) or not items:
|
|
||||||
return ""
|
|
||||||
normalized = (arch or "").strip().lower()
|
|
||||||
if normalized in ("aarch64", "arm64"):
|
|
||||||
arch_label = "arm64"
|
|
||||||
elif normalized in ("x86_64", "x86-64", "amd64"):
|
|
||||||
arch_label = "amd64"
|
|
||||||
else:
|
|
||||||
arch_label = normalized
|
|
||||||
total = 0
|
|
||||||
for node in items:
|
|
||||||
labels = (node.get("metadata") or {}).get("labels") or {}
|
|
||||||
if labels.get("kubernetes.io/arch") == arch_label:
|
|
||||||
total += 1
|
|
||||||
return f"{cluster_name} cluster has {total} {arch_label} nodes."
|
|
||||||
|
|
||||||
def _strip_code_fence(text: str) -> str:
|
def _strip_code_fence(text: str) -> str:
|
||||||
cleaned = (text or "").strip()
|
cleaned = (text or "").strip()
|
||||||
match = CODE_FENCE_RE.match(cleaned)
|
match = CODE_FENCE_RE.match(cleaned)
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user