atlasbot: enrich snapshot facts and pod metrics
This commit is contained in:
parent
41b131c347
commit
b7f454b790
@ -16,7 +16,7 @@ spec:
|
|||||||
labels:
|
labels:
|
||||||
app: atlasbot
|
app: atlasbot
|
||||||
annotations:
|
annotations:
|
||||||
checksum/atlasbot-configmap: manual-atlasbot-35
|
checksum/atlasbot-configmap: manual-atlasbot-36
|
||||||
vault.hashicorp.com/agent-inject: "true"
|
vault.hashicorp.com/agent-inject: "true"
|
||||||
vault.hashicorp.com/role: "comms"
|
vault.hashicorp.com/role: "comms"
|
||||||
vault.hashicorp.com/agent-inject-secret-turn-secret: "kv/data/atlas/comms/turn-shared-secret"
|
vault.hashicorp.com/agent-inject-secret-turn-secret: "kv/data/atlas/comms/turn-shared-secret"
|
||||||
|
|||||||
@ -95,6 +95,8 @@ METRIC_HINT_WORDS = {
|
|||||||
"pending",
|
"pending",
|
||||||
"unreachable",
|
"unreachable",
|
||||||
"latency",
|
"latency",
|
||||||
|
"pod",
|
||||||
|
"pods",
|
||||||
}
|
}
|
||||||
|
|
||||||
CODE_FENCE_RE = re.compile(r"^```(?:json)?\s*(.*?)\s*```$", re.DOTALL)
|
CODE_FENCE_RE = re.compile(r"^```(?:json)?\s*(.*?)\s*```$", re.DOTALL)
|
||||||
@ -116,6 +118,7 @@ METRIC_HINTS = {
|
|||||||
"net": ("net", "network", "bandwidth", "throughput"),
|
"net": ("net", "network", "bandwidth", "throughput"),
|
||||||
"io": ("io", "disk", "storage"),
|
"io": ("io", "disk", "storage"),
|
||||||
"connections": ("connections", "conn", "postgres", "database", "db"),
|
"connections": ("connections", "conn", "postgres", "database", "db"),
|
||||||
|
"pods": ("pods", "pod"),
|
||||||
}
|
}
|
||||||
|
|
||||||
_OLLAMA_LOCK = threading.Lock()
|
_OLLAMA_LOCK = threading.Lock()
|
||||||
@ -488,13 +491,15 @@ def _metric_expr_uses_percent(entry: dict[str, Any]) -> bool:
|
|||||||
return "* 100" in expr or "*100" in expr
|
return "* 100" in expr or "*100" in expr
|
||||||
|
|
||||||
|
|
||||||
def _format_metric_value(value: str, *, percent: bool) -> str:
|
def _format_metric_value(value: str, *, percent: bool, rate: bool = False) -> str:
|
||||||
try:
|
try:
|
||||||
num = float(value)
|
num = float(value)
|
||||||
except (TypeError, ValueError):
|
except (TypeError, ValueError):
|
||||||
return value
|
return value
|
||||||
if percent:
|
if percent:
|
||||||
return f"{num:.1f}%"
|
return f"{num:.1f}%"
|
||||||
|
if rate:
|
||||||
|
return _humanize_rate(value, unit="rate")
|
||||||
if abs(num) >= 1:
|
if abs(num) >= 1:
|
||||||
return f"{num:.2f}".rstrip("0").rstrip(".")
|
return f"{num:.2f}".rstrip("0").rstrip(".")
|
||||||
return f"{num:.4f}".rstrip("0").rstrip(".")
|
return f"{num:.4f}".rstrip("0").rstrip(".")
|
||||||
@ -779,6 +784,11 @@ def facts_context(
|
|||||||
lines: list[str] = ["Facts (live snapshot):"]
|
lines: list[str] = ["Facts (live snapshot):"]
|
||||||
if total is not None:
|
if total is not None:
|
||||||
lines.append(f"- nodes_total={total}, ready={ready}, not_ready={not_ready}")
|
lines.append(f"- nodes_total={total}, ready={ready}, not_ready={not_ready}")
|
||||||
|
if isinstance(summary, dict):
|
||||||
|
by_arch_counts = summary.get("by_arch")
|
||||||
|
if isinstance(by_arch_counts, dict) and by_arch_counts:
|
||||||
|
parts = [f"{arch}={count}" for arch, count in sorted(by_arch_counts.items())]
|
||||||
|
lines.append(f"- nodes_by_arch: {', '.join(parts)}")
|
||||||
if not_ready_names:
|
if not_ready_names:
|
||||||
lines.append(f"- nodes_not_ready: {', '.join(not_ready_names)}")
|
lines.append(f"- nodes_not_ready: {', '.join(not_ready_names)}")
|
||||||
for key in ("rpi5", "rpi4", "jetson", "amd64", "arm64-unknown", "unknown"):
|
for key in ("rpi5", "rpi4", "jetson", "amd64", "arm64-unknown", "unknown"):
|
||||||
@ -799,7 +809,7 @@ def facts_context(
|
|||||||
lines.append(f"- workers_ready: {', '.join(ready_workers) if ready_workers else 'none'}")
|
lines.append(f"- workers_ready: {', '.join(ready_workers) if ready_workers else 'none'}")
|
||||||
if not_ready_workers:
|
if not_ready_workers:
|
||||||
lines.append(f"- workers_not_ready: {', '.join(not_ready_workers)}")
|
lines.append(f"- workers_not_ready: {', '.join(not_ready_workers)}")
|
||||||
if expected_workers:
|
if expected_workers and any(word in normalize_query(prompt) for word in ("missing", "expected", "should", "not ready", "unready")):
|
||||||
missing = sorted(
|
missing = sorted(
|
||||||
set(expected_workers)
|
set(expected_workers)
|
||||||
- {n.get("name") for n in inv if isinstance(n, dict) and n.get("name")}
|
- {n.get("name") for n in inv if isinstance(n, dict) and n.get("name")}
|
||||||
@ -814,7 +824,11 @@ def facts_context(
|
|||||||
node = entry.get("node")
|
node = entry.get("node")
|
||||||
value = entry.get("value")
|
value = entry.get("value")
|
||||||
if node and value is not None:
|
if node and value is not None:
|
||||||
value_fmt = _format_metric_value(str(value), percent=key in ("cpu", "ram"))
|
value_fmt = _format_metric_value(
|
||||||
|
str(value),
|
||||||
|
percent=key in ("cpu", "ram"),
|
||||||
|
rate=key in ("net", "io"),
|
||||||
|
)
|
||||||
lines.append(f"- hottest_{key}: {node} ({value_fmt})")
|
lines.append(f"- hottest_{key}: {node} ({value_fmt})")
|
||||||
|
|
||||||
postgres = metrics.get("postgres_connections") if isinstance(metrics.get("postgres_connections"), dict) else {}
|
postgres = metrics.get("postgres_connections") if isinstance(metrics.get("postgres_connections"), dict) else {}
|
||||||
@ -829,6 +843,11 @@ def facts_context(
|
|||||||
f"- postgres_hottest_db: {hottest_db.get('label')} ({hottest_db.get('value')})"
|
f"- postgres_hottest_db: {hottest_db.get('label')} ({hottest_db.get('value')})"
|
||||||
)
|
)
|
||||||
|
|
||||||
|
for key in ("pods_running", "pods_pending", "pods_failed", "pods_succeeded"):
|
||||||
|
value = metrics.get(key)
|
||||||
|
if value is not None:
|
||||||
|
lines.append(f"- {key}: {value}")
|
||||||
|
|
||||||
usage_table = _node_usage_table(metrics)
|
usage_table = _node_usage_table(metrics)
|
||||||
if usage_table:
|
if usage_table:
|
||||||
lines.append("- node_usage (cpu/ram/net/io):")
|
lines.append("- node_usage (cpu/ram/net/io):")
|
||||||
@ -838,8 +857,16 @@ def facts_context(
|
|||||||
continue
|
continue
|
||||||
cpu = _format_metric_value(str(entry.get("cpu")), percent=True) if entry.get("cpu") is not None else ""
|
cpu = _format_metric_value(str(entry.get("cpu")), percent=True) if entry.get("cpu") is not None else ""
|
||||||
ram = _format_metric_value(str(entry.get("ram")), percent=True) if entry.get("ram") is not None else ""
|
ram = _format_metric_value(str(entry.get("ram")), percent=True) if entry.get("ram") is not None else ""
|
||||||
net = _format_metric_value(str(entry.get("net")), percent=False) if entry.get("net") is not None else ""
|
net = (
|
||||||
io_val = _format_metric_value(str(entry.get("io")), percent=False) if entry.get("io") is not None else ""
|
_format_metric_value(str(entry.get("net")), percent=False, rate=True)
|
||||||
|
if entry.get("net") is not None
|
||||||
|
else ""
|
||||||
|
)
|
||||||
|
io_val = (
|
||||||
|
_format_metric_value(str(entry.get("io")), percent=False, rate=True)
|
||||||
|
if entry.get("io") is not None
|
||||||
|
else ""
|
||||||
|
)
|
||||||
lines.append(f" - {node}: cpu={cpu}, ram={ram}, net={net}, io={io_val}")
|
lines.append(f" - {node}: cpu={cpu}, ram={ram}, net={net}, io={io_val}")
|
||||||
|
|
||||||
if nodes_in_query:
|
if nodes_in_query:
|
||||||
@ -1029,7 +1056,7 @@ def snapshot_metric_answer(
|
|||||||
if top:
|
if top:
|
||||||
node, val = top
|
node, val = top
|
||||||
percent = metric in {"cpu", "ram"}
|
percent = metric in {"cpu", "ram"}
|
||||||
value = _format_metric_value(str(val), percent=percent)
|
value = _format_metric_value(str(val), percent=percent, rate=metric in {"net", "io"})
|
||||||
scope = ""
|
scope = ""
|
||||||
if include_hw:
|
if include_hw:
|
||||||
scope = f" among {' and '.join(sorted(include_hw))}"
|
scope = f" among {' and '.join(sorted(include_hw))}"
|
||||||
@ -1051,6 +1078,23 @@ def snapshot_metric_answer(
|
|||||||
if parts:
|
if parts:
|
||||||
return _format_confidence(" ".join(parts), "high")
|
return _format_confidence(" ".join(parts), "high")
|
||||||
|
|
||||||
|
if metric == "pods":
|
||||||
|
running = metrics.get("pods_running")
|
||||||
|
pending = metrics.get("pods_pending")
|
||||||
|
failed = metrics.get("pods_failed")
|
||||||
|
succeeded = metrics.get("pods_succeeded")
|
||||||
|
parts = []
|
||||||
|
if running is not None:
|
||||||
|
parts.append(f"running {running:.0f}")
|
||||||
|
if pending is not None:
|
||||||
|
parts.append(f"pending {pending:.0f}")
|
||||||
|
if failed is not None:
|
||||||
|
parts.append(f"failed {failed:.0f}")
|
||||||
|
if succeeded is not None:
|
||||||
|
parts.append(f"succeeded {succeeded:.0f}")
|
||||||
|
if parts:
|
||||||
|
return _format_confidence(f"Pods: {', '.join(parts)}.", "high")
|
||||||
|
|
||||||
return ""
|
return ""
|
||||||
|
|
||||||
def structured_answer(
|
def structured_answer(
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user