atlasbot: enrich fact pack and selection

2026-01-28 01:02:14 -03:00 · 2026-01-28 01:02:14 -03:00 · 474c472b1d
commit 474c472b1d
parent 6578a8b08a
1 changed files with 96 additions and 8 deletions
--- a/services/comms/scripts/atlasbot/bot.py
+++ b/services/comms/scripts/atlasbot/bot.py
@ -936,6 +936,28 @@ def _node_usage_table(metrics: dict[str, Any]) -> list[dict[str, Any]]:
            per_node.setdefault(node, {})[metric_name] = entry.get("value")
    return [{"node": node, **vals} for node, vals in sorted(per_node.items())]

+def _usage_extremes(usage_table: list[dict[str, Any]]) -> dict[str, tuple[str, float]]:
+    """Find the lowest and highest node for each usage metric.
+
+    Args:
+        usage_table: Rows carrying a ``"node"`` key plus optional ``"cpu"``,
+            ``"ram"``, ``"net"`` and ``"io"`` readings.
+
+    Returns:
+        A mapping with ``min_<metric>`` and ``max_<metric>`` keys, each holding
+        a ``(node, value)`` pair. A metric for which no row yields a usable
+        reading contributes no keys at all.
+    """
+    extremes: dict[str, tuple[str, float]] = {}
+    for metric in ("cpu", "ram", "net", "io"):
+        # Collect (node, value) pairs for this metric only.
+        values: list[tuple[str, float]] = []
+        for entry in usage_table:
+            node = entry.get("node")
+            raw = entry.get(metric)
+            # Rows without a node name or without this reading are ignored.
+            if not node or raw is None:
+                continue
+            try:
+                value = float(raw)
+            except (TypeError, ValueError):
+                # Readings that cannot be converted to a float are dropped silently.
+                continue
+            # NOTE(review): float() also accepts "nan"; a NaN reading would make
+            # the min()/max() results below order-dependent -- confirm upstream
+            # never emits NaN.
+            values.append((node, value))
+        if not values:
+            continue
+        # On ties, min()/max() keep the first pair encountered in table order.
+        lowest = min(values, key=lambda item: item[1])
+        highest = max(values, key=lambda item: item[1])
+        extremes[f"min_{metric}"] = lowest
+        extremes[f"max_{metric}"] = highest
+    return extremes
+
 def _workloads_for_facts(workloads: list[dict[str, Any]], limit: int = 25) -> list[dict[str, Any]]:
    cleaned: list[dict[str, Any]] = []
    for entry in workloads:
@ -1023,6 +1045,13 @@ def facts_context(
            lines.append(f"- arch {key}: {', '.join(nodes_list)}")
    if control_plane_nodes:
        lines.append(f"- control_plane_nodes: {', '.join(control_plane_nodes)}")
+        control_plane_by_hw: dict[str, list[str]] = collections.defaultdict(list)
+        for node in inv:
+            if node.get("name") in control_plane_nodes:
+                control_plane_by_hw[node.get("hardware") or "unknown"].append(node["name"])
+        parts = [f"{hw}={', '.join(sorted(nodes))}" for hw, nodes in sorted(control_plane_by_hw.items())]
+        if parts:
+            lines.append(f"- control_plane_by_hardware: {', '.join(parts)}")
    if worker_nodes:
        lines.append(f"- worker_nodes: {', '.join(worker_nodes)}")
    if ready_workers or not_ready_workers:
@ -1068,6 +1097,22 @@ def facts_context(
        if value is not None:
            lines.append(f"- {key}: {value}")

+    top_restarts = metrics.get("top_restarts_1h") if isinstance(metrics.get("top_restarts_1h"), list) else []
+    if top_restarts:
+        items = []
+        for entry in top_restarts[:5]:
+            if not isinstance(entry, dict):
+                continue
+            metric = entry.get("metric") or {}
+            pod = metric.get("pod") or metric.get("name") or ""
+            ns = metric.get("namespace") or ""
+            value = entry.get("value")
+            label = f"{ns}/{pod}".strip("/")
+            if label and value is not None:
+                items.append(f"{label}={value}")
+        if items:
+            lines.append(f"- top_restarts_1h: {', '.join(items)}")
+
    usage_table = _node_usage_table(metrics)
    if usage_table:
        lines.append("- node_usage (cpu/ram/net/io):")
@ -1088,6 +1133,18 @@ def facts_context(
                else ""
            )
            lines.append(f"  - {node}: cpu={cpu}, ram={ram}, net={net}, io={io_val}")
+        extremes = _usage_extremes(usage_table)
+        for metric in ("cpu", "ram", "net", "io"):
+            min_key = f"min_{metric}"
+            if min_key not in extremes:
+                continue
+            node, value = extremes[min_key]
+            value_fmt = _format_metric_value(
+                str(value),
+                percent=metric in ("cpu", "ram"),
+                rate=metric in ("net", "io"),
+            )
+            lines.append(f"- lowest_{metric}: {node} ({value_fmt})")

    if nodes_in_query:
        lines.append("- node_details:")
@ -1112,13 +1169,37 @@ def facts_context(
            wl = entry.get("workload") or ""
            primary = entry.get("primary_node") or ""
            pods_total = entry.get("pods_total")
+            pods_running = entry.get("pods_running")
            label = f"{ns}/{wl}" if ns and wl else (wl or ns)
            if not label:
                continue
            if primary:
-                lines.append(f"  - {label}: primary_node={primary}, pods_total={pods_total}")
+                lines.append(
+                    f"  - {label}: primary_node={primary}, pods_total={pods_total}, pods_running={pods_running}"
+                )
            else:
-                lines.append(f"  - {label}: pods_total={pods_total}")
+                lines.append(f"  - {label}: pods_total={pods_total}, pods_running={pods_running}")
+        top = max(
+            (entry for entry in workload_entries if isinstance(entry.get("pods_total"), (int, float))),
+            key=lambda item: item.get("pods_total", 0),
+            default=None,
+        )
+        if isinstance(top, dict) and top.get("pods_total") is not None:
+            label = f"{top.get('namespace')}/{top.get('workload')}".strip("/")
+            lines.append(f"- workload_most_pods: {label} ({top.get('pods_total')})")
+        zero_running = [
+            entry
+            for entry in workload_entries
+            if isinstance(entry.get("pods_running"), (int, float)) and entry.get("pods_running") == 0
+        ]
+        if zero_running:
+            labels = []
+            for entry in zero_running:
+                label = f"{entry.get('namespace')}/{entry.get('workload')}".strip("/")
+                if label:
+                    labels.append(label)
+            if labels:
+                lines.append(f"- workloads_zero_running: {', '.join(labels)}")

    rendered = "\n".join(lines)
    return rendered[:MAX_FACTS_CHARS]
@ -2609,15 +2690,15 @@ def _fact_line_tags(line: str) -> set[str]:
        tags.add("architecture")
    if any(key in text for key in ("rpi", "jetson", "amd64", "arm64", "non_raspberry_pi")):
        tags.update({"hardware", "inventory"})
-    if "control_plane_nodes" in text or "worker_nodes" in text:
+    if "control_plane_nodes" in text or "control_plane_by_hardware" in text or "worker_nodes" in text:
        tags.add("inventory")
-    if any(key in text for key in ("hottest_", "node_usage", "cpu=", "ram=", "net=", "io=")):
+    if any(key in text for key in ("hottest_", "lowest_", "node_usage", "cpu=", "ram=", "net=", "io=")):
        tags.add("utilization")
    if "postgres_" in text or "postgres connections" in text:
        tags.add("database")
-    if "pods_" in text or "pod phases" in text:
+    if "pods_" in text or "pod phases" in text or "restarts" in text:
        tags.add("pods")
-    if "workloads" in text or "primary_node" in text:
+    if "workloads" in text or "primary_node" in text or "workload_" in text:
        tags.add("workloads")
    if "node_details" in text:
        tags.add("node_detail")
@ -3140,8 +3221,15 @@ def _open_ended_select_facts(
                selected.append(fid)
            if len(selected) >= count:
                break
-    if not selected:
-        selected = _fallback_fact_ids(fact_meta, focus_tags=focus_tags, count=count)
+    seed = _fallback_fact_ids(fact_meta, focus_tags=focus_tags, count=count)
+    if selected:
+        for fid in seed:
+            if fid not in selected:
+                selected.append(fid)
+            if len(selected) >= count:
+                break
+    else:
+        selected = seed
    return selected