diff --git a/services/comms/scripts/atlasbot/bot.py b/services/comms/scripts/atlasbot/bot.py index df718e6..55c6da2 100644 --- a/services/comms/scripts/atlasbot/bot.py +++ b/services/comms/scripts/atlasbot/bot.py @@ -936,6 +936,28 @@ def _node_usage_table(metrics: dict[str, Any]) -> list[dict[str, Any]]: per_node.setdefault(node, {})[metric_name] = entry.get("value") return [{"node": node, **vals} for node, vals in sorted(per_node.items())] +def _usage_extremes(usage_table: list[dict[str, Any]]) -> dict[str, tuple[str, float]]: + extremes: dict[str, tuple[str, float]] = {} + for metric in ("cpu", "ram", "net", "io"): + values: list[tuple[str, float]] = [] + for entry in usage_table: + node = entry.get("node") + raw = entry.get(metric) + if not node or raw is None: + continue + try: + value = float(raw) + except (TypeError, ValueError): + continue + values.append((node, value)) + if not values: + continue + lowest = min(values, key=lambda item: item[1]) + highest = max(values, key=lambda item: item[1]) + extremes[f"min_{metric}"] = lowest + extremes[f"max_{metric}"] = highest + return extremes + def _workloads_for_facts(workloads: list[dict[str, Any]], limit: int = 25) -> list[dict[str, Any]]: cleaned: list[dict[str, Any]] = [] for entry in workloads: @@ -1023,6 +1045,13 @@ def facts_context( lines.append(f"- arch {key}: {', '.join(nodes_list)}") if control_plane_nodes: lines.append(f"- control_plane_nodes: {', '.join(control_plane_nodes)}") + control_plane_by_hw: dict[str, list[str]] = collections.defaultdict(list) + for node in inv: + if node.get("name") in control_plane_nodes: + control_plane_by_hw[node.get("hardware") or "unknown"].append(node["name"]) + parts = [f"{hw}={', '.join(sorted(nodes))}" for hw, nodes in sorted(control_plane_by_hw.items())] + if parts: + lines.append(f"- control_plane_by_hardware: {', '.join(parts)}") if worker_nodes: lines.append(f"- worker_nodes: {', '.join(worker_nodes)}") if ready_workers or not_ready_workers: @@ -1068,6 +1097,22 @@ def facts_context( if value is not None: lines.append(f"- {key}: {value}") + top_restarts = metrics.get("top_restarts_1h") if isinstance(metrics.get("top_restarts_1h"), list) else [] + if top_restarts: + items = [] + for entry in top_restarts[:5]: + if not isinstance(entry, dict): + continue + metric = entry.get("metric") or {} + pod = metric.get("pod") or metric.get("name") or "" + ns = metric.get("namespace") or "" + value = entry.get("value") + label = f"{ns}/{pod}".strip("/") + if label and value is not None: + items.append(f"{label}={value}") + if items: + lines.append(f"- top_restarts_1h: {', '.join(items)}") + usage_table = _node_usage_table(metrics) if usage_table: lines.append("- node_usage (cpu/ram/net/io):") @@ -1088,6 +1133,18 @@ def facts_context( else "" ) lines.append(f" - {node}: cpu={cpu}, ram={ram}, net={net}, io={io_val}") + extremes = _usage_extremes(usage_table) + for metric in ("cpu", "ram", "net", "io"): + min_key = f"min_{metric}" + if min_key not in extremes: + continue + node, value = extremes[min_key] + value_fmt = _format_metric_value( + str(value), + percent=metric in ("cpu", "ram"), + rate=metric in ("net", "io"), + ) + lines.append(f"- lowest_{metric}: {node} ({value_fmt})") if nodes_in_query: lines.append("- node_details:") @@ -1112,13 +1169,37 @@ def facts_context( wl = entry.get("workload") or "" primary = entry.get("primary_node") or "" pods_total = entry.get("pods_total") + pods_running = entry.get("pods_running") label = f"{ns}/{wl}" if ns and wl else (wl or ns) if not label: continue if primary: - lines.append(f" - {label}: primary_node={primary}, pods_total={pods_total}") + lines.append( + f" - {label}: primary_node={primary}, pods_total={pods_total}, pods_running={pods_running}" + ) else: - lines.append(f" - {label}: pods_total={pods_total}") + lines.append(f" - {label}: pods_total={pods_total}, pods_running={pods_running}") + top = max( + (entry for entry in workload_entries if isinstance(entry.get("pods_total"), (int, float))), + key=lambda item: item.get("pods_total", 0), + default=None, + ) + if isinstance(top, dict) and top.get("pods_total") is not None: + label = f"{top.get('namespace')}/{top.get('workload')}".strip("/") + lines.append(f"- workload_most_pods: {label} ({top.get('pods_total')})") + zero_running = [ + entry + for entry in workload_entries + if isinstance(entry.get("pods_running"), (int, float)) and entry.get("pods_running") == 0 + ] + if zero_running: + labels = [] + for entry in zero_running: + label = f"{entry.get('namespace')}/{entry.get('workload')}".strip("/") + if label: + labels.append(label) + if labels: + lines.append(f"- workloads_zero_running: {', '.join(labels)}") rendered = "\n".join(lines) return rendered[:MAX_FACTS_CHARS] @@ -2609,15 +2690,15 @@ def _fact_line_tags(line: str) -> set[str]: tags.add("architecture") if any(key in text for key in ("rpi", "jetson", "amd64", "arm64", "non_raspberry_pi")): tags.update({"hardware", "inventory"}) - if "control_plane_nodes" in text or "worker_nodes" in text: + if "control_plane_nodes" in text or "control_plane_by_hardware" in text or "worker_nodes" in text: tags.add("inventory") - if any(key in text for key in ("hottest_", "node_usage", "cpu=", "ram=", "net=", "io=")): + if any(key in text for key in ("hottest_", "lowest_", "node_usage", "cpu=", "ram=", "net=", "io=")): tags.add("utilization") if "postgres_" in text or "postgres connections" in text: tags.add("database") - if "pods_" in text or "pod phases" in text: + if "pods_" in text or "pod phases" in text or "restarts" in text: tags.add("pods") - if "workloads" in text or "primary_node" in text: + if "workloads" in text or "primary_node" in text or "workload_" in text: tags.add("workloads") if "node_details" in text: tags.add("node_detail") @@ -3140,8 +3221,15 @@ def _open_ended_select_facts( selected.append(fid) if len(selected) >= count: break - if not selected: - selected = _fallback_fact_ids(fact_meta, focus_tags=focus_tags, count=count) + seed = _fallback_fact_ids(fact_meta, focus_tags=focus_tags, count=count) + if selected: + for fid in seed: + if fid not in selected: + selected.append(fid) + if len(selected) >= count: + break + else: + selected = seed return selected