From bdebddada43522c403389638abef052eeced21aa Mon Sep 17 00:00:00 2001
From: Brad Stein
Date: Thu, 29 Jan 2026 01:53:29 -0300
Subject: [PATCH] snapshot: add pressure capacity and pod issues

---
Review notes (this area is not part of the commit message):
 * Adds _build_pressure, _build_capacity and _build_pod_issues to
   build_summary, and the matching _append_pressure, _append_capacity and
   _append_pod_issues renderers (plus the _format_pod_issue_* helpers) to
   summary_text.
 * Line breaks and indentation were restored from a whitespace-collapsed
   copy of this patch. Hunk line counts were re-checked against the @@
   headers and the diffstat, but context-line indentation is inferred from
   Python syntax (notably the "hardware: " and "node_pods_top: " append
   lines, assumed to sit under an `if parts:` guard) -- TODO confirm against
   builder.py, or apply with `git apply --ignore-whitespace`.
 * NOTE(review): the From: header carries no e-mail address; `git am` may
   reject it -- confirm before mailing.

 atlasbot/snapshot/builder.py | 113 +++++++++++++++++++++++++++++++++++
 1 file changed, 113 insertions(+)

diff --git a/atlasbot/snapshot/builder.py b/atlasbot/snapshot/builder.py
index 0e3f7d2..2a82833 100644
--- a/atlasbot/snapshot/builder.py
+++ b/atlasbot/snapshot/builder.py
@@ -72,11 +72,14 @@ def build_summary(snapshot: dict[str, Any] | None) -> dict[str, Any]:
     if metrics:
         summary["metrics"] = metrics
     summary.update(_build_nodes(snapshot))
+    summary.update(_build_pressure(snapshot))
     summary.update(_build_hardware(nodes_detail))
+    summary.update(_build_capacity(metrics))
     summary.update(_build_pods(metrics))
     summary.update(_build_namespace_pods(snapshot))
     summary.update(_build_namespace_nodes(snapshot))
     summary.update(_build_node_pods(snapshot))
+    summary.update(_build_pod_issues(snapshot))
     summary.update(_build_postgres(metrics))
     summary.update(_build_hottest(metrics))
     summary.update(_build_workloads(snapshot))
@@ -107,6 +110,14 @@ def _build_nodes(snapshot: dict[str, Any]) -> dict[str, Any]:
     }
 
 
+def _build_pressure(snapshot: dict[str, Any]) -> dict[str, Any]:
+    nodes_summary = snapshot.get("nodes_summary") if isinstance(snapshot.get("nodes_summary"), dict) else {}
+    pressure = nodes_summary.get("pressure_nodes") if isinstance(nodes_summary.get("pressure_nodes"), dict) else {}
+    if not pressure:
+        return {}
+    return {"pressure_nodes": pressure}
+
+
 def _build_hardware(nodes_detail: list[dict[str, Any]]) -> dict[str, Any]:
     hardware: dict[str, list[str]] = {}
     for node in nodes_detail or []:
@@ -133,6 +144,22 @@ def _build_pods(metrics: dict[str, Any]) -> dict[str, Any]:
     return {"pods": pods}
 
 
+def _build_capacity(metrics: dict[str, Any]) -> dict[str, Any]:
+    if not metrics:
+        return {}
+    capacity = {
+        "cpu": metrics.get("capacity_cpu"),
+        "allocatable_cpu": metrics.get("allocatable_cpu"),
+        "mem_bytes": metrics.get("capacity_mem_bytes"),
+        "allocatable_mem_bytes": metrics.get("allocatable_mem_bytes"),
+        "pods": metrics.get("capacity_pods"),
+        "allocatable_pods": metrics.get("allocatable_pods"),
+    }
+    if not any(value is not None for value in capacity.values()):
+        return {}
+    return {"capacity": capacity}
+
+
 def _build_namespace_pods(snapshot: dict[str, Any]) -> dict[str, Any]:
     namespaces = snapshot.get("namespace_pods")
     if not isinstance(namespaces, list) or not namespaces:
@@ -154,6 +181,13 @@ def _build_node_pods(snapshot: dict[str, Any]) -> dict[str, Any]:
     return {"node_pods": node_pods}
 
 
+def _build_pod_issues(snapshot: dict[str, Any]) -> dict[str, Any]:
+    pod_issues = snapshot.get("pod_issues")
+    if not isinstance(pod_issues, dict) or not pod_issues:
+        return {}
+    return {"pod_issues": pod_issues}
+
+
 def _build_postgres(metrics: dict[str, Any]) -> dict[str, Any]:
     postgres = metrics.get("postgres_connections") if isinstance(metrics.get("postgres_connections"), dict) else {}
     if not postgres:
@@ -286,6 +320,20 @@ def _append_hardware(lines: list[str], summary: dict[str, Any]) -> None:
         lines.append("hardware: " + "; ".join(sorted(parts)))
 
 
+def _append_pressure(lines: list[str], summary: dict[str, Any]) -> None:
+    pressure = summary.get("pressure_nodes")
+    if not isinstance(pressure, dict) or not pressure:
+        return
+    parts = []
+    for cond, nodes in sorted(pressure.items()):
+        if not nodes:
+            continue
+        name_list = _format_names([str(name) for name in nodes if name])
+        parts.append(f"{cond}={len(nodes)} ({name_list})" if name_list else f"{cond}={len(nodes)}")
+    if parts:
+        lines.append("node_pressure: " + "; ".join(parts))
+
+
 def _append_pods(lines: list[str], summary: dict[str, Any]) -> None:
     pods = summary.get("pods") if isinstance(summary.get("pods"), dict) else {}
     if not pods:
@@ -300,6 +348,27 @@ def _append_pods(lines: list[str], summary: dict[str, Any]) -> None:
     )
 
 
+def _append_capacity(lines: list[str], summary: dict[str, Any]) -> None:
+    capacity = summary.get("capacity") if isinstance(summary.get("capacity"), dict) else {}
+    if not capacity:
+        return
+    parts = []
+    if capacity.get("cpu") is not None:
+        parts.append(f"cpu={_format_float(capacity.get('cpu'))}")
+    if capacity.get("allocatable_cpu") is not None:
+        parts.append(f"alloc_cpu={_format_float(capacity.get('allocatable_cpu'))}")
+    if capacity.get("mem_bytes") is not None:
+        parts.append(f"mem={_format_bytes(capacity.get('mem_bytes'))}")
+    if capacity.get("allocatable_mem_bytes") is not None:
+        parts.append(f"alloc_mem={_format_bytes(capacity.get('allocatable_mem_bytes'))}")
+    if capacity.get("pods") is not None:
+        parts.append(f"pods={_format_float(capacity.get('pods'))}")
+    if capacity.get("allocatable_pods") is not None:
+        parts.append(f"alloc_pods={_format_float(capacity.get('allocatable_pods'))}")
+    if parts:
+        lines.append("capacity: " + "; ".join(parts))
+
+
 def _append_namespace_pods(lines: list[str], summary: dict[str, Any]) -> None:
     namespaces = summary.get("namespace_pods")
     if not isinstance(namespaces, list) or not namespaces:
@@ -370,6 +439,47 @@ def _append_node_pods(lines: list[str], summary: dict[str, Any]) -> None:
         lines.append("node_pods_top: " + "; ".join(parts))
 
 
+def _append_pod_issues(lines: list[str], summary: dict[str, Any]) -> None:
+    pod_issues = summary.get("pod_issues") if isinstance(summary.get("pod_issues"), dict) else {}
+    if not pod_issues:
+        return
+    counts_line = _format_pod_issue_counts(pod_issues)
+    if counts_line:
+        lines.append(counts_line)
+    top_line = _format_pod_issue_top(pod_issues)
+    if top_line:
+        lines.append(top_line)
+
+
+def _format_pod_issue_counts(pod_issues: dict[str, Any]) -> str:
+    counts = pod_issues.get("counts") if isinstance(pod_issues.get("counts"), dict) else {}
+    if not counts:
+        return ""
+    parts = []
+    for key in ("Failed", "Pending", "Unknown"):
+        if key in counts:
+            parts.append(f"{key}={counts.get(key)}")
+    return "pod_issues: " + "; ".join(parts) if parts else ""
+
+
+def _format_pod_issue_top(pod_issues: dict[str, Any]) -> str:
+    items = pod_issues.get("items") if isinstance(pod_issues.get("items"), list) else []
+    if not items:
+        return ""
+    top = []
+    for item in items[:5]:
+        if not isinstance(item, dict):
+            continue
+        namespace = item.get("namespace")
+        pod = item.get("pod")
+        if not namespace or not pod:
+            continue
+        phase = item.get("phase") or ""
+        restarts = item.get("restarts") or 0
+        top.append(f"{namespace}/{pod}({phase},r={restarts})")
+    return "pod_issues_top: " + "; ".join(top) if top else ""
+
+
 def _append_node_usage_stats(lines: list[str], summary: dict[str, Any]) -> None:
     metrics = summary.get("metrics") if isinstance(summary.get("metrics"), dict) else {}
     stats = metrics.get("node_usage_stats") if isinstance(metrics.get("node_usage_stats"), dict) else {}
@@ -524,11 +634,14 @@ def summary_text(snapshot: dict[str, Any] | None) -> str:
         return ""
     lines: list[str] = []
     _append_nodes(lines, summary)
+    _append_pressure(lines, summary)
     _append_hardware(lines, summary)
+    _append_capacity(lines, summary)
     _append_pods(lines, summary)
     _append_namespace_pods(lines, summary)
     _append_namespace_nodes(lines, summary)
     _append_node_pods(lines, summary)
+    _append_pod_issues(lines, summary)
     _append_node_usage_stats(lines, summary)
     _append_namespace_usage(lines, summary)
     _append_restarts(lines, summary)