diff --git a/atlasbot/snapshot/builder.py b/atlasbot/snapshot/builder.py index 11f5c68..097e91a 100644 --- a/atlasbot/snapshot/builder.py +++ b/atlasbot/snapshot/builder.py @@ -76,6 +76,7 @@ def build_summary(snapshot: dict[str, Any] | None) -> dict[str, Any]: summary.update(_build_nodes(snapshot)) summary.update(_build_pressure(snapshot)) summary.update(_build_hardware(nodes_detail)) + summary.update(_build_hardware_by_node(nodes_detail)) summary.update(_build_node_ages(nodes_detail)) summary.update(_build_node_taints(nodes_detail)) summary.update(_build_capacity(metrics)) @@ -89,6 +90,8 @@ def build_summary(snapshot: dict[str, Any] | None) -> dict[str, Any]: summary.update(_build_postgres(metrics)) summary.update(_build_hottest(metrics)) summary.update(_build_pvc(metrics)) + summary.update(_build_longhorn(snapshot)) + summary.update(_build_root_disk_headroom(metrics)) summary.update(_build_workloads(snapshot)) summary.update(_build_flux(snapshot)) return summary @@ -139,6 +142,18 @@ def _build_hardware(nodes_detail: list[dict[str, Any]]) -> dict[str, Any]: return {"hardware": {key: sorted(value) for key, value in hardware.items()}} +def _build_hardware_by_node(nodes_detail: list[dict[str, Any]]) -> dict[str, Any]: + mapping: dict[str, str] = {} + for node in nodes_detail or []: + if not isinstance(node, dict): + continue + name = node.get("name") + if isinstance(name, str) and name: + hardware = node.get("hardware") or "unknown" + mapping[name] = str(hardware) + return {"hardware_by_node": mapping} if mapping else {} + + def _build_node_ages(nodes_detail: list[dict[str, Any]]) -> dict[str, Any]: ages: list[dict[str, Any]] = [] for node in nodes_detail or []: @@ -174,6 +189,32 @@ def _build_node_taints(nodes_detail: list[dict[str, Any]]) -> dict[str, Any]: return {"node_taints": {key: sorted(names) for key, names in taints.items()}} +def _build_root_disk_headroom(metrics: dict[str, Any]) -> dict[str, Any]: + node_usage = metrics.get("node_usage") if isinstance(metrics.get("node_usage"), dict) else {} + disk = node_usage.get("disk") if isinstance(node_usage.get("disk"), list) else [] + if not disk: + return {} + entries = [] + for entry in disk: + if not isinstance(entry, dict): + continue + node = entry.get("node") + try: + used_pct = float(entry.get("value")) + except (TypeError, ValueError): + continue + headroom = max(0.0, 100.0 - used_pct) + if node: + entries.append({"node": node, "headroom_pct": headroom, "used_pct": used_pct}) + entries.sort(key=lambda item: (item.get("headroom_pct") or 0.0, item.get("node") or "")) + return {"root_disk_low_headroom": entries[:5]} if entries else {} + + +def _build_longhorn(snapshot: dict[str, Any]) -> dict[str, Any]: + longhorn = snapshot.get("longhorn") + return {"longhorn": longhorn} if isinstance(longhorn, dict) and longhorn else {} + + def _build_pods(metrics: dict[str, Any]) -> dict[str, Any]: pods = { "running": metrics.get("pods_running"), @@ -264,6 +305,7 @@ def _build_postgres(metrics: dict[str, Any]) -> dict[str, Any]: "used": postgres.get("used"), "max": postgres.get("max"), "hottest_db": postgres.get("hottest_db"), + "by_db": postgres.get("by_db"), } } @@ -556,6 +598,9 @@ def _append_pod_issues(lines: list[str], summary: dict[str, Any]) -> None: pending_line = _format_pod_pending_oldest(pod_issues) if pending_line: lines.append(pending_line) + pending_over_line = _format_pod_pending_over_15m(pod_issues) + if pending_over_line: + lines.append(pending_over_line) reasons_line = _format_pod_waiting_reasons(pod_issues) if reasons_line: lines.append(reasons_line) @@ -618,6 +663,17 @@ def _format_pod_waiting_reasons(pod_issues: dict[str, Any]) -> str: return "pod_waiting_reasons: " + "; ".join([f"{key}={val}" for key, val in pairs]) +def _format_pod_pending_over_15m(pod_issues: dict[str, Any]) -> str: + count = pod_issues.get("pending_over_15m") + if count is None: + return "" + try: + count_val = int(count) + except (TypeError, ValueError): + return "" + return f"pods_pending_over_15m: {count_val}" + + def _append_workload_health(lines: list[str], summary: dict[str, Any]) -> None: health = summary.get("workloads_health") if isinstance(summary.get("workloads_health"), dict) else {} if not health: @@ -689,6 +745,59 @@ def _append_pvc_usage(lines: list[str], summary: dict[str, Any]) -> None: lines.append("pvc_usage_top: " + "; ".join(parts)) +def _append_root_disk_headroom(lines: list[str], summary: dict[str, Any]) -> None: + headroom = summary.get("root_disk_low_headroom") + if not isinstance(headroom, list) or not headroom: + return + parts = [] + for entry in headroom: + if not isinstance(entry, dict): + continue + node = entry.get("node") + headroom_pct = entry.get("headroom_pct") + if node and headroom_pct is not None: + parts.append(f"{node}={_format_float(headroom_pct)}%") + if parts: + lines.append("root_disk_low_headroom: " + "; ".join(parts)) + + +def _append_longhorn(lines: list[str], summary: dict[str, Any]) -> None: + longhorn = summary.get("longhorn") if isinstance(summary.get("longhorn"), dict) else {} + if not longhorn: + return + total = longhorn.get("total") + unhealthy = longhorn.get("unhealthy_count") + by_state = longhorn.get("by_state") if isinstance(longhorn.get("by_state"), dict) else {} + by_robust = longhorn.get("by_robustness") if isinstance(longhorn.get("by_robustness"), dict) else {} + if total is not None: + lines.append( + "longhorn: total={total}, unhealthy={unhealthy}".format( + total=total, + unhealthy=unhealthy if unhealthy is not None else 0, + ) + ) + if by_state: + lines.append("longhorn_state: " + _format_kv_map(by_state)) + if by_robust: + lines.append("longhorn_robustness: " + _format_kv_map(by_robust)) + unhealthy_items = longhorn.get("unhealthy") + if isinstance(unhealthy_items, list) and unhealthy_items: + parts = [] + for entry in unhealthy_items[:5]: + if not isinstance(entry, dict): + continue + name = entry.get("name") + state = entry.get("state") + robustness = entry.get("robustness") + if name: + label = name + if state or robustness: + label = f"{label}({state},{robustness})" + parts.append(label) + if parts: + lines.append("longhorn_unhealthy_top: " + "; ".join(parts)) + + def _append_namespace_usage(lines: list[str], summary: dict[str, Any]) -> None: metrics = summary.get("metrics") if isinstance(summary.get("metrics"), dict) else {} cpu_top = metrics.get("namespace_cpu_top") if isinstance(metrics.get("namespace_cpu_top"), list) else [] @@ -799,7 +908,7 @@ def _append_restarts(lines: list[str], summary: dict[str, Any]) -> None: metrics = summary.get("metrics") if isinstance(summary.get("metrics"), dict) else {} top_restarts = metrics.get("top_restarts_1h") or [] if not isinstance(top_restarts, list) or not top_restarts: - return + top_restarts = [] parts = [] for entry in top_restarts: metric = entry.get("metric") if isinstance(entry, dict) else {} @@ -813,6 +922,17 @@ def _append_restarts(lines: list[str], summary: dict[str, Any]) -> None: parts.append(f"{namespace}/{pod}={count}") if parts: lines.append("restarts_1h_top: " + "; ".join(parts)) + ns_top = metrics.get("restart_namespace_top") or [] + if isinstance(ns_top, list) and ns_top: + ns_parts = [] + for entry in ns_top: + metric = entry.get("metric") if isinstance(entry, dict) else {} + value = entry.get("value") + namespace = metric.get("namespace") if isinstance(metric, dict) else None + if namespace and value is not None: + ns_parts.append(f"{namespace}={_format_float(value)}") + if ns_parts: + lines.append("restarts_1h_namespace_top: " + "; ".join(ns_parts)) def _append_job_failures(lines: list[str], summary: dict[str, Any]) -> None: @@ -907,23 +1027,41 @@ def _append_postgres(lines: list[str], summary: dict[str, Any]) -> None: hottest=hottest, ) ) + by_db = postgres.get("by_db") + if isinstance(by_db, list) and by_db: + parts = [] + for entry in by_db: + metric = entry.get("metric") if isinstance(entry, dict) else {} + value = entry.get("value") + name = metric.get("datname") if isinstance(metric, dict) else None + if name and value is not None: + parts.append(f"{name}={_format_float(value)}") + if parts: + lines.append("postgres_connections_by_db: " + "; ".join(parts)) def _append_hottest(lines: list[str], summary: dict[str, Any]) -> None: hottest = summary.get("hottest") if isinstance(summary.get("hottest"), dict) else {} if not hottest: return + hardware_map = summary.get("hardware_by_node") + if not isinstance(hardware_map, dict): + hardware_map = {} parts = [] for key, entry in hottest.items(): if not isinstance(entry, dict): continue node = entry.get("node") + hardware = hardware_map.get(node) if node else None if key in {"net", "io"}: value = _format_rate_bytes(entry.get("value")) else: value = _format_float(entry.get("value")) if node: - parts.append(f"{key}={node} ({value})") + label = node + if hardware: + label = f"{label} [{hardware}]" + parts.append(f"{key}={label} ({value})") if parts: lines.append("hottest: " + "; ".join(parts)) @@ -1006,6 +1144,8 @@ def summary_text(snapshot: dict[str, Any] | None) -> str: _append_postgres(lines, summary) _append_hottest(lines, summary) _append_pvc_usage(lines, summary) + _append_root_disk_headroom(lines, summary) + _append_longhorn(lines, summary) _append_workloads(lines, summary) _append_flux(lines, summary) _append_units_windows(lines, summary)