snapshot: add longhorn and disk headroom hints

This commit is contained in:
Brad Stein 2026-01-29 07:44:01 -03:00
parent e0c5f0a2f6
commit 26c6cea40b

View File

@ -76,6 +76,7 @@ def build_summary(snapshot: dict[str, Any] | None) -> dict[str, Any]:
summary.update(_build_nodes(snapshot)) summary.update(_build_nodes(snapshot))
summary.update(_build_pressure(snapshot)) summary.update(_build_pressure(snapshot))
summary.update(_build_hardware(nodes_detail)) summary.update(_build_hardware(nodes_detail))
summary.update(_build_hardware_by_node(nodes_detail))
summary.update(_build_node_ages(nodes_detail)) summary.update(_build_node_ages(nodes_detail))
summary.update(_build_node_taints(nodes_detail)) summary.update(_build_node_taints(nodes_detail))
summary.update(_build_capacity(metrics)) summary.update(_build_capacity(metrics))
@ -89,6 +90,8 @@ def build_summary(snapshot: dict[str, Any] | None) -> dict[str, Any]:
summary.update(_build_postgres(metrics)) summary.update(_build_postgres(metrics))
summary.update(_build_hottest(metrics)) summary.update(_build_hottest(metrics))
summary.update(_build_pvc(metrics)) summary.update(_build_pvc(metrics))
summary.update(_build_longhorn(snapshot))
summary.update(_build_root_disk_headroom(metrics))
summary.update(_build_workloads(snapshot)) summary.update(_build_workloads(snapshot))
summary.update(_build_flux(snapshot)) summary.update(_build_flux(snapshot))
return summary return summary
@ -139,6 +142,18 @@ def _build_hardware(nodes_detail: list[dict[str, Any]]) -> dict[str, Any]:
return {"hardware": {key: sorted(value) for key, value in hardware.items()}} return {"hardware": {key: sorted(value) for key, value in hardware.items()}}
def _build_hardware_by_node(nodes_detail: list[dict[str, Any]]) -> dict[str, Any]:
mapping: dict[str, str] = {}
for node in nodes_detail or []:
if not isinstance(node, dict):
continue
name = node.get("name")
if isinstance(name, str) and name:
hardware = node.get("hardware") or "unknown"
mapping[name] = str(hardware)
return {"hardware_by_node": mapping} if mapping else {}
def _build_node_ages(nodes_detail: list[dict[str, Any]]) -> dict[str, Any]: def _build_node_ages(nodes_detail: list[dict[str, Any]]) -> dict[str, Any]:
ages: list[dict[str, Any]] = [] ages: list[dict[str, Any]] = []
for node in nodes_detail or []: for node in nodes_detail or []:
@ -174,6 +189,32 @@ def _build_node_taints(nodes_detail: list[dict[str, Any]]) -> dict[str, Any]:
return {"node_taints": {key: sorted(names) for key, names in taints.items()}} return {"node_taints": {key: sorted(names) for key, names in taints.items()}}
def _build_root_disk_headroom(metrics: dict[str, Any]) -> dict[str, Any]:
node_usage = metrics.get("node_usage") if isinstance(metrics.get("node_usage"), dict) else {}
disk = node_usage.get("disk") if isinstance(node_usage.get("disk"), list) else []
if not disk:
return {}
entries = []
for entry in disk:
if not isinstance(entry, dict):
continue
node = entry.get("node")
try:
used_pct = float(entry.get("value"))
except (TypeError, ValueError):
continue
headroom = max(0.0, 100.0 - used_pct)
if node:
entries.append({"node": node, "headroom_pct": headroom, "used_pct": used_pct})
entries.sort(key=lambda item: (item.get("headroom_pct") or 0.0, item.get("node") or ""))
return {"root_disk_low_headroom": entries[:5]} if entries else {}
def _build_longhorn(snapshot: dict[str, Any]) -> dict[str, Any]:
longhorn = snapshot.get("longhorn")
return {"longhorn": longhorn} if isinstance(longhorn, dict) and longhorn else {}
def _build_pods(metrics: dict[str, Any]) -> dict[str, Any]: def _build_pods(metrics: dict[str, Any]) -> dict[str, Any]:
pods = { pods = {
"running": metrics.get("pods_running"), "running": metrics.get("pods_running"),
@ -264,6 +305,7 @@ def _build_postgres(metrics: dict[str, Any]) -> dict[str, Any]:
"used": postgres.get("used"), "used": postgres.get("used"),
"max": postgres.get("max"), "max": postgres.get("max"),
"hottest_db": postgres.get("hottest_db"), "hottest_db": postgres.get("hottest_db"),
"by_db": postgres.get("by_db"),
} }
} }
@ -556,6 +598,9 @@ def _append_pod_issues(lines: list[str], summary: dict[str, Any]) -> None:
pending_line = _format_pod_pending_oldest(pod_issues) pending_line = _format_pod_pending_oldest(pod_issues)
if pending_line: if pending_line:
lines.append(pending_line) lines.append(pending_line)
pending_over_line = _format_pod_pending_over_15m(pod_issues)
if pending_over_line:
lines.append(pending_over_line)
reasons_line = _format_pod_waiting_reasons(pod_issues) reasons_line = _format_pod_waiting_reasons(pod_issues)
if reasons_line: if reasons_line:
lines.append(reasons_line) lines.append(reasons_line)
@ -618,6 +663,17 @@ def _format_pod_waiting_reasons(pod_issues: dict[str, Any]) -> str:
return "pod_waiting_reasons: " + "; ".join([f"{key}={val}" for key, val in pairs]) return "pod_waiting_reasons: " + "; ".join([f"{key}={val}" for key, val in pairs])
def _format_pod_pending_over_15m(pod_issues: dict[str, Any]) -> str:
count = pod_issues.get("pending_over_15m")
if count is None:
return ""
try:
count_val = int(count)
except (TypeError, ValueError):
return ""
return f"pods_pending_over_15m: {count_val}"
def _append_workload_health(lines: list[str], summary: dict[str, Any]) -> None: def _append_workload_health(lines: list[str], summary: dict[str, Any]) -> None:
health = summary.get("workloads_health") if isinstance(summary.get("workloads_health"), dict) else {} health = summary.get("workloads_health") if isinstance(summary.get("workloads_health"), dict) else {}
if not health: if not health:
@ -689,6 +745,59 @@ def _append_pvc_usage(lines: list[str], summary: dict[str, Any]) -> None:
lines.append("pvc_usage_top: " + "; ".join(parts)) lines.append("pvc_usage_top: " + "; ".join(parts))
def _append_root_disk_headroom(lines: list[str], summary: dict[str, Any]) -> None:
headroom = summary.get("root_disk_low_headroom")
if not isinstance(headroom, list) or not headroom:
return
parts = []
for entry in headroom:
if not isinstance(entry, dict):
continue
node = entry.get("node")
headroom_pct = entry.get("headroom_pct")
if node and headroom_pct is not None:
parts.append(f"{node}={_format_float(headroom_pct)}%")
if parts:
lines.append("root_disk_low_headroom: " + "; ".join(parts))
def _append_longhorn(lines: list[str], summary: dict[str, Any]) -> None:
longhorn = summary.get("longhorn") if isinstance(summary.get("longhorn"), dict) else {}
if not longhorn:
return
total = longhorn.get("total")
unhealthy = longhorn.get("unhealthy_count")
by_state = longhorn.get("by_state") if isinstance(longhorn.get("by_state"), dict) else {}
by_robust = longhorn.get("by_robustness") if isinstance(longhorn.get("by_robustness"), dict) else {}
if total is not None:
lines.append(
"longhorn: total={total}, unhealthy={unhealthy}".format(
total=total,
unhealthy=unhealthy if unhealthy is not None else 0,
)
)
if by_state:
lines.append("longhorn_state: " + _format_kv_map(by_state))
if by_robust:
lines.append("longhorn_robustness: " + _format_kv_map(by_robust))
unhealthy_items = longhorn.get("unhealthy")
if isinstance(unhealthy_items, list) and unhealthy_items:
parts = []
for entry in unhealthy_items[:5]:
if not isinstance(entry, dict):
continue
name = entry.get("name")
state = entry.get("state")
robustness = entry.get("robustness")
if name:
label = name
if state or robustness:
label = f"{label}({state},{robustness})"
parts.append(label)
if parts:
lines.append("longhorn_unhealthy_top: " + "; ".join(parts))
def _append_namespace_usage(lines: list[str], summary: dict[str, Any]) -> None: def _append_namespace_usage(lines: list[str], summary: dict[str, Any]) -> None:
metrics = summary.get("metrics") if isinstance(summary.get("metrics"), dict) else {} metrics = summary.get("metrics") if isinstance(summary.get("metrics"), dict) else {}
cpu_top = metrics.get("namespace_cpu_top") if isinstance(metrics.get("namespace_cpu_top"), list) else [] cpu_top = metrics.get("namespace_cpu_top") if isinstance(metrics.get("namespace_cpu_top"), list) else []
@ -799,7 +908,7 @@ def _append_restarts(lines: list[str], summary: dict[str, Any]) -> None:
metrics = summary.get("metrics") if isinstance(summary.get("metrics"), dict) else {} metrics = summary.get("metrics") if isinstance(summary.get("metrics"), dict) else {}
top_restarts = metrics.get("top_restarts_1h") or [] top_restarts = metrics.get("top_restarts_1h") or []
if not isinstance(top_restarts, list) or not top_restarts: if not isinstance(top_restarts, list) or not top_restarts:
return top_restarts = []
parts = [] parts = []
for entry in top_restarts: for entry in top_restarts:
metric = entry.get("metric") if isinstance(entry, dict) else {} metric = entry.get("metric") if isinstance(entry, dict) else {}
@ -813,6 +922,17 @@ def _append_restarts(lines: list[str], summary: dict[str, Any]) -> None:
parts.append(f"{namespace}/{pod}={count}") parts.append(f"{namespace}/{pod}={count}")
if parts: if parts:
lines.append("restarts_1h_top: " + "; ".join(parts)) lines.append("restarts_1h_top: " + "; ".join(parts))
ns_top = metrics.get("restart_namespace_top") or []
if isinstance(ns_top, list) and ns_top:
ns_parts = []
for entry in ns_top:
metric = entry.get("metric") if isinstance(entry, dict) else {}
value = entry.get("value")
namespace = metric.get("namespace") if isinstance(metric, dict) else None
if namespace and value is not None:
ns_parts.append(f"{namespace}={_format_float(value)}")
if ns_parts:
lines.append("restarts_1h_namespace_top: " + "; ".join(ns_parts))
def _append_job_failures(lines: list[str], summary: dict[str, Any]) -> None: def _append_job_failures(lines: list[str], summary: dict[str, Any]) -> None:
@ -907,23 +1027,41 @@ def _append_postgres(lines: list[str], summary: dict[str, Any]) -> None:
hottest=hottest, hottest=hottest,
) )
) )
by_db = postgres.get("by_db")
if isinstance(by_db, list) and by_db:
parts = []
for entry in by_db:
metric = entry.get("metric") if isinstance(entry, dict) else {}
value = entry.get("value")
name = metric.get("datname") if isinstance(metric, dict) else None
if name and value is not None:
parts.append(f"{name}={_format_float(value)}")
if parts:
lines.append("postgres_connections_by_db: " + "; ".join(parts))
def _append_hottest(lines: list[str], summary: dict[str, Any]) -> None: def _append_hottest(lines: list[str], summary: dict[str, Any]) -> None:
hottest = summary.get("hottest") if isinstance(summary.get("hottest"), dict) else {} hottest = summary.get("hottest") if isinstance(summary.get("hottest"), dict) else {}
if not hottest: if not hottest:
return return
hardware_map = summary.get("hardware_by_node")
if not isinstance(hardware_map, dict):
hardware_map = {}
parts = [] parts = []
for key, entry in hottest.items(): for key, entry in hottest.items():
if not isinstance(entry, dict): if not isinstance(entry, dict):
continue continue
node = entry.get("node") node = entry.get("node")
hardware = hardware_map.get(node) if node else None
if key in {"net", "io"}: if key in {"net", "io"}:
value = _format_rate_bytes(entry.get("value")) value = _format_rate_bytes(entry.get("value"))
else: else:
value = _format_float(entry.get("value")) value = _format_float(entry.get("value"))
if node: if node:
parts.append(f"{key}={node} ({value})") label = node
if hardware:
label = f"{label} [{hardware}]"
parts.append(f"{key}={label} ({value})")
if parts: if parts:
lines.append("hottest: " + "; ".join(parts)) lines.append("hottest: " + "; ".join(parts))
@ -1006,6 +1144,8 @@ def summary_text(snapshot: dict[str, Any] | None) -> str:
_append_postgres(lines, summary) _append_postgres(lines, summary)
_append_hottest(lines, summary) _append_hottest(lines, summary)
_append_pvc_usage(lines, summary) _append_pvc_usage(lines, summary)
_append_root_disk_headroom(lines, summary)
_append_longhorn(lines, summary)
_append_workloads(lines, summary) _append_workloads(lines, summary)
_append_flux(lines, summary) _append_flux(lines, summary)
_append_units_windows(lines, summary) _append_units_windows(lines, summary)