snapshot: add longhorn volume summary

This commit is contained in:
Brad Stein 2026-01-29 07:43:37 -03:00
parent c9708a83ea
commit a6062be60e

View File

@ -61,6 +61,7 @@ _PHASE_SEVERITY = {
"Pending": 2,
"Unknown": 1,
}
_PENDING_15M_HOURS = 0.25
@dataclass(frozen=True)
@ -711,13 +712,21 @@ def _node_pod_finalize(nodes: dict[str, dict[str, Any]]) -> list[dict[str, Any]]
return output
def _summarize_pod_issues(payload: dict[str, Any]) -> dict[str, Any]:
items: list[dict[str, Any]] = []
counts: dict[str, int] = {key: 0 for key in _PHASE_SEVERITY}
pending_oldest: list[dict[str, Any]] = []
waiting_reason_counts: dict[str, int] = {}
phase_reason_counts: dict[str, int] = {}
for pod in _items(payload):
def _record_pending_pod(
pending_oldest: list[dict[str, Any]],
info: dict[str, Any],
) -> bool:
age_hours = info.get("age_hours")
if age_hours is None:
return False
pending_oldest.append(info)
return age_hours >= _PENDING_15M_HOURS
def _update_pod_issue(
pod: dict[str, Any],
acc: dict[str, Any],
) -> None:
metadata = pod.get("metadata") if isinstance(pod.get("metadata"), dict) else {}
status = pod.get("status") if isinstance(pod.get("status"), dict) else {}
spec = pod.get("spec") if isinstance(pod.get("spec"), dict) else {}
@ -730,7 +739,7 @@ def _summarize_pod_issues(payload: dict[str, Any]) -> dict[str, Any]:
)
age_hours = _age_hours(created_at)
if not name or not namespace:
continue
return
phase = status.get("phase") if isinstance(status.get("phase"), str) else ""
restarts = 0
waiting_reasons: list[str] = []
@ -743,14 +752,14 @@ def _summarize_pod_issues(payload: dict[str, Any]) -> dict[str, Any]:
reason = waiting.get("reason")
if isinstance(reason, str) and reason:
waiting_reasons.append(reason)
waiting_reason_counts[reason] = waiting_reason_counts.get(reason, 0) + 1
acc["waiting_reasons"][reason] = acc["waiting_reasons"].get(reason, 0) + 1
phase_reason = status.get("reason")
if isinstance(phase_reason, str) and phase_reason:
phase_reason_counts[phase_reason] = phase_reason_counts.get(phase_reason, 0) + 1
if phase in counts:
counts[phase] += 1
acc["phase_reasons"][phase_reason] = acc["phase_reasons"].get(phase_reason, 0) + 1
if phase in acc["counts"]:
acc["counts"][phase] += 1
if phase in _PHASE_SEVERITY or restarts > 0:
items.append(
acc["items"].append(
{
"namespace": namespace,
"pod": name,
@ -763,16 +772,31 @@ def _summarize_pod_issues(payload: dict[str, Any]) -> dict[str, Any]:
"age_hours": age_hours,
}
)
if phase == "Pending" and age_hours is not None:
pending_oldest.append(
{
if phase == "Pending":
info = {
"namespace": namespace,
"pod": name,
"node": spec.get("nodeName") or "",
"age_hours": age_hours,
"reason": status.get("reason") or "",
}
)
if _record_pending_pod(acc["pending_oldest"], info):
acc["pending_over_15m"] += 1
def _summarize_pod_issues(payload: dict[str, Any]) -> dict[str, Any]:
acc = {
"items": [],
"counts": {key: 0 for key in _PHASE_SEVERITY},
"pending_oldest": [],
"pending_over_15m": 0,
"waiting_reasons": {},
"phase_reasons": {},
}
for pod in _items(payload):
if isinstance(pod, dict):
_update_pod_issue(pod, acc)
items = acc["items"]
items.sort(
key=lambda item: (
-_PHASE_SEVERITY.get(item.get("phase") or "", 0),
@ -781,13 +805,15 @@ def _summarize_pod_issues(payload: dict[str, Any]) -> dict[str, Any]:
item.get("pod") or "",
)
)
pending_oldest = acc["pending_oldest"]
pending_oldest.sort(key=lambda item: -(item.get("age_hours") or 0.0))
return {
"counts": counts,
"counts": acc["counts"],
"items": items[:20],
"pending_oldest": pending_oldest[:10],
"waiting_reasons": waiting_reason_counts,
"phase_reasons": phase_reason_counts,
"pending_over_15m": acc["pending_over_15m"],
"waiting_reasons": acc["waiting_reasons"],
"phase_reasons": acc["phase_reasons"],
}
@ -1036,6 +1062,57 @@ def _fetch_jobs(errors: list[str]) -> dict[str, Any]:
return {}
def _summarize_longhorn_volumes(payload: dict[str, Any]) -> dict[str, Any]:
items = _items(payload)
if not items:
return {}
by_state: dict[str, int] = {}
by_robustness: dict[str, int] = {}
unhealthy: list[dict[str, Any]] = []
for volume in items:
metadata = volume.get("metadata") if isinstance(volume.get("metadata"), dict) else {}
status = volume.get("status") if isinstance(volume.get("status"), dict) else {}
spec = volume.get("spec") if isinstance(volume.get("spec"), dict) else {}
name = metadata.get("name") if isinstance(metadata.get("name"), str) else ""
if not name:
continue
state = status.get("state") if isinstance(status.get("state"), str) else "unknown"
robustness = (
status.get("robustness") if isinstance(status.get("robustness"), str) else "unknown"
)
by_state[state] = by_state.get(state, 0) + 1
by_robustness[robustness] = by_robustness.get(robustness, 0) + 1
if state.lower() != "attached" or robustness.lower() != "healthy":
unhealthy.append(
{
"name": name,
"state": state,
"robustness": robustness,
"size": spec.get("size"),
"actual_size": status.get("actualSize"),
}
)
unhealthy.sort(key=lambda item: item.get("name") or "")
return {
"total": len(items),
"by_state": by_state,
"by_robustness": by_robustness,
"unhealthy": unhealthy,
"unhealthy_count": len(unhealthy),
}
def _fetch_longhorn(errors: list[str]) -> dict[str, Any]:
try:
payload = get_json(
"/apis/longhorn.io/v1beta2/namespaces/longhorn-system/volumes"
)
return _summarize_longhorn_volumes(payload)
except Exception as exc:
errors.append(f"longhorn: {exc}")
return {}
def _fetch_workload_health(errors: list[str]) -> dict[str, Any]:
try:
deployments_payload = get_json("/apis/apps/v1/deployments?limit=2000")
@ -1149,6 +1226,9 @@ def _postgres_connections(errors: list[str]) -> dict[str, Any]:
try:
postgres["used"] = _vm_scalar("sum(pg_stat_activity_count)")
postgres["max"] = _vm_scalar("max(pg_settings_max_connections)")
postgres["by_db"] = _vm_vector(
"topk(5, sum by (datname) (pg_stat_activity_count))"
)
postgres["hottest_db"] = _vm_topk(
"topk(1, sum by (datname) (pg_stat_activity_count))",
"datname",
@ -1271,6 +1351,11 @@ def _summarize_metrics(errors: list[str]) -> dict[str, Any]:
metrics["top_restarts_1h"] = _vm_vector(
f"topk(5, sum by (namespace,pod) (increase(kube_pod_container_status_restarts_total[{_RESTARTS_WINDOW}])))"
)
metrics["restart_namespace_top"] = _filter_namespace_vector(
_vm_vector(
f"topk(5, sum by (namespace) (increase(kube_pod_container_status_restarts_total[{_RESTARTS_WINDOW}])))"
)
)
metrics["pod_cpu_top"] = _filter_namespace_vector(
_vm_vector(
f'topk(5, sum by (namespace,pod) (rate(container_cpu_usage_seconds_total{{namespace!=""}}[{_RATE_WINDOW}])))'
@ -1369,6 +1454,7 @@ def collect_cluster_state() -> tuple[dict[str, Any], ClusterStateSummary]:
kustomizations = _fetch_flux(errors)
workloads, namespace_pods, namespace_nodes, node_pods, pod_issues = _fetch_pods(errors)
jobs = _fetch_jobs(errors)
longhorn = _fetch_longhorn(errors)
workload_health = _fetch_workload_health(errors)
events = _fetch_events(errors)
@ -1386,6 +1472,7 @@ def collect_cluster_state() -> tuple[dict[str, Any], ClusterStateSummary]:
"node_pods": node_pods,
"pod_issues": pod_issues,
"jobs": jobs,
"longhorn": longhorn,
"workloads_health": workload_health,
"events": events,
"metrics": metrics,