snapshot: add longhorn volume summary

This commit is contained in:
Brad Stein 2026-01-29 07:43:37 -03:00
parent c9708a83ea
commit a6062be60e

View File

@ -61,6 +61,7 @@ _PHASE_SEVERITY = {
"Pending": 2, "Pending": 2,
"Unknown": 1, "Unknown": 1,
} }
_PENDING_15M_HOURS = 0.25
@dataclass(frozen=True) @dataclass(frozen=True)
@ -711,68 +712,91 @@ def _node_pod_finalize(nodes: dict[str, dict[str, Any]]) -> list[dict[str, Any]]
return output return output
def _summarize_pod_issues(payload: dict[str, Any]) -> dict[str, Any]: def _record_pending_pod(
items: list[dict[str, Any]] = [] pending_oldest: list[dict[str, Any]],
counts: dict[str, int] = {key: 0 for key in _PHASE_SEVERITY} info: dict[str, Any],
pending_oldest: list[dict[str, Any]] = [] ) -> bool:
waiting_reason_counts: dict[str, int] = {} age_hours = info.get("age_hours")
phase_reason_counts: dict[str, int] = {} if age_hours is None:
for pod in _items(payload): return False
metadata = pod.get("metadata") if isinstance(pod.get("metadata"), dict) else {} pending_oldest.append(info)
status = pod.get("status") if isinstance(pod.get("status"), dict) else {} return age_hours >= _PENDING_15M_HOURS
spec = pod.get("spec") if isinstance(pod.get("spec"), dict) else {}
namespace = metadata.get("namespace") if isinstance(metadata.get("namespace"), str) else ""
name = metadata.get("name") if isinstance(metadata.get("name"), str) else "" def _update_pod_issue(
created_at = ( pod: dict[str, Any],
metadata.get("creationTimestamp") acc: dict[str, Any],
if isinstance(metadata.get("creationTimestamp"), str) ) -> None:
else "" metadata = pod.get("metadata") if isinstance(pod.get("metadata"), dict) else {}
) status = pod.get("status") if isinstance(pod.get("status"), dict) else {}
age_hours = _age_hours(created_at) spec = pod.get("spec") if isinstance(pod.get("spec"), dict) else {}
if not name or not namespace: namespace = metadata.get("namespace") if isinstance(metadata.get("namespace"), str) else ""
name = metadata.get("name") if isinstance(metadata.get("name"), str) else ""
created_at = (
metadata.get("creationTimestamp")
if isinstance(metadata.get("creationTimestamp"), str)
else ""
)
age_hours = _age_hours(created_at)
if not name or not namespace:
return
phase = status.get("phase") if isinstance(status.get("phase"), str) else ""
restarts = 0
waiting_reasons: list[str] = []
for container in status.get("containerStatuses") or []:
if not isinstance(container, dict):
continue continue
phase = status.get("phase") if isinstance(status.get("phase"), str) else "" restarts += int(container.get("restartCount") or 0)
restarts = 0 state = container.get("state") if isinstance(container.get("state"), dict) else {}
waiting_reasons: list[str] = [] waiting = state.get("waiting") if isinstance(state.get("waiting"), dict) else {}
for container in status.get("containerStatuses") or []: reason = waiting.get("reason")
if not isinstance(container, dict): if isinstance(reason, str) and reason:
continue waiting_reasons.append(reason)
restarts += int(container.get("restartCount") or 0) acc["waiting_reasons"][reason] = acc["waiting_reasons"].get(reason, 0) + 1
state = container.get("state") if isinstance(container.get("state"), dict) else {} phase_reason = status.get("reason")
waiting = state.get("waiting") if isinstance(state.get("waiting"), dict) else {} if isinstance(phase_reason, str) and phase_reason:
reason = waiting.get("reason") acc["phase_reasons"][phase_reason] = acc["phase_reasons"].get(phase_reason, 0) + 1
if isinstance(reason, str) and reason: if phase in acc["counts"]:
waiting_reasons.append(reason) acc["counts"][phase] += 1
waiting_reason_counts[reason] = waiting_reason_counts.get(reason, 0) + 1 if phase in _PHASE_SEVERITY or restarts > 0:
phase_reason = status.get("reason") acc["items"].append(
if isinstance(phase_reason, str) and phase_reason: {
phase_reason_counts[phase_reason] = phase_reason_counts.get(phase_reason, 0) + 1 "namespace": namespace,
if phase in counts: "pod": name,
counts[phase] += 1 "node": spec.get("nodeName") or "",
if phase in _PHASE_SEVERITY or restarts > 0: "phase": phase,
items.append( "reason": status.get("reason") or "",
{ "restarts": restarts,
"namespace": namespace, "waiting_reasons": sorted(set(waiting_reasons)),
"pod": name, "created_at": created_at,
"node": spec.get("nodeName") or "", "age_hours": age_hours,
"phase": phase, }
"reason": status.get("reason") or "", )
"restarts": restarts, if phase == "Pending":
"waiting_reasons": sorted(set(waiting_reasons)), info = {
"created_at": created_at, "namespace": namespace,
"age_hours": age_hours, "pod": name,
} "node": spec.get("nodeName") or "",
) "age_hours": age_hours,
if phase == "Pending" and age_hours is not None: "reason": status.get("reason") or "",
pending_oldest.append( }
{ if _record_pending_pod(acc["pending_oldest"], info):
"namespace": namespace, acc["pending_over_15m"] += 1
"pod": name,
"node": spec.get("nodeName") or "",
"age_hours": age_hours, def _summarize_pod_issues(payload: dict[str, Any]) -> dict[str, Any]:
"reason": status.get("reason") or "", acc = {
} "items": [],
) "counts": {key: 0 for key in _PHASE_SEVERITY},
"pending_oldest": [],
"pending_over_15m": 0,
"waiting_reasons": {},
"phase_reasons": {},
}
for pod in _items(payload):
if isinstance(pod, dict):
_update_pod_issue(pod, acc)
items = acc["items"]
items.sort( items.sort(
key=lambda item: ( key=lambda item: (
-_PHASE_SEVERITY.get(item.get("phase") or "", 0), -_PHASE_SEVERITY.get(item.get("phase") or "", 0),
@ -781,13 +805,15 @@ def _summarize_pod_issues(payload: dict[str, Any]) -> dict[str, Any]:
item.get("pod") or "", item.get("pod") or "",
) )
) )
pending_oldest = acc["pending_oldest"]
pending_oldest.sort(key=lambda item: -(item.get("age_hours") or 0.0)) pending_oldest.sort(key=lambda item: -(item.get("age_hours") or 0.0))
return { return {
"counts": counts, "counts": acc["counts"],
"items": items[:20], "items": items[:20],
"pending_oldest": pending_oldest[:10], "pending_oldest": pending_oldest[:10],
"waiting_reasons": waiting_reason_counts, "pending_over_15m": acc["pending_over_15m"],
"phase_reasons": phase_reason_counts, "waiting_reasons": acc["waiting_reasons"],
"phase_reasons": acc["phase_reasons"],
} }
@ -1036,6 +1062,57 @@ def _fetch_jobs(errors: list[str]) -> dict[str, Any]:
return {} return {}
def _summarize_longhorn_volumes(payload: dict[str, Any]) -> dict[str, Any]:
    """Summarize Longhorn volume objects by state and robustness.

    Args:
        payload: List-style API response; its entries are extracted with
            ``_items``.

    Returns:
        An empty dict when ``_items`` yields nothing. Otherwise a dict with:

        * ``total`` - number of entries returned by ``_items`` (this includes
          entries that are skipped below for being malformed or unnamed).
        * ``by_state`` / ``by_robustness`` - counts keyed by the reported
          string, or ``"unknown"`` when the field is missing or not a string.
        * ``unhealthy`` - name-sorted records for every volume that is not
          both ``attached`` and ``healthy`` (compared case-insensitively).
        * ``unhealthy_count`` - length of ``unhealthy``.
    """
    items = _items(payload)
    if not items:
        return {}
    by_state: dict[str, int] = {}
    by_robustness: dict[str, int] = {}
    unhealthy: list[dict[str, Any]] = []
    for volume in items:
        # A non-dict entry would raise AttributeError on .get() and discard
        # the whole summary; skip it, as the pod summary does for its items.
        if not isinstance(volume, dict):
            continue
        metadata = volume.get("metadata") if isinstance(volume.get("metadata"), dict) else {}
        status = volume.get("status") if isinstance(volume.get("status"), dict) else {}
        spec = volume.get("spec") if isinstance(volume.get("spec"), dict) else {}
        name = metadata.get("name") if isinstance(metadata.get("name"), str) else ""
        if not name:
            # Unnamed volumes cannot be reported meaningfully.
            continue
        state = status.get("state") if isinstance(status.get("state"), str) else "unknown"
        robustness = (
            status.get("robustness") if isinstance(status.get("robustness"), str) else "unknown"
        )
        by_state[state] = by_state.get(state, 0) + 1
        by_robustness[robustness] = by_robustness.get(robustness, 0) + 1
        if state.lower() != "attached" or robustness.lower() != "healthy":
            unhealthy.append(
                {
                    "name": name,
                    "state": state,
                    "robustness": robustness,
                    "size": spec.get("size"),
                    "actual_size": status.get("actualSize"),
                }
            )
    unhealthy.sort(key=lambda item: item.get("name") or "")
    return {
        "total": len(items),
        "by_state": by_state,
        "by_robustness": by_robustness,
        "unhealthy": unhealthy,
        "unhealthy_count": len(unhealthy),
    }
def _fetch_longhorn(errors: list[str]) -> dict[str, Any]:
    """Fetch the Longhorn volume list and return its summary.

    Any exception raised while fetching or summarizing is appended to
    ``errors`` with a ``longhorn:`` prefix, and an empty dict is returned.
    """
    volumes_path = "/apis/longhorn.io/v1beta2/namespaces/longhorn-system/volumes"
    try:
        # Summarizing stays inside the try so its failures are recorded too.
        return _summarize_longhorn_volumes(get_json(volumes_path))
    except Exception as exc:
        errors.append(f"longhorn: {exc}")
    return {}
def _fetch_workload_health(errors: list[str]) -> dict[str, Any]: def _fetch_workload_health(errors: list[str]) -> dict[str, Any]:
try: try:
deployments_payload = get_json("/apis/apps/v1/deployments?limit=2000") deployments_payload = get_json("/apis/apps/v1/deployments?limit=2000")
@ -1149,6 +1226,9 @@ def _postgres_connections(errors: list[str]) -> dict[str, Any]:
try: try:
postgres["used"] = _vm_scalar("sum(pg_stat_activity_count)") postgres["used"] = _vm_scalar("sum(pg_stat_activity_count)")
postgres["max"] = _vm_scalar("max(pg_settings_max_connections)") postgres["max"] = _vm_scalar("max(pg_settings_max_connections)")
postgres["by_db"] = _vm_vector(
"topk(5, sum by (datname) (pg_stat_activity_count))"
)
postgres["hottest_db"] = _vm_topk( postgres["hottest_db"] = _vm_topk(
"topk(1, sum by (datname) (pg_stat_activity_count))", "topk(1, sum by (datname) (pg_stat_activity_count))",
"datname", "datname",
@ -1271,6 +1351,11 @@ def _summarize_metrics(errors: list[str]) -> dict[str, Any]:
metrics["top_restarts_1h"] = _vm_vector( metrics["top_restarts_1h"] = _vm_vector(
f"topk(5, sum by (namespace,pod) (increase(kube_pod_container_status_restarts_total[{_RESTARTS_WINDOW}])))" f"topk(5, sum by (namespace,pod) (increase(kube_pod_container_status_restarts_total[{_RESTARTS_WINDOW}])))"
) )
metrics["restart_namespace_top"] = _filter_namespace_vector(
_vm_vector(
f"topk(5, sum by (namespace) (increase(kube_pod_container_status_restarts_total[{_RESTARTS_WINDOW}])))"
)
)
metrics["pod_cpu_top"] = _filter_namespace_vector( metrics["pod_cpu_top"] = _filter_namespace_vector(
_vm_vector( _vm_vector(
f'topk(5, sum by (namespace,pod) (rate(container_cpu_usage_seconds_total{{namespace!=""}}[{_RATE_WINDOW}])))' f'topk(5, sum by (namespace,pod) (rate(container_cpu_usage_seconds_total{{namespace!=""}}[{_RATE_WINDOW}])))'
@ -1369,6 +1454,7 @@ def collect_cluster_state() -> tuple[dict[str, Any], ClusterStateSummary]:
kustomizations = _fetch_flux(errors) kustomizations = _fetch_flux(errors)
workloads, namespace_pods, namespace_nodes, node_pods, pod_issues = _fetch_pods(errors) workloads, namespace_pods, namespace_nodes, node_pods, pod_issues = _fetch_pods(errors)
jobs = _fetch_jobs(errors) jobs = _fetch_jobs(errors)
longhorn = _fetch_longhorn(errors)
workload_health = _fetch_workload_health(errors) workload_health = _fetch_workload_health(errors)
events = _fetch_events(errors) events = _fetch_events(errors)
@ -1386,6 +1472,7 @@ def collect_cluster_state() -> tuple[dict[str, Any], ClusterStateSummary]:
"node_pods": node_pods, "node_pods": node_pods,
"pod_issues": pod_issues, "pod_issues": pod_issues,
"jobs": jobs, "jobs": jobs,
"longhorn": longhorn,
"workloads_health": workload_health, "workloads_health": workload_health,
"events": events, "events": events,
"metrics": metrics, "metrics": metrics,