snapshot: add longhorn volume summary
parent c9708a83ea
commit a6062be60e
@@ -61,6 +61,7 @@ _PHASE_SEVERITY = {
     "Pending": 2,
     "Unknown": 1,
 }
+_PENDING_15M_HOURS = 0.25
 
 
 @dataclass(frozen=True)
@@ -711,13 +712,21 @@ def _node_pod_finalize(nodes: dict[str, dict[str, Any]]) -> list[dict[str, Any]]
     return output
 
 
-def _summarize_pod_issues(payload: dict[str, Any]) -> dict[str, Any]:
-    items: list[dict[str, Any]] = []
-    counts: dict[str, int] = {key: 0 for key in _PHASE_SEVERITY}
-    pending_oldest: list[dict[str, Any]] = []
-    waiting_reason_counts: dict[str, int] = {}
-    phase_reason_counts: dict[str, int] = {}
-    for pod in _items(payload):
+def _record_pending_pod(
+    pending_oldest: list[dict[str, Any]],
+    info: dict[str, Any],
+) -> bool:
+    age_hours = info.get("age_hours")
+    if age_hours is None:
+        return False
+    pending_oldest.append(info)
+    return age_hours >= _PENDING_15M_HOURS
+
+
+def _update_pod_issue(
+    pod: dict[str, Any],
+    acc: dict[str, Any],
+) -> None:
     metadata = pod.get("metadata") if isinstance(pod.get("metadata"), dict) else {}
     status = pod.get("status") if isinstance(pod.get("status"), dict) else {}
     spec = pod.get("spec") if isinstance(pod.get("spec"), dict) else {}
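For context, this refactor splits the old monolithic _summarize_pod_issues into per-pod helpers that write into a shared accumulator dict. A minimal sketch of the intended call pattern for the new helper; the namespace, pod, and node values below are fabricated for illustration:

# Illustrative only: exercises _record_pending_pod with a fabricated info dict.
pending_oldest: list[dict] = []
info = {"namespace": "media", "pod": "app-0", "node": "worker-1", "age_hours": 0.5, "reason": ""}

# Appends because age_hours is present; returns True because 0.5 h >= _PENDING_15M_HOURS (0.25 h).
over_15m = _record_pending_pod(pending_oldest, info)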
@@ -730,7 +739,7 @@ def _summarize_pod_issues(payload: dict[str, Any]) -> dict[str, Any]:
     )
     age_hours = _age_hours(created_at)
     if not name or not namespace:
-        continue
+        return
     phase = status.get("phase") if isinstance(status.get("phase"), str) else ""
     restarts = 0
     waiting_reasons: list[str] = []
@@ -743,14 +752,14 @@ def _summarize_pod_issues(payload: dict[str, Any]) -> dict[str, Any]:
            reason = waiting.get("reason")
            if isinstance(reason, str) and reason:
                waiting_reasons.append(reason)
-                waiting_reason_counts[reason] = waiting_reason_counts.get(reason, 0) + 1
+                acc["waiting_reasons"][reason] = acc["waiting_reasons"].get(reason, 0) + 1
     phase_reason = status.get("reason")
     if isinstance(phase_reason, str) and phase_reason:
-        phase_reason_counts[phase_reason] = phase_reason_counts.get(phase_reason, 0) + 1
-    if phase in counts:
-        counts[phase] += 1
+        acc["phase_reasons"][phase_reason] = acc["phase_reasons"].get(phase_reason, 0) + 1
+    if phase in acc["counts"]:
+        acc["counts"][phase] += 1
     if phase in _PHASE_SEVERITY or restarts > 0:
-        items.append(
+        acc["items"].append(
             {
                 "namespace": namespace,
                 "pod": name,
@@ -763,16 +772,31 @@ def _summarize_pod_issues(payload: dict[str, Any]) -> dict[str, Any]:
                 "age_hours": age_hours,
             }
         )
-    if phase == "Pending" and age_hours is not None:
-        pending_oldest.append(
-            {
+    if phase == "Pending":
+        info = {
             "namespace": namespace,
             "pod": name,
             "node": spec.get("nodeName") or "",
             "age_hours": age_hours,
             "reason": status.get("reason") or "",
         }
-        )
+        if _record_pending_pod(acc["pending_oldest"], info):
+            acc["pending_over_15m"] += 1
+
+
+def _summarize_pod_issues(payload: dict[str, Any]) -> dict[str, Any]:
+    acc = {
+        "items": [],
+        "counts": {key: 0 for key in _PHASE_SEVERITY},
+        "pending_oldest": [],
+        "pending_over_15m": 0,
+        "waiting_reasons": {},
+        "phase_reasons": {},
+    }
+    for pod in _items(payload):
+        if isinstance(pod, dict):
+            _update_pod_issue(pod, acc)
+    items = acc["items"]
     items.sort(
         key=lambda item: (
             -_PHASE_SEVERITY.get(item.get("phase") or "", 0),
@@ -781,13 +805,15 @@ def _summarize_pod_issues(payload: dict[str, Any]) -> dict[str, Any]:
             item.get("pod") or "",
         )
     )
+    pending_oldest = acc["pending_oldest"]
     pending_oldest.sort(key=lambda item: -(item.get("age_hours") or 0.0))
     return {
-        "counts": counts,
+        "counts": acc["counts"],
         "items": items[:20],
         "pending_oldest": pending_oldest[:10],
-        "waiting_reasons": waiting_reason_counts,
-        "phase_reasons": phase_reason_counts,
+        "pending_over_15m": acc["pending_over_15m"],
+        "waiting_reasons": acc["waiting_reasons"],
+        "phase_reasons": acc["phase_reasons"],
     }
 
 
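After this change the summary keeps its previous keys and gains pending_over_15m. An illustrative return value, with fabricated counts and reasons and only the two _PHASE_SEVERITY phases visible in this diff:

# Illustrative shape only; numbers and reasons are fabricated.
summary = {
    "counts": {"Pending": 1, "Unknown": 0},      # one bucket per _PHASE_SEVERITY phase
    "items": [],                                 # up to 20 problem pods, highest severity first
    "pending_oldest": [],                        # up to 10 Pending pods, oldest first
    "pending_over_15m": 1,                       # Pending pods with age_hours >= _PENDING_15M_HOURS
    "waiting_reasons": {"ImagePullBackOff": 1},
    "phase_reasons": {},
}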
@@ -1036,6 +1062,57 @@ def _fetch_jobs(errors: list[str]) -> dict[str, Any]:
         return {}
 
 
+def _summarize_longhorn_volumes(payload: dict[str, Any]) -> dict[str, Any]:
+    items = _items(payload)
+    if not items:
+        return {}
+    by_state: dict[str, int] = {}
+    by_robustness: dict[str, int] = {}
+    unhealthy: list[dict[str, Any]] = []
+    for volume in items:
+        metadata = volume.get("metadata") if isinstance(volume.get("metadata"), dict) else {}
+        status = volume.get("status") if isinstance(volume.get("status"), dict) else {}
+        spec = volume.get("spec") if isinstance(volume.get("spec"), dict) else {}
+        name = metadata.get("name") if isinstance(metadata.get("name"), str) else ""
+        if not name:
+            continue
+        state = status.get("state") if isinstance(status.get("state"), str) else "unknown"
+        robustness = (
+            status.get("robustness") if isinstance(status.get("robustness"), str) else "unknown"
+        )
+        by_state[state] = by_state.get(state, 0) + 1
+        by_robustness[robustness] = by_robustness.get(robustness, 0) + 1
+        if state.lower() != "attached" or robustness.lower() != "healthy":
+            unhealthy.append(
+                {
+                    "name": name,
+                    "state": state,
+                    "robustness": robustness,
+                    "size": spec.get("size"),
+                    "actual_size": status.get("actualSize"),
+                }
+            )
+    unhealthy.sort(key=lambda item: item.get("name") or "")
+    return {
+        "total": len(items),
+        "by_state": by_state,
+        "by_robustness": by_robustness,
+        "unhealthy": unhealthy,
+        "unhealthy_count": len(unhealthy),
+    }
+
+
+def _fetch_longhorn(errors: list[str]) -> dict[str, Any]:
+    try:
+        payload = get_json(
+            "/apis/longhorn.io/v1beta2/namespaces/longhorn-system/volumes"
+        )
+        return _summarize_longhorn_volumes(payload)
+    except Exception as exc:
+        errors.append(f"longhorn: {exc}")
+        return {}
+
+
 def _fetch_workload_health(errors: list[str]) -> dict[str, Any]:
     try:
         deployments_payload = get_json("/apis/apps/v1/deployments?limit=2000")
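The summarizer only relies on a handful of fields from each Longhorn volume CR. A minimal sketch with a fabricated payload (one healthy attached volume, one degraded one), assuming _items returns the list under payload["items"]:

# Fabricated payload; field names follow the Longhorn v1beta2 volume CR as used above.
payload = {
    "items": [
        {
            "metadata": {"name": "pvc-aaa"},
            "spec": {"size": "10737418240"},
            "status": {"state": "attached", "robustness": "healthy", "actualSize": 1234567},
        },
        {
            "metadata": {"name": "pvc-bbb"},
            "spec": {"size": "5368709120"},
            "status": {"state": "attached", "robustness": "degraded", "actualSize": 42},
        },
    ]
}

summary = _summarize_longhorn_volumes(payload)
# Expected, given the code above:
#   summary["total"] == 2
#   summary["by_robustness"] == {"healthy": 1, "degraded": 1}
#   summary["unhealthy_count"] == 1   # only pvc-bbb, because its robustness is not "healthy"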
@@ -1149,6 +1226,9 @@ def _postgres_connections(errors: list[str]) -> dict[str, Any]:
     try:
         postgres["used"] = _vm_scalar("sum(pg_stat_activity_count)")
         postgres["max"] = _vm_scalar("max(pg_settings_max_connections)")
+        postgres["by_db"] = _vm_vector(
+            "topk(5, sum by (datname) (pg_stat_activity_count))"
+        )
         postgres["hottest_db"] = _vm_topk(
             "topk(1, sum by (datname) (pg_stat_activity_count))",
             "datname",
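The new by_db query is plain PromQL over the postgres_exporter metric: topk(5, sum by (datname) (pg_stat_activity_count)) keeps at most five series, one per datname label, each carrying the summed connection count. A sketch of the kind of vector it resolves to; the values are fabricated and the exact return structure of _vm_vector is not shown in this diff:

# Hypothetical result shape; adjust to whatever _vm_vector actually returns.
by_db = [
    {"datname": "gitea", "value": 12},
    {"datname": "nextcloud", "value": 7},
]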
@@ -1271,6 +1351,11 @@ def _summarize_metrics(errors: list[str]) -> dict[str, Any]:
     metrics["top_restarts_1h"] = _vm_vector(
         f"topk(5, sum by (namespace,pod) (increase(kube_pod_container_status_restarts_total[{_RESTARTS_WINDOW}])))"
     )
+    metrics["restart_namespace_top"] = _filter_namespace_vector(
+        _vm_vector(
+            f"topk(5, sum by (namespace) (increase(kube_pod_container_status_restarts_total[{_RESTARTS_WINDOW}])))"
+        )
+    )
     metrics["pod_cpu_top"] = _filter_namespace_vector(
         _vm_vector(
             f'topk(5, sum by (namespace,pod) (rate(container_cpu_usage_seconds_total{{namespace!=""}}[{_RATE_WINDOW}])))'
@@ -1369,6 +1454,7 @@ def collect_cluster_state() -> tuple[dict[str, Any], ClusterStateSummary]:
     kustomizations = _fetch_flux(errors)
     workloads, namespace_pods, namespace_nodes, node_pods, pod_issues = _fetch_pods(errors)
     jobs = _fetch_jobs(errors)
+    longhorn = _fetch_longhorn(errors)
     workload_health = _fetch_workload_health(errors)
     events = _fetch_events(errors)
 
@@ -1386,6 +1472,7 @@ def collect_cluster_state() -> tuple[dict[str, Any], ClusterStateSummary]:
         "node_pods": node_pods,
         "pod_issues": pod_issues,
         "jobs": jobs,
+        "longhorn": longhorn,
         "workloads_health": workload_health,
         "events": events,
         "metrics": metrics,
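With the wiring above, the Longhorn summary travels in the snapshot dict alongside the other collectors, so downstream consumers can read it defensively. A small illustrative consumer, assuming the dict shown above is the first element of the tuple returned by collect_cluster_state():

# Hypothetical consumer of the snapshot produced by collect_cluster_state().
state, cluster_summary = collect_cluster_state()
longhorn = state.get("longhorn") or {}
if longhorn.get("unhealthy_count"):
    for volume in longhorn.get("unhealthy", []):
        print(f"longhorn volume {volume['name']}: {volume['state']}/{volume['robustness']}")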