snapshot: add longhorn and disk headroom hints
This commit is contained in:
parent
e0c5f0a2f6
commit
26c6cea40b
@ -76,6 +76,7 @@ def build_summary(snapshot: dict[str, Any] | None) -> dict[str, Any]:
|
|||||||
summary.update(_build_nodes(snapshot))
|
summary.update(_build_nodes(snapshot))
|
||||||
summary.update(_build_pressure(snapshot))
|
summary.update(_build_pressure(snapshot))
|
||||||
summary.update(_build_hardware(nodes_detail))
|
summary.update(_build_hardware(nodes_detail))
|
||||||
|
summary.update(_build_hardware_by_node(nodes_detail))
|
||||||
summary.update(_build_node_ages(nodes_detail))
|
summary.update(_build_node_ages(nodes_detail))
|
||||||
summary.update(_build_node_taints(nodes_detail))
|
summary.update(_build_node_taints(nodes_detail))
|
||||||
summary.update(_build_capacity(metrics))
|
summary.update(_build_capacity(metrics))
|
||||||
@ -89,6 +90,8 @@ def build_summary(snapshot: dict[str, Any] | None) -> dict[str, Any]:
|
|||||||
summary.update(_build_postgres(metrics))
|
summary.update(_build_postgres(metrics))
|
||||||
summary.update(_build_hottest(metrics))
|
summary.update(_build_hottest(metrics))
|
||||||
summary.update(_build_pvc(metrics))
|
summary.update(_build_pvc(metrics))
|
||||||
|
summary.update(_build_longhorn(snapshot))
|
||||||
|
summary.update(_build_root_disk_headroom(metrics))
|
||||||
summary.update(_build_workloads(snapshot))
|
summary.update(_build_workloads(snapshot))
|
||||||
summary.update(_build_flux(snapshot))
|
summary.update(_build_flux(snapshot))
|
||||||
return summary
|
return summary
|
||||||
@ -139,6 +142,18 @@ def _build_hardware(nodes_detail: list[dict[str, Any]]) -> dict[str, Any]:
|
|||||||
return {"hardware": {key: sorted(value) for key, value in hardware.items()}}
|
return {"hardware": {key: sorted(value) for key, value in hardware.items()}}
|
||||||
|
|
||||||
|
|
||||||
|
def _build_hardware_by_node(nodes_detail: list[dict[str, Any]]) -> dict[str, Any]:
|
||||||
|
mapping: dict[str, str] = {}
|
||||||
|
for node in nodes_detail or []:
|
||||||
|
if not isinstance(node, dict):
|
||||||
|
continue
|
||||||
|
name = node.get("name")
|
||||||
|
if isinstance(name, str) and name:
|
||||||
|
hardware = node.get("hardware") or "unknown"
|
||||||
|
mapping[name] = str(hardware)
|
||||||
|
return {"hardware_by_node": mapping} if mapping else {}
|
||||||
|
|
||||||
|
|
||||||
def _build_node_ages(nodes_detail: list[dict[str, Any]]) -> dict[str, Any]:
|
def _build_node_ages(nodes_detail: list[dict[str, Any]]) -> dict[str, Any]:
|
||||||
ages: list[dict[str, Any]] = []
|
ages: list[dict[str, Any]] = []
|
||||||
for node in nodes_detail or []:
|
for node in nodes_detail or []:
|
||||||
@ -174,6 +189,32 @@ def _build_node_taints(nodes_detail: list[dict[str, Any]]) -> dict[str, Any]:
|
|||||||
return {"node_taints": {key: sorted(names) for key, names in taints.items()}}
|
return {"node_taints": {key: sorted(names) for key, names in taints.items()}}
|
||||||
|
|
||||||
|
|
||||||
|
def _build_root_disk_headroom(metrics: dict[str, Any]) -> dict[str, Any]:
|
||||||
|
node_usage = metrics.get("node_usage") if isinstance(metrics.get("node_usage"), dict) else {}
|
||||||
|
disk = node_usage.get("disk") if isinstance(node_usage.get("disk"), list) else []
|
||||||
|
if not disk:
|
||||||
|
return {}
|
||||||
|
entries = []
|
||||||
|
for entry in disk:
|
||||||
|
if not isinstance(entry, dict):
|
||||||
|
continue
|
||||||
|
node = entry.get("node")
|
||||||
|
try:
|
||||||
|
used_pct = float(entry.get("value"))
|
||||||
|
except (TypeError, ValueError):
|
||||||
|
continue
|
||||||
|
headroom = max(0.0, 100.0 - used_pct)
|
||||||
|
if node:
|
||||||
|
entries.append({"node": node, "headroom_pct": headroom, "used_pct": used_pct})
|
||||||
|
entries.sort(key=lambda item: (item.get("headroom_pct") or 0.0, item.get("node") or ""))
|
||||||
|
return {"root_disk_low_headroom": entries[:5]} if entries else {}
|
||||||
|
|
||||||
|
|
||||||
|
def _build_longhorn(snapshot: dict[str, Any]) -> dict[str, Any]:
|
||||||
|
longhorn = snapshot.get("longhorn")
|
||||||
|
return {"longhorn": longhorn} if isinstance(longhorn, dict) and longhorn else {}
|
||||||
|
|
||||||
|
|
||||||
def _build_pods(metrics: dict[str, Any]) -> dict[str, Any]:
|
def _build_pods(metrics: dict[str, Any]) -> dict[str, Any]:
|
||||||
pods = {
|
pods = {
|
||||||
"running": metrics.get("pods_running"),
|
"running": metrics.get("pods_running"),
|
||||||
@ -264,6 +305,7 @@ def _build_postgres(metrics: dict[str, Any]) -> dict[str, Any]:
|
|||||||
"used": postgres.get("used"),
|
"used": postgres.get("used"),
|
||||||
"max": postgres.get("max"),
|
"max": postgres.get("max"),
|
||||||
"hottest_db": postgres.get("hottest_db"),
|
"hottest_db": postgres.get("hottest_db"),
|
||||||
|
"by_db": postgres.get("by_db"),
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -556,6 +598,9 @@ def _append_pod_issues(lines: list[str], summary: dict[str, Any]) -> None:
|
|||||||
pending_line = _format_pod_pending_oldest(pod_issues)
|
pending_line = _format_pod_pending_oldest(pod_issues)
|
||||||
if pending_line:
|
if pending_line:
|
||||||
lines.append(pending_line)
|
lines.append(pending_line)
|
||||||
|
pending_over_line = _format_pod_pending_over_15m(pod_issues)
|
||||||
|
if pending_over_line:
|
||||||
|
lines.append(pending_over_line)
|
||||||
reasons_line = _format_pod_waiting_reasons(pod_issues)
|
reasons_line = _format_pod_waiting_reasons(pod_issues)
|
||||||
if reasons_line:
|
if reasons_line:
|
||||||
lines.append(reasons_line)
|
lines.append(reasons_line)
|
||||||
@ -618,6 +663,17 @@ def _format_pod_waiting_reasons(pod_issues: dict[str, Any]) -> str:
|
|||||||
return "pod_waiting_reasons: " + "; ".join([f"{key}={val}" for key, val in pairs])
|
return "pod_waiting_reasons: " + "; ".join([f"{key}={val}" for key, val in pairs])
|
||||||
|
|
||||||
|
|
||||||
|
def _format_pod_pending_over_15m(pod_issues: dict[str, Any]) -> str:
|
||||||
|
count = pod_issues.get("pending_over_15m")
|
||||||
|
if count is None:
|
||||||
|
return ""
|
||||||
|
try:
|
||||||
|
count_val = int(count)
|
||||||
|
except (TypeError, ValueError):
|
||||||
|
return ""
|
||||||
|
return f"pods_pending_over_15m: {count_val}"
|
||||||
|
|
||||||
|
|
||||||
def _append_workload_health(lines: list[str], summary: dict[str, Any]) -> None:
|
def _append_workload_health(lines: list[str], summary: dict[str, Any]) -> None:
|
||||||
health = summary.get("workloads_health") if isinstance(summary.get("workloads_health"), dict) else {}
|
health = summary.get("workloads_health") if isinstance(summary.get("workloads_health"), dict) else {}
|
||||||
if not health:
|
if not health:
|
||||||
@ -689,6 +745,59 @@ def _append_pvc_usage(lines: list[str], summary: dict[str, Any]) -> None:
|
|||||||
lines.append("pvc_usage_top: " + "; ".join(parts))
|
lines.append("pvc_usage_top: " + "; ".join(parts))
|
||||||
|
|
||||||
|
|
||||||
|
def _append_root_disk_headroom(lines: list[str], summary: dict[str, Any]) -> None:
    """Append a ``root_disk_low_headroom: ...`` line built from the summary.

    Each dict entry of ``summary["root_disk_low_headroom"]`` that has a node
    name and a non-None ``headroom_pct`` is rendered as ``<node>=<pct>%``.
    Nothing is appended when the list is missing, empty, or yields no
    renderable entry.
    """
    entries = summary.get("root_disk_low_headroom")
    if not (isinstance(entries, list) and entries):
        return

    rendered = []
    for item in entries:
        if not isinstance(item, dict):
            continue
        node_name, pct = item.get("node"), item.get("headroom_pct")
        if not node_name or pct is None:
            continue
        rendered.append(f"{node_name}={_format_float(pct)}%")

    if rendered:
        lines.append("root_disk_low_headroom: " + "; ".join(rendered))
|
||||||
|
|
||||||
|
|
||||||
|
def _append_longhorn(lines: list[str], summary: dict[str, Any]) -> None:
|
||||||
|
longhorn = summary.get("longhorn") if isinstance(summary.get("longhorn"), dict) else {}
|
||||||
|
if not longhorn:
|
||||||
|
return
|
||||||
|
total = longhorn.get("total")
|
||||||
|
unhealthy = longhorn.get("unhealthy_count")
|
||||||
|
by_state = longhorn.get("by_state") if isinstance(longhorn.get("by_state"), dict) else {}
|
||||||
|
by_robust = longhorn.get("by_robustness") if isinstance(longhorn.get("by_robustness"), dict) else {}
|
||||||
|
if total is not None:
|
||||||
|
lines.append(
|
||||||
|
"longhorn: total={total}, unhealthy={unhealthy}".format(
|
||||||
|
total=total,
|
||||||
|
unhealthy=unhealthy if unhealthy is not None else 0,
|
||||||
|
)
|
||||||
|
)
|
||||||
|
if by_state:
|
||||||
|
lines.append("longhorn_state: " + _format_kv_map(by_state))
|
||||||
|
if by_robust:
|
||||||
|
lines.append("longhorn_robustness: " + _format_kv_map(by_robust))
|
||||||
|
unhealthy_items = longhorn.get("unhealthy")
|
||||||
|
if isinstance(unhealthy_items, list) and unhealthy_items:
|
||||||
|
parts = []
|
||||||
|
for entry in unhealthy_items[:5]:
|
||||||
|
if not isinstance(entry, dict):
|
||||||
|
continue
|
||||||
|
name = entry.get("name")
|
||||||
|
state = entry.get("state")
|
||||||
|
robustness = entry.get("robustness")
|
||||||
|
if name:
|
||||||
|
label = name
|
||||||
|
if state or robustness:
|
||||||
|
label = f"{label}({state},{robustness})"
|
||||||
|
parts.append(label)
|
||||||
|
if parts:
|
||||||
|
lines.append("longhorn_unhealthy_top: " + "; ".join(parts))
|
||||||
|
|
||||||
|
|
||||||
def _append_namespace_usage(lines: list[str], summary: dict[str, Any]) -> None:
|
def _append_namespace_usage(lines: list[str], summary: dict[str, Any]) -> None:
|
||||||
metrics = summary.get("metrics") if isinstance(summary.get("metrics"), dict) else {}
|
metrics = summary.get("metrics") if isinstance(summary.get("metrics"), dict) else {}
|
||||||
cpu_top = metrics.get("namespace_cpu_top") if isinstance(metrics.get("namespace_cpu_top"), list) else []
|
cpu_top = metrics.get("namespace_cpu_top") if isinstance(metrics.get("namespace_cpu_top"), list) else []
|
||||||
@ -799,7 +908,7 @@ def _append_restarts(lines: list[str], summary: dict[str, Any]) -> None:
|
|||||||
metrics = summary.get("metrics") if isinstance(summary.get("metrics"), dict) else {}
|
metrics = summary.get("metrics") if isinstance(summary.get("metrics"), dict) else {}
|
||||||
top_restarts = metrics.get("top_restarts_1h") or []
|
top_restarts = metrics.get("top_restarts_1h") or []
|
||||||
if not isinstance(top_restarts, list) or not top_restarts:
|
if not isinstance(top_restarts, list) or not top_restarts:
|
||||||
return
|
top_restarts = []
|
||||||
parts = []
|
parts = []
|
||||||
for entry in top_restarts:
|
for entry in top_restarts:
|
||||||
metric = entry.get("metric") if isinstance(entry, dict) else {}
|
metric = entry.get("metric") if isinstance(entry, dict) else {}
|
||||||
@ -813,6 +922,17 @@ def _append_restarts(lines: list[str], summary: dict[str, Any]) -> None:
|
|||||||
parts.append(f"{namespace}/{pod}={count}")
|
parts.append(f"{namespace}/{pod}={count}")
|
||||||
if parts:
|
if parts:
|
||||||
lines.append("restarts_1h_top: " + "; ".join(parts))
|
lines.append("restarts_1h_top: " + "; ".join(parts))
|
||||||
|
ns_top = metrics.get("restart_namespace_top") or []
|
||||||
|
if isinstance(ns_top, list) and ns_top:
|
||||||
|
ns_parts = []
|
||||||
|
for entry in ns_top:
|
||||||
|
metric = entry.get("metric") if isinstance(entry, dict) else {}
|
||||||
|
value = entry.get("value")
|
||||||
|
namespace = metric.get("namespace") if isinstance(metric, dict) else None
|
||||||
|
if namespace and value is not None:
|
||||||
|
ns_parts.append(f"{namespace}={_format_float(value)}")
|
||||||
|
if ns_parts:
|
||||||
|
lines.append("restarts_1h_namespace_top: " + "; ".join(ns_parts))
|
||||||
|
|
||||||
|
|
||||||
def _append_job_failures(lines: list[str], summary: dict[str, Any]) -> None:
|
def _append_job_failures(lines: list[str], summary: dict[str, Any]) -> None:
|
||||||
@ -907,23 +1027,41 @@ def _append_postgres(lines: list[str], summary: dict[str, Any]) -> None:
|
|||||||
hottest=hottest,
|
hottest=hottest,
|
||||||
)
|
)
|
||||||
)
|
)
|
||||||
|
by_db = postgres.get("by_db")
|
||||||
|
if isinstance(by_db, list) and by_db:
|
||||||
|
parts = []
|
||||||
|
for entry in by_db:
|
||||||
|
metric = entry.get("metric") if isinstance(entry, dict) else {}
|
||||||
|
value = entry.get("value")
|
||||||
|
name = metric.get("datname") if isinstance(metric, dict) else None
|
||||||
|
if name and value is not None:
|
||||||
|
parts.append(f"{name}={_format_float(value)}")
|
||||||
|
if parts:
|
||||||
|
lines.append("postgres_connections_by_db: " + "; ".join(parts))
|
||||||
|
|
||||||
|
|
||||||
def _append_hottest(lines: list[str], summary: dict[str, Any]) -> None:
|
def _append_hottest(lines: list[str], summary: dict[str, Any]) -> None:
|
||||||
hottest = summary.get("hottest") if isinstance(summary.get("hottest"), dict) else {}
|
hottest = summary.get("hottest") if isinstance(summary.get("hottest"), dict) else {}
|
||||||
if not hottest:
|
if not hottest:
|
||||||
return
|
return
|
||||||
|
hardware_map = summary.get("hardware_by_node")
|
||||||
|
if not isinstance(hardware_map, dict):
|
||||||
|
hardware_map = {}
|
||||||
parts = []
|
parts = []
|
||||||
for key, entry in hottest.items():
|
for key, entry in hottest.items():
|
||||||
if not isinstance(entry, dict):
|
if not isinstance(entry, dict):
|
||||||
continue
|
continue
|
||||||
node = entry.get("node")
|
node = entry.get("node")
|
||||||
|
hardware = hardware_map.get(node) if node else None
|
||||||
if key in {"net", "io"}:
|
if key in {"net", "io"}:
|
||||||
value = _format_rate_bytes(entry.get("value"))
|
value = _format_rate_bytes(entry.get("value"))
|
||||||
else:
|
else:
|
||||||
value = _format_float(entry.get("value"))
|
value = _format_float(entry.get("value"))
|
||||||
if node:
|
if node:
|
||||||
parts.append(f"{key}={node} ({value})")
|
label = node
|
||||||
|
if hardware:
|
||||||
|
label = f"{label} [{hardware}]"
|
||||||
|
parts.append(f"{key}={label} ({value})")
|
||||||
if parts:
|
if parts:
|
||||||
lines.append("hottest: " + "; ".join(parts))
|
lines.append("hottest: " + "; ".join(parts))
|
||||||
|
|
||||||
@ -1006,6 +1144,8 @@ def summary_text(snapshot: dict[str, Any] | None) -> str:
|
|||||||
_append_postgres(lines, summary)
|
_append_postgres(lines, summary)
|
||||||
_append_hottest(lines, summary)
|
_append_hottest(lines, summary)
|
||||||
_append_pvc_usage(lines, summary)
|
_append_pvc_usage(lines, summary)
|
||||||
|
_append_root_disk_headroom(lines, summary)
|
||||||
|
_append_longhorn(lines, summary)
|
||||||
_append_workloads(lines, summary)
|
_append_workloads(lines, summary)
|
||||||
_append_flux(lines, summary)
|
_append_flux(lines, summary)
|
||||||
_append_units_windows(lines, summary)
|
_append_units_windows(lines, summary)
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user