cluster_state: add delta and pod issue summaries

This commit is contained in:
Brad Stein 2026-02-03 22:48:16 -03:00
parent bdb94ffbe1
commit 5cf80feb33

View File

@ -103,6 +103,8 @@ _POD_WAITING_REASONS = {
"err_image_pull": "ErrImagePull", "err_image_pull": "ErrImagePull",
"create_config_error": "CreateContainerConfigError", "create_config_error": "CreateContainerConfigError",
} }
# Default number of records returned by _delta_top() per metric.
_DELTA_TOP_LIMIT = 6
# Default number of records returned by _reason_top() per reason mapping.
_REASON_TOP_LIMIT = 5
def _node_usage_by_hardware(node_load: list[dict[str, Any]], node_details: list[dict[str, Any]]) -> list[dict[str, Any]]: def _node_usage_by_hardware(node_load: list[dict[str, Any]], node_details: list[dict[str, Any]]) -> list[dict[str, Any]]:
@ -2860,6 +2862,53 @@ def _delta_severity(delta: float) -> str:
return "info" return "info"
def _delta_entry_label(entry: dict[str, Any]) -> tuple[str, str]:
if "node" in entry:
return ("node", str(entry.get("node") or ""))
return ("namespace", str(entry.get("namespace") or ""))
def _delta_top(entries: list[dict[str, Any]], key: str, limit: int = _DELTA_TOP_LIMIT) -> list[dict[str, Any]]:
    """Return the entries with the largest absolute baseline delta for one metric.

    Args:
        entries: Node or namespace context records. Records that are not dicts,
            or that have no usable numeric delta for ``key`` under their
            ``baseline_delta`` mapping, are skipped.
        key: Metric name to look up inside each record's ``baseline_delta``.
        limit: Maximum number of records to return.

    Returns:
        Records shaped ``{<"node" or "namespace">: label, "metric": key,
        "delta": delta, "severity": severity}``, ordered by descending
        absolute delta and then by label.
    """
    import math

    ranked: list[tuple[tuple[float, str], dict[str, Any]]] = []
    for entry in entries or []:
        if not isinstance(entry, dict):
            continue
        deltas = entry.get("baseline_delta")
        if not isinstance(deltas, dict):
            continue
        delta = deltas.get(key)
        if not isinstance(delta, (int, float)):
            continue
        # NaN has no ordering, so letting it through would make the sort
        # below produce an unreliable ranking.
        if isinstance(delta, float) and math.isnan(delta):
            continue
        label_key, label_value = _delta_entry_label(entry)
        record = {
            label_key: label_value,
            "metric": key,
            "delta": delta,
            "severity": _delta_severity(float(delta)),
        }
        # Every record shares the same metric, so ties are broken on the
        # label to keep the ranking independent of input order.
        ranked.append(((-abs(delta), label_value), record))
    ranked.sort(key=lambda pair: pair[0])
    return [record for _, record in ranked[:limit]]
def _reason_top(counts: dict[str, Any], limit: int = _REASON_TOP_LIMIT) -> list[dict[str, Any]]:
    """Return the highest-count reasons from a reason-to-count mapping.

    Args:
        counts: Mapping of reason name to numeric count. Anything that is not
            a dict yields an empty result. Items with an empty or non-string
            reason, or a non-numeric or non-finite count, are skipped.
        limit: Maximum number of records to return.

    Returns:
        Records shaped ``{"reason": reason, "count": int(count)}``, ordered by
        descending count and then by reason name.
    """
    import math

    if not isinstance(counts, dict):
        return []
    output: list[dict[str, Any]] = []
    for reason, value in counts.items():
        if not (isinstance(reason, str) and reason):
            continue
        if not isinstance(value, (int, float)):
            continue
        # int() raises ValueError on NaN and OverflowError on infinity; skip
        # such values instead of failing the whole summary.
        if isinstance(value, float) and not math.isfinite(value):
            continue
        output.append({"reason": reason, "count": int(value)})
    output.sort(key=lambda item: (-item["count"], item["reason"]))
    return output[:limit]
def _pod_issue_summary(pod_issues: dict[str, Any], metrics: dict[str, Any]) -> dict[str, Any]:
    """Build a compact summary of pod issues.

    The result holds the top waiting reasons and top phase reasons taken from
    ``pod_issues`` (via ``_reason_top``), plus the ``namespace_issue_top``
    value from ``metrics``, which falls back to an empty dict when missing or
    falsy. A ``pod_issues`` value that is not a dict is treated as having no
    reasons.
    """
    if isinstance(pod_issues, dict):
        waiting_counts = pod_issues.get("waiting_reasons")
        phase_counts = pod_issues.get("phase_reasons")
    else:
        waiting_counts, phase_counts = {}, {}

    summary: dict[str, Any] = {}
    summary["waiting_reasons_top"] = _reason_top(waiting_counts)
    summary["phase_reasons_top"] = _reason_top(phase_counts)
    summary["namespace_issue_top"] = metrics.get("namespace_issue_top") or {}
    return summary
def _delta_hit(delta: Any) -> bool: def _delta_hit(delta: Any) -> bool:
if not isinstance(delta, (int, float)): if not isinstance(delta, (int, float)):
return False return False
@ -3546,9 +3595,13 @@ def collect_cluster_state() -> tuple[dict[str, Any], ClusterStateSummary]:
}, },
"pressure_summary": pressure_summary, "pressure_summary": pressure_summary,
"trend_summary": metrics.get("trend_summary"), "trend_summary": metrics.get("trend_summary"),
"trend_requests": metrics.get("namespace_request_trends"),
"time_series": metrics.get("cluster_trends"), "time_series": metrics.get("cluster_trends"),
"node_condition_trends": metrics.get("node_condition_trends"), "node_condition_trends": metrics.get("node_condition_trends"),
"pod_waiting_trends": metrics.get("pod_waiting_trends"),
"pod_terminated_trends": metrics.get("pod_terminated_trends"),
"pod_reason_totals": metrics.get("pod_reason_totals"), "pod_reason_totals": metrics.get("pod_reason_totals"),
"pod_issue_summary": _pod_issue_summary(pod_issues, metrics),
"offenders": _build_offenders(metrics), "offenders": _build_offenders(metrics),
"alerts": metrics.get("alerts", {}), "alerts": metrics.get("alerts", {}),
"top": { "top": {
@ -3573,6 +3626,19 @@ def collect_cluster_state() -> tuple[dict[str, Any], ClusterStateSummary]:
"workloads": _workload_index(workloads), "workloads": _workload_index(workloads),
"namespaces": _namespace_nodes_top(namespace_context, 12), "namespaces": _namespace_nodes_top(namespace_context, 12),
}, },
"baseline_deltas": {
"nodes": {
"cpu": _delta_top(node_context, "cpu"),
"ram": _delta_top(node_context, "ram"),
"net": _delta_top(node_context, "net"),
"io": _delta_top(node_context, "io"),
"disk": _delta_top(node_context, "disk"),
},
"namespaces": {
"cpu": _delta_top(namespace_context, "cpu"),
"mem": _delta_top(namespace_context, "mem"),
},
},
"attention_ranked": _build_attention_ranked(metrics, node_context, pod_issues, workload_health), "attention_ranked": _build_attention_ranked(metrics, node_context, pod_issues, workload_health),
"signals": signals, "signals": signals,
"profiles": profiles, "profiles": profiles,