cluster_state: add delta and pod issue summaries
This commit is contained in:
parent
bdb94ffbe1
commit
5cf80feb33
@ -103,6 +103,8 @@ _POD_WAITING_REASONS = {
|
|||||||
"err_image_pull": "ErrImagePull",
|
"err_image_pull": "ErrImagePull",
|
||||||
"create_config_error": "CreateContainerConfigError",
|
"create_config_error": "CreateContainerConfigError",
|
||||||
}
|
}
|
||||||
|
# Default cap on how many ranked rows _delta_top returns for a single metric.
_DELTA_TOP_LIMIT = 6
|
||||||
|
# Default cap on how many reason/count rows _reason_top returns.
_REASON_TOP_LIMIT = 5
|
||||||
|
|
||||||
|
|
||||||
def _node_usage_by_hardware(node_load: list[dict[str, Any]], node_details: list[dict[str, Any]]) -> list[dict[str, Any]]:
|
def _node_usage_by_hardware(node_load: list[dict[str, Any]], node_details: list[dict[str, Any]]) -> list[dict[str, Any]]:
|
||||||
@ -2860,6 +2862,53 @@ def _delta_severity(delta: float) -> str:
|
|||||||
return "info"
|
return "info"
|
||||||
|
|
||||||
|
|
||||||
|
def _delta_entry_label(entry: dict[str, Any]) -> tuple[str, str]:
    """Return the ``(label_key, label_value)`` pair that identifies *entry*.

    An entry that has a ``"node"`` key is labelled by node; any other entry is
    labelled by namespace. A missing or falsy label value is reported as an
    empty string, otherwise the value is converted with ``str``.
    """
    label_key = "node" if "node" in entry else "namespace"
    raw_value = entry.get(label_key)
    return label_key, str(raw_value or "")
|
||||||
|
|
||||||
|
|
||||||
|
def _delta_top(entries: list[dict[str, Any]], key: str, limit: int = _DELTA_TOP_LIMIT) -> list[dict[str, Any]]:
    """Rank entries by the magnitude of one baseline delta and keep the largest.

    Args:
        entries: Context entries; each is expected to carry a
            ``"baseline_delta"`` dict. Non-dict entries are ignored.
        key: Name of the metric to read from each entry's ``"baseline_delta"``.
        limit: Maximum number of rows to return.

    Returns:
        Rows of the form ``{<"node"|"namespace">: label, "metric": key,
        "delta": value, "severity": ...}``, ordered by descending absolute
        delta and truncated to ``limit``. Entries whose delta for ``key`` is
        missing or not an ``int``/``float`` are skipped.
    """
    ranked: list[dict[str, Any]] = []
    for candidate in entries:
        if not isinstance(candidate, dict):
            continue
        baseline = candidate.get("baseline_delta")
        value = baseline.get(key) if isinstance(baseline, dict) else None
        if not isinstance(value, (int, float)):
            continue
        name_field, name = _delta_entry_label(candidate)
        row = {
            name_field: name,
            "metric": key,
            "delta": value,
            "severity": _delta_severity(float(value)),
        }
        ranked.append(row)

    def _rank(row: dict[str, Any]) -> tuple[Any, Any]:
        # Largest absolute change first; the metric name is the secondary key.
        return (-(abs(row.get("delta") or 0)), row.get("metric") or "")

    return sorted(ranked, key=_rank)[:limit]
|
||||||
|
|
||||||
|
|
||||||
|
def _reason_top(counts: dict[str, Any], limit: int = _REASON_TOP_LIMIT) -> list[dict[str, Any]]:
    """Return the most frequent reasons as ``{"reason", "count"}`` rows.

    Args:
        counts: Mapping of reason name to occurrence count. Any value that is
            not a dict produces an empty result.
        limit: Maximum number of rows to return.

    Returns:
        Rows sorted by descending count, with ties broken alphabetically by
        reason, truncated to ``limit``. Entries are skipped when the reason is
        empty or not a string, when the value is not an ``int``/``float``, or
        when the value is a non-finite float that cannot be turned into an
        integer count.
    """
    if not isinstance(counts, dict):
        return []

    rows: list[dict[str, Any]] = []
    for reason, value in counts.items():
        if not (isinstance(reason, str) and reason):
            continue
        if not isinstance(value, (int, float)):
            continue
        try:
            count = int(value)
        except (ValueError, OverflowError):
            # int() raises ValueError for NaN and OverflowError for +/-inf.
            # One bad sample should not abort the whole summary, so drop it.
            continue
        rows.append({"reason": reason, "count": count})

    rows.sort(key=lambda row: (-row["count"], row["reason"]))
    return rows[:limit]
|
||||||
|
|
||||||
|
|
||||||
|
def _pod_issue_summary(pod_issues: dict[str, Any], metrics: dict[str, Any]) -> dict[str, Any]:
    """Build a compact summary of pod issues.

    Args:
        pod_issues: Expected to hold ``"waiting_reasons"`` and
            ``"phase_reasons"`` mappings; a non-dict value is treated as
            having no reasons.
        metrics: Source of the ``"namespace_issue_top"`` value.

    Returns:
        A dict with ``"waiting_reasons_top"`` and ``"phase_reasons_top"``
        (each produced by ``_reason_top``) plus ``"namespace_issue_top"``
        taken from ``metrics``, falling back to an empty dict when that value
        is missing or falsy.
    """
    if isinstance(pod_issues, dict):
        waiting_counts = pod_issues.get("waiting_reasons")
        phase_counts = pod_issues.get("phase_reasons")
    else:
        waiting_counts, phase_counts = {}, {}

    summary: dict[str, Any] = {}
    summary["waiting_reasons_top"] = _reason_top(waiting_counts)
    summary["phase_reasons_top"] = _reason_top(phase_counts)
    summary["namespace_issue_top"] = metrics.get("namespace_issue_top") or {}
    return summary
|
||||||
|
|
||||||
|
|
||||||
def _delta_hit(delta: Any) -> bool:
|
def _delta_hit(delta: Any) -> bool:
|
||||||
if not isinstance(delta, (int, float)):
|
if not isinstance(delta, (int, float)):
|
||||||
return False
|
return False
|
||||||
@ -3546,9 +3595,13 @@ def collect_cluster_state() -> tuple[dict[str, Any], ClusterStateSummary]:
|
|||||||
},
|
},
|
||||||
"pressure_summary": pressure_summary,
|
"pressure_summary": pressure_summary,
|
||||||
"trend_summary": metrics.get("trend_summary"),
|
"trend_summary": metrics.get("trend_summary"),
|
||||||
|
"trend_requests": metrics.get("namespace_request_trends"),
|
||||||
"time_series": metrics.get("cluster_trends"),
|
"time_series": metrics.get("cluster_trends"),
|
||||||
"node_condition_trends": metrics.get("node_condition_trends"),
|
"node_condition_trends": metrics.get("node_condition_trends"),
|
||||||
|
"pod_waiting_trends": metrics.get("pod_waiting_trends"),
|
||||||
|
"pod_terminated_trends": metrics.get("pod_terminated_trends"),
|
||||||
"pod_reason_totals": metrics.get("pod_reason_totals"),
|
"pod_reason_totals": metrics.get("pod_reason_totals"),
|
||||||
|
"pod_issue_summary": _pod_issue_summary(pod_issues, metrics),
|
||||||
"offenders": _build_offenders(metrics),
|
"offenders": _build_offenders(metrics),
|
||||||
"alerts": metrics.get("alerts", {}),
|
"alerts": metrics.get("alerts", {}),
|
||||||
"top": {
|
"top": {
|
||||||
@ -3573,6 +3626,19 @@ def collect_cluster_state() -> tuple[dict[str, Any], ClusterStateSummary]:
|
|||||||
"workloads": _workload_index(workloads),
|
"workloads": _workload_index(workloads),
|
||||||
"namespaces": _namespace_nodes_top(namespace_context, 12),
|
"namespaces": _namespace_nodes_top(namespace_context, 12),
|
||||||
},
|
},
|
||||||
|
"baseline_deltas": {
|
||||||
|
"nodes": {
|
||||||
|
"cpu": _delta_top(node_context, "cpu"),
|
||||||
|
"ram": _delta_top(node_context, "ram"),
|
||||||
|
"net": _delta_top(node_context, "net"),
|
||||||
|
"io": _delta_top(node_context, "io"),
|
||||||
|
"disk": _delta_top(node_context, "disk"),
|
||||||
|
},
|
||||||
|
"namespaces": {
|
||||||
|
"cpu": _delta_top(namespace_context, "cpu"),
|
||||||
|
"mem": _delta_top(namespace_context, "mem"),
|
||||||
|
},
|
||||||
|
},
|
||||||
"attention_ranked": _build_attention_ranked(metrics, node_context, pod_issues, workload_health),
|
"attention_ranked": _build_attention_ranked(metrics, node_context, pod_issues, workload_health),
|
||||||
"signals": signals,
|
"signals": signals,
|
||||||
"profiles": profiles,
|
"profiles": profiles,
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user