# Helpers from ariadne/services/cluster_state.py that condense per-entity
# baseline deltas and pod failure reasons into short "top N" lists for the
# snapshot assembled by collect_cluster_state().

# Default number of entries returned by _delta_top().
_DELTA_TOP_LIMIT = 6
# Default number of entries returned by _reason_top().
_REASON_TOP_LIMIT = 5


def _delta_entry_label(entry: dict[str, Any]) -> tuple[str, str]:
    """Return the ``(label_key, label_value)`` pair that identifies *entry*.

    An entry carrying a ``"node"`` key is labelled as a node; anything else
    is labelled as a namespace. A missing or falsy label value becomes ``""``.
    """
    if "node" in entry:
        return ("node", str(entry.get("node") or ""))
    return ("namespace", str(entry.get("namespace") or ""))


def _delta_top(entries: list[dict[str, Any]], key: str, limit: int = _DELTA_TOP_LIMIT) -> list[dict[str, Any]]:
    """Rank *entries* by the magnitude of their baseline delta for *key*.

    Each entry is expected to hold a ``"baseline_delta"`` dict mapping metric
    names to numbers. collect_cluster_state() calls this with the node context
    (keys ``cpu``, ``ram``, ``net``, ``io``, ``disk``) and the namespace
    context (keys ``cpu``, ``mem``).

    Args:
        entries: Context dicts to inspect. ``None``, non-dict items, and
            entries without a usable numeric delta for *key* are skipped.
        key: Metric name to look up inside ``"baseline_delta"``.
        limit: Maximum number of results to return.

    Returns:
        Dicts of the form ``{<"node"|"namespace">: label, "metric": key,
        "delta": delta, "severity": _delta_severity(float(delta))}``, ordered
        by descending ``abs(delta)``. Ties are broken by label, then by input
        order.
    """
    # Function-scope import keeps this block self-contained regardless of the
    # module's top-level import list.
    import math

    ranked: list[tuple[float, str, dict[str, Any]]] = []
    # ``or ()`` tolerates a missing (None) context instead of raising TypeError.
    for entry in entries or ():
        if not isinstance(entry, dict):
            continue
        deltas = entry.get("baseline_delta")
        if not isinstance(deltas, dict):
            continue
        delta = deltas.get(key)
        if not isinstance(delta, (int, float)):
            continue
        # NaN compares false against everything, which would make the sort
        # order arbitrary, and it is not representable in strict JSON.
        if isinstance(delta, float) and math.isnan(delta):
            continue
        label_key, label_value = _delta_entry_label(entry)
        ranked.append(
            (
                -abs(delta),
                label_value,
                {
                    label_key: label_value,
                    "metric": key,
                    "delta": delta,
                    "severity": _delta_severity(float(delta)),
                },
            )
        )
    # "metric" is identical for every row of one call, so it cannot break
    # ties; the entry label gives a deterministic secondary order instead.
    ranked.sort(key=lambda row: (row[0], row[1]))
    return [row[2] for row in ranked[:limit]]


def _reason_top(counts: dict[str, Any], limit: int = _REASON_TOP_LIMIT) -> list[dict[str, Any]]:
    """Return the most frequent reasons from a ``reason -> count`` mapping.

    Args:
        counts: Mapping of reason name to a numeric count. Anything that is
            not a dict yields an empty list.
        limit: Maximum number of results to return.

    Returns:
        ``{"reason": name, "count": int(count)}`` dicts ordered by descending
        count, then by reason name. Entries with an empty or non-string
        reason, a non-numeric count, or a NaN/infinite count are dropped.
    """
    if not isinstance(counts, dict):
        return []
    output: list[dict[str, Any]] = []
    for reason, value in counts.items():
        if not (isinstance(reason, str) and reason):
            continue
        if not isinstance(value, (int, float)):
            continue
        try:
            count = int(value)
        except (ValueError, OverflowError):
            # int() rejects NaN (ValueError) and +/-inf (OverflowError); one
            # bad sample must not abort the whole summary.
            continue
        output.append({"reason": reason, "count": count})
    output.sort(key=lambda item: (-item["count"], item["reason"]))
    return output[:limit]


def _pod_issue_summary(pod_issues: dict[str, Any], metrics: dict[str, Any]) -> dict[str, Any]:
    """Build the ``"pod_issue_summary"`` section of the cluster snapshot.

    Args:
        pod_issues: Dict that may contain ``"waiting_reasons"`` and
            ``"phase_reasons"`` count mappings. A non-dict is treated as empty.
        metrics: Dict that may contain ``"namespace_issue_top"``. A non-dict
            is treated as empty, mirroring the handling of *pod_issues*.

    Returns:
        A dict with ``"waiting_reasons_top"`` and ``"phase_reasons_top"``
        (each produced by _reason_top with its default limit) and
        ``"namespace_issue_top"`` (passed through, or ``{}`` when missing or
        falsy).
    """
    issues = pod_issues if isinstance(pod_issues, dict) else {}
    metric_map = metrics if isinstance(metrics, dict) else {}
    return {
        "waiting_reasons_top": _reason_top(issues.get("waiting_reasons")),
        "phase_reasons_top": _reason_top(issues.get("phase_reasons")),
        "namespace_issue_top": metric_map.get("namespace_issue_top") or {},
    }