diff --git a/ariadne/services/cluster_state.py b/ariadne/services/cluster_state.py index cd18306..7f9b05f 100644 --- a/ariadne/services/cluster_state.py +++ b/ariadne/services/cluster_state.py @@ -62,6 +62,8 @@ _PHASE_SEVERITY = { "Unknown": 1, } _PENDING_15M_HOURS = 0.25 +_LOAD_TOP_COUNT = 5 +_NAMESPACE_TOP_COUNT = 5 @dataclass(frozen=True) @@ -1490,6 +1492,105 @@ def _node_usage_profile( return output +def _percentile(values: list[float], percentile: float) -> float | None: + if not values: + return None + ordered = sorted(values) + idx = int(round((len(ordered) - 1) * percentile)) + idx = min(max(idx, 0), len(ordered) - 1) + return ordered[idx] + + +def _node_load_summary(node_load: list[dict[str, Any]]) -> dict[str, Any]: + items = [ + entry + for entry in node_load + if isinstance(entry, dict) and isinstance(entry.get("load_index"), (int, float)) + ] + if not items: + return {} + values = [float(entry.get("load_index") or 0) for entry in items] + avg = sum(values) / len(values) + variance = sum((value - avg) ** 2 for value in values) / len(values) + stddev = variance**0.5 + top = sorted(items, key=lambda item: -(item.get("load_index") or 0))[:_LOAD_TOP_COUNT] + bottom = sorted(items, key=lambda item: (item.get("load_index") or 0))[:_LOAD_TOP_COUNT] + outliers = [ + item + for item in items + if isinstance(item.get("load_index"), (int, float)) + and item.get("load_index") >= avg + stddev + ] + outliers.sort(key=lambda item: -(item.get("load_index") or 0)) + return { + "avg": round(avg, 3), + "p90": round(_percentile(values, 0.9) or 0.0, 3), + "min": round(min(values), 3), + "max": round(max(values), 3), + "top": top, + "bottom": bottom, + "outliers": outliers[:_LOAD_TOP_COUNT], + } + + +def _namespace_capacity_summary(capacity: list[dict[str, Any]]) -> dict[str, Any]: + if not capacity: + return {} + cpu_ratio = [ + entry + for entry in capacity + if isinstance(entry, dict) and isinstance(entry.get("cpu_usage_ratio"), (int, float)) + ] + mem_ratio = [ + entry + for entry in capacity + if isinstance(entry, dict) and isinstance(entry.get("mem_usage_ratio"), (int, float)) + ] + cpu_ratio.sort(key=lambda item: -(item.get("cpu_usage_ratio") or 0)) + mem_ratio.sort(key=lambda item: -(item.get("mem_usage_ratio") or 0)) + cpu_headroom: list[dict[str, Any]] = [] + mem_headroom: list[dict[str, Any]] = [] + for entry in capacity: + if not isinstance(entry, dict): + continue + cpu_used = entry.get("cpu_usage") + cpu_req = entry.get("cpu_requests") + mem_used = entry.get("mem_usage") + mem_req = entry.get("mem_requests") + if isinstance(cpu_used, (int, float)) and isinstance(cpu_req, (int, float)): + cpu_headroom.append( + { + "namespace": entry.get("namespace"), + "headroom": cpu_req - cpu_used, + "usage": cpu_used, + "requests": cpu_req, + "ratio": entry.get("cpu_usage_ratio"), + } + ) + if isinstance(mem_used, (int, float)) and isinstance(mem_req, (int, float)): + mem_headroom.append( + { + "namespace": entry.get("namespace"), + "headroom": mem_req - mem_used, + "usage": mem_used, + "requests": mem_req, + "ratio": entry.get("mem_usage_ratio"), + } + ) + cpu_headroom.sort(key=lambda item: (item.get("headroom") or 0)) + mem_headroom.sort(key=lambda item: (item.get("headroom") or 0)) + over_cpu = sum(1 for entry in cpu_ratio if (entry.get("cpu_usage_ratio") or 0) > 1) + over_mem = sum(1 for entry in mem_ratio if (entry.get("mem_usage_ratio") or 0) > 1) + return { + "cpu_ratio_top": cpu_ratio[:_NAMESPACE_TOP_COUNT], + "mem_ratio_top": mem_ratio[:_NAMESPACE_TOP_COUNT], + "cpu_headroom_low": cpu_headroom[:_NAMESPACE_TOP_COUNT], + "mem_headroom_low": mem_headroom[:_NAMESPACE_TOP_COUNT], + "cpu_overcommitted": over_cpu, + "mem_overcommitted": over_mem, + } + + def _summarize_metrics(errors: list[str]) -> dict[str, Any]: metrics: dict[str, Any] = {} try: @@ -1601,6 +1702,9 @@ def _summarize_metrics(errors: list[str]) -> dict[str, Any]: ) except Exception as exc: errors.append(f"namespace_usage: {exc}") + metrics["namespace_capacity_summary"] = _namespace_capacity_summary( + metrics.get("namespace_capacity", []), + ) metrics["pvc_usage_top"] = _pvc_usage(errors) metrics["units"] = { "cpu": "percent", @@ -1654,6 +1758,7 @@ def collect_cluster_state() -> tuple[dict[str, Any], ClusterStateSummary]: node_details, node_pods, ) + metrics["node_load_summary"] = _node_load_summary(metrics.get("node_load", [])) snapshot = { "collected_at": collected_at.isoformat(),