snapshot: add load and namespace summaries
commit ccf89bc2e7
parent fc2a482df1
@@ -62,6 +62,8 @@ _PHASE_SEVERITY = {
     "Unknown": 1,
 }
 _PENDING_15M_HOURS = 0.25
+_LOAD_TOP_COUNT = 5
+_NAMESPACE_TOP_COUNT = 5
 
 
 @dataclass(frozen=True)
@@ -1490,6 +1492,105 @@ def _node_usage_profile(
     return output
 
 
+def _percentile(values: list[float], percentile: float) -> float | None:
+    if not values:
+        return None
+    ordered = sorted(values)
+    idx = int(round((len(ordered) - 1) * percentile))
+    idx = min(max(idx, 0), len(ordered) - 1)
+    return ordered[idx]
+
+
+def _node_load_summary(node_load: list[dict[str, Any]]) -> dict[str, Any]:
+    items = [
+        entry
+        for entry in node_load
+        if isinstance(entry, dict) and isinstance(entry.get("load_index"), (int, float))
+    ]
+    if not items:
+        return {}
+    values = [float(entry.get("load_index") or 0) for entry in items]
+    avg = sum(values) / len(values)
+    variance = sum((value - avg) ** 2 for value in values) / len(values)
+    stddev = variance**0.5
+    top = sorted(items, key=lambda item: -(item.get("load_index") or 0))[:_LOAD_TOP_COUNT]
+    bottom = sorted(items, key=lambda item: (item.get("load_index") or 0))[:_LOAD_TOP_COUNT]
+    outliers = [
+        item
+        for item in items
+        if isinstance(item.get("load_index"), (int, float))
+        and item.get("load_index") >= avg + stddev
+    ]
+    outliers.sort(key=lambda item: -(item.get("load_index") or 0))
+    return {
+        "avg": round(avg, 3),
+        "p90": round(_percentile(values, 0.9) or 0.0, 3),
+        "min": round(min(values), 3),
+        "max": round(max(values), 3),
+        "top": top,
+        "bottom": bottom,
+        "outliers": outliers[:_LOAD_TOP_COUNT],
+    }
+
+
+def _namespace_capacity_summary(capacity: list[dict[str, Any]]) -> dict[str, Any]:
+    if not capacity:
+        return {}
+    cpu_ratio = [
+        entry
+        for entry in capacity
+        if isinstance(entry, dict) and isinstance(entry.get("cpu_usage_ratio"), (int, float))
+    ]
+    mem_ratio = [
+        entry
+        for entry in capacity
+        if isinstance(entry, dict) and isinstance(entry.get("mem_usage_ratio"), (int, float))
+    ]
+    cpu_ratio.sort(key=lambda item: -(item.get("cpu_usage_ratio") or 0))
+    mem_ratio.sort(key=lambda item: -(item.get("mem_usage_ratio") or 0))
+    cpu_headroom: list[dict[str, Any]] = []
+    mem_headroom: list[dict[str, Any]] = []
+    for entry in capacity:
+        if not isinstance(entry, dict):
+            continue
+        cpu_used = entry.get("cpu_usage")
+        cpu_req = entry.get("cpu_requests")
+        mem_used = entry.get("mem_usage")
+        mem_req = entry.get("mem_requests")
+        if isinstance(cpu_used, (int, float)) and isinstance(cpu_req, (int, float)):
+            cpu_headroom.append(
+                {
+                    "namespace": entry.get("namespace"),
+                    "headroom": cpu_req - cpu_used,
+                    "usage": cpu_used,
+                    "requests": cpu_req,
+                    "ratio": entry.get("cpu_usage_ratio"),
+                }
+            )
+        if isinstance(mem_used, (int, float)) and isinstance(mem_req, (int, float)):
+            mem_headroom.append(
+                {
+                    "namespace": entry.get("namespace"),
+                    "headroom": mem_req - mem_used,
+                    "usage": mem_used,
+                    "requests": mem_req,
+                    "ratio": entry.get("mem_usage_ratio"),
+                }
+            )
+    cpu_headroom.sort(key=lambda item: (item.get("headroom") or 0))
+    mem_headroom.sort(key=lambda item: (item.get("headroom") or 0))
+    over_cpu = sum(1 for entry in cpu_ratio if (entry.get("cpu_usage_ratio") or 0) > 1)
+    over_mem = sum(1 for entry in mem_ratio if (entry.get("mem_usage_ratio") or 0) > 1)
+    return {
+        "cpu_ratio_top": cpu_ratio[:_NAMESPACE_TOP_COUNT],
+        "mem_ratio_top": mem_ratio[:_NAMESPACE_TOP_COUNT],
+        "cpu_headroom_low": cpu_headroom[:_NAMESPACE_TOP_COUNT],
+        "mem_headroom_low": mem_headroom[:_NAMESPACE_TOP_COUNT],
+        "cpu_overcommitted": over_cpu,
+        "mem_overcommitted": over_mem,
+    }
+
+
 def _summarize_metrics(errors: list[str]) -> dict[str, Any]:
     metrics: dict[str, Any] = {}
     try:
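For reference, a minimal sketch of the arithmetic behind _percentile and _node_load_summary, run on hypothetical node_load rows (the node names and load_index values below are invented, not taken from this repository):

    # Illustrative only; mirrors the nearest-rank p90 and mean/stddev logic added above.
    node_load = [
        {"node": "worker-1", "load_index": 0.42},
        {"node": "worker-2", "load_index": 0.95},
        {"node": "worker-3", "load_index": 0.18},
    ]

    values = sorted(entry["load_index"] for entry in node_load)
    avg = sum(values) / len(values)
    stddev = (sum((v - avg) ** 2 for v in values) / len(values)) ** 0.5

    # Nearest-rank p90, the same formula _percentile(values, 0.9) applies.
    idx = min(max(int(round((len(values) - 1) * 0.9)), 0), len(values) - 1)
    p90 = values[idx]

    print(round(avg, 3), round(stddev, 3), p90)  # 0.517 0.322 0.95
    # Only worker-2 exceeds avg + stddev (about 0.84), so it would be the lone
    # entry in _node_load_summary's "outliers" list for this sample.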
@@ -1601,6 +1702,9 @@ def _summarize_metrics(errors: list[str]) -> dict[str, Any]:
         )
     except Exception as exc:
         errors.append(f"namespace_usage: {exc}")
+    metrics["namespace_capacity_summary"] = _namespace_capacity_summary(
+        metrics.get("namespace_capacity", []),
+    )
     metrics["pvc_usage_top"] = _pvc_usage(errors)
     metrics["units"] = {
         "cpu": "percent",
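A similarly hypothetical sketch of how the headroom and overcommit figures feeding metrics["namespace_capacity_summary"] are derived (namespace names and numbers are made up for illustration):

    # Illustrative only; same headroom/overcommit arithmetic as the helper above.
    capacity = [
        {"namespace": "ingress", "cpu_usage": 1.5, "cpu_requests": 1.0, "cpu_usage_ratio": 1.5},
        {"namespace": "batch", "cpu_usage": 0.5, "cpu_requests": 2.0, "cpu_usage_ratio": 0.25},
    ]

    # headroom = requests - usage; negative means the namespace uses more CPU
    # than it requested.
    headroom = [
        {"namespace": row["namespace"], "headroom": row["cpu_requests"] - row["cpu_usage"]}
        for row in capacity
    ]
    headroom.sort(key=lambda row: row["headroom"])  # tightest namespaces first

    over_cpu = sum(1 for row in capacity if row["cpu_usage_ratio"] > 1)
    print(headroom[0], over_cpu)  # {'namespace': 'ingress', 'headroom': -0.5} 1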
@@ -1654,6 +1758,7 @@ def collect_cluster_state() -> tuple[dict[str, Any], ClusterStateSummary]:
         node_details,
         node_pods,
     )
+    metrics["node_load_summary"] = _node_load_summary(metrics.get("node_load", []))
 
     snapshot = {
         "collected_at": collected_at.isoformat(),
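Taken together, the metrics block carried into the snapshot now gains two extra keys. A rough sketch of their shape, with placeholder values and a hypothetical variable name (metrics_extra) rather than actual cluster data:

    # Shape only; list contents and numbers are placeholders.
    metrics_extra = {
        "node_load_summary": {
            "avg": 0.517, "p90": 0.95, "min": 0.18, "max": 0.95,
            "top": [...], "bottom": [...], "outliers": [...],
        },
        "namespace_capacity_summary": {
            "cpu_ratio_top": [...], "mem_ratio_top": [...],
            "cpu_headroom_low": [...], "mem_headroom_low": [...],
            "cpu_overcommitted": 1, "mem_overcommitted": 0,
        },
    }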