snapshot: add node load and namespace capacity

Brad Stein 2026-01-29 13:05:04 -03:00
parent 29575baeb0
commit fc2a482df1


@@ -487,11 +487,20 @@ def _summarize_events(payload: dict[str, Any]) -> dict[str, Any]:
        by_namespace[namespace] = by_namespace.get(namespace, 0) + count
    warnings.sort(key=lambda item: _event_sort_key(item.get("last_seen") or ""), reverse=True)
    top = warnings[:_EVENTS_MAX]
    top_reason = ""
    top_reason_count = 0
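    # The most frequent warning reason wins; ties fall back to alphabetical order
    # because the sort key is (-count, reason), e.g. {"BackOff": 3, "Failed": 3} -> "BackOff".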
    if by_reason:
        top_reason, top_reason_count = sorted(
            by_reason.items(), key=lambda item: (-item[1], item[0])
        )[0]
    latest_warning = top[0] if top else None
    return {
        "warnings_total": len(warnings),
        "warnings_by_reason": by_reason,
        "warnings_by_namespace": by_namespace,
        "warnings_recent": top,
        "warnings_top_reason": {"reason": top_reason, "count": top_reason_count},
        "warnings_latest": latest_warning,
    }
@@ -712,6 +721,22 @@ def _node_pod_finalize(nodes: dict[str, dict[str, Any]]) -> list[dict[str, Any]]
    return output


def _node_pods_top(node_pods: list[dict[str, Any]], limit: int = 5) -> list[dict[str, Any]]:
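    # Keep only the first `limit` per-node summaries and project the fields the
    # snapshot consumes; the input is assumed to arrive already ordered upstream
    # (e.g. by _node_pod_finalize), since no re-sorting happens here.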
    output: list[dict[str, Any]] = []
    for entry in node_pods[:limit]:
        if not isinstance(entry, dict):
            continue
        output.append(
            {
                "node": entry.get("node"),
                "pods_total": entry.get("pods_total"),
                "pods_running": entry.get("pods_running"),
                "namespaces_top": entry.get("namespaces_top") or [],
            }
        )
    return output


def _record_pending_pod(
    pending_oldest: list[dict[str, Any]],
    info: dict[str, Any],
@@ -1343,6 +1368,128 @@ def _usage_stats(series: list[dict[str, Any]]) -> dict[str, float]:
    }


def _vm_namespace_totals(expr: str) -> dict[str, float]:
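    # Collapse an instant-vector result into {namespace: value}. Each item is
    # expected to expose a `metric` label dict and a value coercible to float;
    # malformed entries are skipped rather than raised.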
    totals: dict[str, float] = {}
    for item in _vm_vector(expr):
        metric = item.get("metric") if isinstance(item.get("metric"), dict) else {}
        namespace = metric.get("namespace")
        if not isinstance(namespace, str) or not namespace:
            continue
        try:
            totals[namespace] = float(item.get("value"))
        except (TypeError, ValueError):
            continue
    return totals


def _build_namespace_capacity(
    cpu_usage: dict[str, float],
    cpu_requests: dict[str, float],
    mem_usage: dict[str, float],
    mem_requests: dict[str, float],
) -> list[dict[str, Any]]:
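    # Usage/requests ratios are only computed when both numbers exist and the
    # request is non-zero; a ratio above 1.0 means the namespace consumes more
    # than it requested (e.g. cpu_usage=2.4 cores vs cpu_requests=2.0 -> 1.2).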
    namespaces = sorted(set(cpu_usage) | set(cpu_requests) | set(mem_usage) | set(mem_requests))
    output: list[dict[str, Any]] = []
    for namespace in namespaces:
        cpu_used = cpu_usage.get(namespace)
        cpu_req = cpu_requests.get(namespace)
        mem_used = mem_usage.get(namespace)
        mem_req = mem_requests.get(namespace)
        cpu_ratio = None
        mem_ratio = None
        if isinstance(cpu_used, (int, float)) and isinstance(cpu_req, (int, float)) and cpu_req > 0:
            cpu_ratio = cpu_used / cpu_req
        if isinstance(mem_used, (int, float)) and isinstance(mem_req, (int, float)) and mem_req > 0:
            mem_ratio = mem_used / mem_req
        output.append(
            {
                "namespace": namespace,
                "cpu_usage": cpu_used,
                "cpu_requests": cpu_req,
                "cpu_usage_ratio": cpu_ratio,
                "mem_usage": mem_used,
                "mem_requests": mem_req,
                "mem_usage_ratio": mem_ratio,
            }
        )
    output.sort(
        key=lambda item: (
            -(item.get("cpu_requests") or 0),
            -(item.get("mem_requests") or 0),
            item.get("namespace") or ""
        )
    )
    return output


def _node_usage_profile(
    node_usage: dict[str, list[dict[str, Any]]],
    node_details: list[dict[str, Any]],
    node_pods: list[dict[str, Any]],
) -> list[dict[str, Any]]:
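    # Each usage dimension (cpu/ram/disk/net/io) is normalized against the
    # busiest node for that dimension, and load_index is the mean of whichever
    # normalized values exist. Illustrative: cpu_norm=1.0 and ram_norm=0.5 with
    # the other series missing gives load_index 0.75.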
    usage: dict[str, dict[str, Any]] = {}
    for key in ("cpu", "ram", "disk", "net", "io"):
        for item in node_usage.get(key, []) or []:
            node = item.get("node")
            value = item.get("value")
            if not isinstance(node, str) or not node:
                continue
            if not isinstance(value, (int, float)):
                continue
            usage.setdefault(node, {})[key] = float(value)
    max_values: dict[str, float] = {}
    for key in ("cpu", "ram", "disk", "net", "io"):
        values = [entry.get(key) for entry in usage.values() if isinstance(entry.get(key), (int, float))]
        max_values[key] = max(values) if values else 0.0
    detail_map: dict[str, dict[str, Any]] = {
        entry.get("name"): entry for entry in node_details if isinstance(entry, dict)
    }
    pod_map: dict[str, dict[str, Any]] = {
        entry.get("node"): entry for entry in node_pods if isinstance(entry, dict)
    }
    output: list[dict[str, Any]] = []
    for node, entry in usage.items():
        detail = detail_map.get(node, {})
        pressure = detail.get("pressure") if isinstance(detail.get("pressure"), dict) else {}
        pressure_count = sum(1 for value in pressure.values() if value)
        taints = detail.get("taints") if isinstance(detail.get("taints"), list) else []
        unschedulable = bool(detail.get("unschedulable"))
        pods_total = None
        pod_entry = pod_map.get(node)
        if isinstance(pod_entry, dict):
            pods_total = pod_entry.get("pods_total")
        normalized: dict[str, float] = {}
        for key in ("cpu", "ram", "disk", "net", "io"):
            raw = entry.get(key)
            max_val = max_values.get(key) or 0.0
            if isinstance(raw, (int, float)) and max_val > 0:
                normalized[f"{key}_norm"] = raw / max_val
        norm_values = [v for v in normalized.values() if isinstance(v, (int, float))]
        load_index = sum(norm_values) / len(norm_values) if norm_values else None
        output.append(
            {
                "node": node,
                "cpu": entry.get("cpu"),
                "ram": entry.get("ram"),
                "disk": entry.get("disk"),
                "net": entry.get("net"),
                "io": entry.get("io"),
                **normalized,
                "pressure_flags": pressure,
                "pressure_count": pressure_count,
                "taints": taints,
                "unschedulable": unschedulable,
                "pods_total": pods_total,
                "load_index": load_index,
            }
        )
    output.sort(key=lambda item: (-(item.get("load_index") or 0), item.get("node") or ""))
    return output


def _summarize_metrics(errors: list[str]) -> dict[str, Any]:
    metrics: dict[str, Any] = {}
    try:
@@ -1434,6 +1581,24 @@ def _summarize_metrics(errors: list[str]) -> dict[str, Any]:
                f"topk(5, sum by (namespace) (rate(container_fs_reads_bytes_total{{namespace!=\"\"}}[{_RATE_WINDOW}]) + rate(container_fs_writes_bytes_total{{namespace!=\"\"}}[{_RATE_WINDOW}])))"
            )
        )
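        # Namespace capacity pairs live usage (CPU rate over _RATE_WINDOW, memory
        # working set) with the kube_pod_container_resource_requests_* gauges,
        # which are assumed to be scraped from kube-state-metrics.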
        namespace_cpu_usage = _vm_namespace_totals(
            f'sum by (namespace) (rate(container_cpu_usage_seconds_total{{namespace!=""}}[{_RATE_WINDOW}]))'
        )
        namespace_cpu_requests = _vm_namespace_totals(
            "sum by (namespace) (kube_pod_container_resource_requests_cpu_cores)"
        )
        namespace_mem_usage = _vm_namespace_totals(
            'sum by (namespace) (container_memory_working_set_bytes{namespace!=""})'
        )
        namespace_mem_requests = _vm_namespace_totals(
            "sum by (namespace) (kube_pod_container_resource_requests_memory_bytes)"
        )
        metrics["namespace_capacity"] = _build_namespace_capacity(
            namespace_cpu_usage,
            namespace_cpu_requests,
            namespace_mem_usage,
            namespace_mem_requests,
        )
    except Exception as exc:
        errors.append(f"namespace_usage: {exc}")
    metrics["pvc_usage_top"] = _pvc_usage(errors)
@@ -1483,6 +1648,12 @@ def collect_cluster_state() -> tuple[dict[str, Any], ClusterStateSummary]:
    events = _fetch_events(errors)
    metrics = _summarize_metrics(errors)
    metrics["node_pods_top"] = _node_pods_top(node_pods)
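    # node_load combines the per-node usage series collected above with node
    # conditions and pod counts into a single profile ranked by load_index.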
    metrics["node_load"] = _node_usage_profile(
        metrics.get("node_usage", {}),
        node_details,
        node_pods,
    )
    snapshot = {
        "collected_at": collected_at.isoformat(),