cluster-state: add baseline deltas and relationships

This commit is contained in:
Brad Stein 2026-01-30 10:20:35 -03:00
parent 9276d2538a
commit 8446c1f032

View File

@ -1948,6 +1948,7 @@ def _namespace_context(
namespace_pods: list[dict[str, Any]], namespace_pods: list[dict[str, Any]],
namespace_nodes: list[dict[str, Any]], namespace_nodes: list[dict[str, Any]],
namespace_capacity: list[dict[str, Any]], namespace_capacity: list[dict[str, Any]],
namespace_baseline: dict[str, dict[str, dict[str, float]]],
) -> list[dict[str, Any]]: ) -> list[dict[str, Any]]:
node_map = {entry.get("namespace"): entry for entry in namespace_nodes if isinstance(entry, dict)} node_map = {entry.get("namespace"): entry for entry in namespace_nodes if isinstance(entry, dict)}
cap_map = {entry.get("namespace"): entry for entry in namespace_capacity if isinstance(entry, dict)} cap_map = {entry.get("namespace"): entry for entry in namespace_capacity if isinstance(entry, dict)}
@ -1967,6 +1968,10 @@ def _namespace_context(
{"node": name, "pods": count} {"node": name, "pods": count}
for name, count in sorted(nodes.items(), key=lambda item: (-item[1], item[0]))[:3] for name, count in sorted(nodes.items(), key=lambda item: (-item[1], item[0]))[:3]
] ]
baseline = namespace_baseline.get(namespace, {}) if isinstance(namespace_baseline, dict) else {}
delta_cpu = _baseline_delta(cap_entry.get("cpu_usage"), baseline.get("cpu", {}))
delta_mem = _baseline_delta(cap_entry.get("mem_usage"), baseline.get("mem", {}))
baseline_delta = {k: v for k, v in (("cpu", delta_cpu), ("mem", delta_mem)) if v is not None}
output.append( output.append(
{ {
"namespace": namespace, "namespace": namespace,
@ -1983,12 +1988,49 @@ def _namespace_context(
"mem_usage": cap_entry.get("mem_usage"), "mem_usage": cap_entry.get("mem_usage"),
"mem_requests": cap_entry.get("mem_requests"), "mem_requests": cap_entry.get("mem_requests"),
"mem_ratio": cap_entry.get("mem_usage_ratio"), "mem_ratio": cap_entry.get("mem_usage_ratio"),
"baseline_delta": baseline_delta,
} }
) )
output.sort(key=lambda item: (-(item.get("pods_total") or 0), item.get("namespace") or "")) output.sort(key=lambda item: (-(item.get("pods_total") or 0), item.get("namespace") or ""))
return output return output
def _namespace_nodes_top(namespace_context: list[dict[str, Any]], limit: int = 5) -> list[dict[str, Any]]:
output: list[dict[str, Any]] = []
for entry in namespace_context[:limit]:
if not isinstance(entry, dict):
continue
output.append(
{
"namespace": entry.get("namespace"),
"pods_total": entry.get("pods_total"),
"primary_node": entry.get("primary_node"),
"nodes_top": entry.get("nodes_top") or [],
}
)
return output
def _workload_nodes_top(workloads: list[dict[str, Any]], limit: int = 5) -> list[dict[str, Any]]:
output: list[dict[str, Any]] = []
entries = [w for w in workloads if isinstance(w, dict)]
entries.sort(
key=lambda item: (-(item.get("pods_total") or 0), item.get("namespace") or "", item.get("workload") or ""),
)
for entry in entries[:limit]:
output.append(
{
"namespace": entry.get("namespace"),
"workload": entry.get("workload"),
"source": entry.get("source"),
"pods_total": entry.get("pods_total"),
"pods_running": entry.get("pods_running"),
"primary_node": entry.get("primary_node"),
}
)
return output
def _node_context( def _node_context(
node_details: list[dict[str, Any]], node_details: list[dict[str, Any]],
node_load: list[dict[str, Any]], node_load: list[dict[str, Any]],
@ -2004,6 +2046,13 @@ def _node_context(
continue continue
load_entry = load_map.get(name, {}) load_entry = load_map.get(name, {})
baseline = node_baseline.get(name, {}) if isinstance(node_baseline, dict) else {} baseline = node_baseline.get(name, {}) if isinstance(node_baseline, dict) else {}
deltas: dict[str, float] = {}
for key in ("cpu", "ram", "net", "io", "disk"):
current = load_entry.get(key)
stats = baseline.get(key, {}) if isinstance(baseline, dict) else {}
delta = _baseline_delta(current, stats)
if delta is not None:
deltas[key] = delta
output.append( output.append(
{ {
"node": name, "node": name,
@ -2024,12 +2073,22 @@ def _node_context(
"io": load_entry.get("io"), "io": load_entry.get("io"),
"load_index": load_entry.get("load_index"), "load_index": load_entry.get("load_index"),
"baseline": baseline, "baseline": baseline,
"baseline_delta": deltas,
} }
) )
output.sort(key=lambda item: (-(item.get("load_index") or 0), item.get("node") or "")) output.sort(key=lambda item: (-(item.get("load_index") or 0), item.get("node") or ""))
return output return output
def _baseline_delta(current: Any, stats: dict[str, Any]) -> float | None:
if not isinstance(current, (int, float)):
return None
avg = stats.get("avg")
if not isinstance(avg, (int, float)) or avg == 0:
return None
return round(((float(current) - float(avg)) / float(avg)) * 100, 2)
def _build_anomalies( def _build_anomalies(
metrics: dict[str, Any], metrics: dict[str, Any],
nodes_summary: dict[str, Any], nodes_summary: dict[str, Any],
@ -2401,6 +2460,7 @@ def collect_cluster_state() -> tuple[dict[str, Any], ClusterStateSummary]:
namespace_pods, namespace_pods,
namespace_nodes, namespace_nodes,
metrics.get("namespace_capacity", []), metrics.get("namespace_capacity", []),
metrics.get("namespace_baseline_map", {}),
) )
node_context = _node_context( node_context = _node_context(
node_details, node_details,
@ -2431,6 +2491,11 @@ def collect_cluster_state() -> tuple[dict[str, Any], ClusterStateSummary]:
"workload_not_ready": _workload_not_ready_items(workload_health)[:5], "workload_not_ready": _workload_not_ready_items(workload_health)[:5],
"pod_restarts": _pod_restarts_top(metrics), "pod_restarts": _pod_restarts_top(metrics),
}, },
"relationships": {
"namespace_nodes": _namespace_nodes_top(namespace_context, 5),
"node_namespaces": metrics.get("node_pods_top", []),
"workload_nodes": _workload_nodes_top(workloads, 5),
},
"attention_ranked": _build_attention_ranked(metrics, node_context, pod_issues, workload_health), "attention_ranked": _build_attention_ranked(metrics, node_context, pod_issues, workload_health),
"anomalies": anomalies, "anomalies": anomalies,
"health_bullets": _health_bullets(metrics, node_summary, workload_health, anomalies), "health_bullets": _health_bullets(metrics, node_summary, workload_health, anomalies),