cluster-state: enrich snapshot substrate

This commit is contained in:
Brad Stein 2026-01-30 00:09:53 -03:00
parent 3d21506ff0
commit 56ea582c97
2 changed files with 327 additions and 0 deletions

View File

@ -1764,6 +1764,12 @@ def _summarize_metrics(errors: list[str]) -> dict[str, Any]:
metrics["namespace_capacity_summary"] = _namespace_capacity_summary(
metrics.get("namespace_capacity", []),
)
metrics["namespace_totals"] = {
"cpu": _namespace_totals_list(namespace_cpu_usage),
"mem": _namespace_totals_list(namespace_mem_usage),
"cpu_requests": _namespace_totals_list(namespace_cpu_requests),
"mem_requests": _namespace_totals_list(namespace_mem_requests),
}
metrics["pvc_usage_top"] = _pvc_usage(errors)
metrics["units"] = {
"cpu": "percent",
@ -1798,6 +1804,273 @@ def _summarize_metrics(errors: list[str]) -> dict[str, Any]:
return metrics
def _namespace_totals_list(totals: dict[str, float]) -> list[dict[str, Any]]:
entries = [
{"namespace": name, "value": value}
for name, value in totals.items()
if isinstance(name, str) and name
]
entries.sort(key=lambda item: (-(item.get("value") or 0), item.get("namespace") or ""))
return entries
def _vector_to_named(entries: list[dict[str, Any]], label_key: str, name_key: str) -> list[dict[str, Any]]:
output: list[dict[str, Any]] = []
for item in entries:
if not isinstance(item, dict):
continue
metric = item.get("metric") if isinstance(item.get("metric"), dict) else {}
value = item.get("value")
label = metric.get(label_key) if isinstance(metric, dict) else None
if not isinstance(label, str) or not label:
continue
output.append({name_key: label, "value": value, "metric": metric})
output.sort(key=lambda item: (-(item.get("value") or 0), item.get(name_key) or ""))
return output
def _pvc_top(entries: list[dict[str, Any]]) -> list[dict[str, Any]]:
output: list[dict[str, Any]] = []
for item in entries:
metric = item.get("metric") if isinstance(item.get("metric"), dict) else {}
namespace = metric.get("namespace")
pvc = metric.get("persistentvolumeclaim")
if not isinstance(namespace, str) or not isinstance(pvc, str):
continue
output.append(
{
"namespace": namespace,
"pvc": pvc,
"used_percent": item.get("value"),
}
)
output.sort(key=lambda item: (-(item.get("used_percent") or 0), item.get("namespace") or ""))
return output
def _namespace_context(
namespace_pods: list[dict[str, Any]],
namespace_nodes: list[dict[str, Any]],
namespace_capacity: list[dict[str, Any]],
) -> list[dict[str, Any]]:
node_map = {entry.get("namespace"): entry for entry in namespace_nodes if isinstance(entry, dict)}
cap_map = {entry.get("namespace"): entry for entry in namespace_capacity if isinstance(entry, dict)}
output: list[dict[str, Any]] = []
for entry in namespace_pods:
if not isinstance(entry, dict):
continue
namespace = entry.get("namespace")
if not isinstance(namespace, str) or not namespace:
continue
nodes_entry = node_map.get(namespace, {})
cap_entry = cap_map.get(namespace, {})
nodes = nodes_entry.get("nodes") if isinstance(nodes_entry.get("nodes"), dict) else {}
top_nodes: list[dict[str, Any]] = []
if isinstance(nodes, dict):
top_nodes = [
{"node": name, "pods": count}
for name, count in sorted(nodes.items(), key=lambda item: (-item[1], item[0]))[:3]
]
output.append(
{
"namespace": namespace,
"pods_total": entry.get("pods_total"),
"pods_running": entry.get("pods_running"),
"pods_pending": entry.get("pods_pending"),
"pods_failed": entry.get("pods_failed"),
"pods_succeeded": entry.get("pods_succeeded"),
"primary_node": nodes_entry.get("primary_node"),
"nodes_top": top_nodes,
"cpu_usage": cap_entry.get("cpu_usage"),
"cpu_requests": cap_entry.get("cpu_requests"),
"cpu_ratio": cap_entry.get("cpu_usage_ratio"),
"mem_usage": cap_entry.get("mem_usage"),
"mem_requests": cap_entry.get("mem_requests"),
"mem_ratio": cap_entry.get("mem_usage_ratio"),
}
)
output.sort(key=lambda item: (-(item.get("pods_total") or 0), item.get("namespace") or ""))
return output
def _node_context(
node_details: list[dict[str, Any]],
node_load: list[dict[str, Any]],
) -> list[dict[str, Any]]:
load_map = {entry.get("node"): entry for entry in node_load if isinstance(entry, dict)}
output: list[dict[str, Any]] = []
for entry in node_details:
if not isinstance(entry, dict):
continue
name = entry.get("name")
if not isinstance(name, str) or not name:
continue
load_entry = load_map.get(name, {})
output.append(
{
"node": name,
"ready": entry.get("ready"),
"roles": entry.get("roles"),
"is_worker": entry.get("is_worker"),
"hardware": entry.get("hardware"),
"arch": entry.get("arch"),
"os": entry.get("os"),
"taints": entry.get("taints"),
"unschedulable": entry.get("unschedulable"),
"pressure_flags": entry.get("pressure"),
"pods_total": load_entry.get("pods_total"),
"cpu": load_entry.get("cpu"),
"ram": load_entry.get("ram"),
"disk": load_entry.get("disk"),
"net": load_entry.get("net"),
"io": load_entry.get("io"),
"load_index": load_entry.get("load_index"),
}
)
output.sort(key=lambda item: (-(item.get("load_index") or 0), item.get("node") or ""))
return output
def _build_anomalies(
    metrics: dict[str, Any],
    nodes_summary: dict[str, Any],
    workloads_health: dict[str, Any],
    kustomizations: dict[str, Any],
    events: dict[str, Any],
) -> list[dict[str, Any]]:
    """Derive a flat list of anomaly records from collected cluster data.

    Each anomaly carries a ``kind``, ``severity`` and human-readable
    ``summary``; most also include an ``items`` payload with supporting
    detail. The list is built in a fixed order (pods, workloads, Flux,
    jobs, PVCs, nodes, events).
    """
    findings: list[dict[str, Any]] = []

    def note(kind: str, severity: str, summary: str, **extra: Any) -> None:
        # Small builder so every record has the same base shape.
        findings.append({"kind": kind, "severity": severity, "summary": summary, **extra})

    pending = metrics.get("pods_pending") or 0
    failed = metrics.get("pods_failed") or 0
    if pending:
        note("pods_pending", "warning", f"{int(pending)} pods pending")
    if failed:
        note("pods_failed", "critical", f"{int(failed)} pods failed")

    for kind in ("deployments", "statefulsets", "daemonsets"):
        raw = workloads_health.get(kind)
        health = raw if isinstance(raw, dict) else {}
        stuck = health.get("not_ready") or 0
        if stuck:
            note(
                f"{kind}_not_ready",
                "warning",
                f"{int(stuck)} {kind} not ready",
                items=health.get("items"),
            )

    flux = kustomizations or {}
    flux_stuck = flux.get("not_ready") or 0
    if flux_stuck:
        note(
            "flux_not_ready",
            "warning",
            f"{int(flux_stuck)} Flux kustomizations not ready",
            items=flux.get("items"),
        )

    failed_jobs = [
        entry
        for entry in (metrics.get("job_failures_24h") or [])
        if isinstance(entry, dict) and (entry.get("value") or 0) > 0
    ]
    if failed_jobs:
        note("job_failures_24h", "warning", "Job failures in last 24h", items=failed_jobs[:5])

    pressured_pvcs = [
        row
        for row in _pvc_top(metrics.get("pvc_usage_top") or [])
        if (row.get("used_percent") or 0) >= 80
    ]
    if pressured_pvcs:
        note("pvc_pressure", "warning", "PVCs above 80% usage", items=pressured_pvcs[:5])

    if nodes_summary:
        pressure = nodes_summary.get("pressure_nodes") or {}
        flagged = [
            node
            for group in pressure.values()
            if isinstance(group, list)
            for node in group
            if node
        ]
        if flagged:
            note(
                "node_pressure",
                "warning",
                f"{len(flagged)} nodes report pressure",
                items=sorted(set(flagged)),
            )
        cordoned = nodes_summary.get("unschedulable_nodes") or []
        if cordoned:
            note(
                "unschedulable_nodes",
                "info",
                f"{len(cordoned)} nodes unschedulable",
                items=cordoned,
            )

    if events:
        warning_count = events.get("warnings_total") or 0
        if warning_count:
            note(
                "event_warnings",
                "info",
                f"{int(warning_count)} warning events",
                items=events.get("warnings") or [],
            )
    return findings
def _health_bullets(
metrics: dict[str, Any],
nodes_summary: dict[str, Any],
workloads_health: dict[str, Any],
anomalies: list[dict[str, Any]],
) -> list[str]:
bullets: list[str] = []
nodes_total = metrics.get("nodes_total")
nodes_ready = metrics.get("nodes_ready")
if nodes_total is not None and nodes_ready is not None:
bullets.append(f"Nodes ready: {int(nodes_ready)}/{int(nodes_total)}")
pods_running = metrics.get("pods_running") or 0
pods_pending = metrics.get("pods_pending") or 0
pods_failed = metrics.get("pods_failed") or 0
bullets.append(f"Pods: {int(pods_running)} running, {int(pods_pending)} pending, {int(pods_failed)} failed")
not_ready = 0
for key in ("deployments", "statefulsets", "daemonsets"):
entry = workloads_health.get(key) if isinstance(workloads_health.get(key), dict) else {}
not_ready += int(entry.get("not_ready") or 0)
if not_ready:
bullets.append(f"Workloads not ready: {not_ready}")
else:
bullets.append("Workloads: all ready")
if anomalies:
top = anomalies[0].get("summary") if isinstance(anomalies[0], dict) else None
if isinstance(top, str) and top:
bullets.append(f"Top concern: {top}")
return bullets[:4]
def collect_cluster_state() -> tuple[dict[str, Any], ClusterStateSummary]:
errors: list[str] = []
collected_at = datetime.now(timezone.utc)
@ -1819,9 +2092,55 @@ def collect_cluster_state() -> tuple[dict[str, Any], ClusterStateSummary]:
)
metrics["node_load_summary"] = _node_load_summary(metrics.get("node_load", []))
metrics["node_load_by_hardware"] = _node_usage_by_hardware(metrics.get("node_load", []), node_details)
metrics["namespace_top"] = {
"cpu": _vector_to_named(metrics.get("namespace_cpu_top", []), "namespace", "namespace"),
"mem": _vector_to_named(metrics.get("namespace_mem_top", []), "namespace", "namespace"),
"net": _vector_to_named(metrics.get("namespace_net_top", []), "namespace", "namespace"),
"io": _vector_to_named(metrics.get("namespace_io_top", []), "namespace", "namespace"),
"restarts": _vector_to_named(metrics.get("restart_namespace_top", []), "namespace", "namespace"),
}
anomalies = _build_anomalies(metrics, node_summary, workload_health, kustomizations, events)
namespace_context = _namespace_context(
namespace_pods,
namespace_nodes,
metrics.get("namespace_capacity", []),
)
node_context = _node_context(node_details, metrics.get("node_load", []))
summary = {
"generated_at": collected_at.isoformat(),
"windows": metrics.get("windows", {}),
"counts": {
"nodes_total": metrics.get("nodes_total"),
"nodes_ready": metrics.get("nodes_ready"),
"pods_running": metrics.get("pods_running"),
"pods_pending": metrics.get("pods_pending"),
"pods_failed": metrics.get("pods_failed"),
"pods_succeeded": metrics.get("pods_succeeded"),
},
"top": {
"namespace_cpu": (metrics.get("namespace_totals", {}) or {}).get("cpu", [])[:5],
"namespace_mem": (metrics.get("namespace_totals", {}) or {}).get("mem", [])[:5],
"namespace_pods": namespace_pods[:5],
"node_pods": metrics.get("node_pods_top", []),
"node_load": metrics.get("node_load_summary", {}).get("top", []),
"node_hottest": metrics.get("hottest_nodes", {}),
"postgres": metrics.get("postgres_connections", {}),
"pvc_usage": _pvc_top(metrics.get("pvc_usage_top", [])),
},
"anomalies": anomalies,
"health_bullets": _health_bullets(metrics, node_summary, workload_health, anomalies),
"unknowns": errors,
}
snapshot = {
"collected_at": collected_at.isoformat(),
"snapshot_version": "v2",
"summary": summary,
"context": {
"nodes": node_context,
"namespaces": namespace_context,
},
"nodes": nodes,
"nodes_summary": node_summary,
"nodes_detail": node_details,

View File

@ -143,6 +143,14 @@ def test_collect_cluster_state(monkeypatch) -> None:
assert snapshot["metrics"]["pod_mem_top"] == []
assert snapshot["metrics"]["job_failures_24h"] == []
assert snapshot["metrics"]["pvc_usage_top"] == []
assert snapshot["summary"]["counts"]["nodes_total"] == 5.0
assert snapshot["summary"]["counts"]["nodes_ready"] == 5.0
assert snapshot["summary"]["counts"]["pods_running"] == 5.0
assert snapshot["summary"]["top"]["namespace_pods"][0]["namespace"] == "media"
assert snapshot["summary"]["health_bullets"]
assert snapshot["summary"]["unknowns"] == []
assert snapshot["context"]["nodes"]
assert snapshot["context"]["namespaces"]
assert summary.nodes_total == 2
assert summary.nodes_ready == 1
assert summary.pods_running == 5.0