From a5b35848d0195a110a78e9e0cf3927a15a9887ff Mon Sep 17 00:00:00 2001
From: Brad Stein
Date: Sat, 31 Jan 2026 13:40:05 -0300
Subject: [PATCH] cluster state: add issue hot spots and trends

---
Reviewer notes (commentary only; not part of the commit message or the diff):

- Layout: line breaks and indentation below were restored from a
  whitespace-collapsed copy of this patch. Indentation assumes the usual
  4-space Python style; verify against the repository before applying.
- _namespace_reason_entries wraps the given selector in
  topk(limit, sum by (namespace) (...)) and only afterwards passes the result
  through _filter_namespace_vector. If that filter drops entries, fewer than
  `limit` namespaces are returned (presumed from the helper's name; confirm).
- _collect_issue_metrics issues six queries inside a single try block. An
  exception from any one of them leaves metrics["namespace_issue_top"] unset
  and records one "issues: ..." entry in `errors`.
- _build_offenders reads the key with `.get(...) or {}`, so a missing
  "namespace_issue_top" surfaces as an empty dict under "namespace_issues".

 ariadne/services/cluster_state.py | 43 +++++++++++++++++++++++++++++++
 1 file changed, 43 insertions(+)

diff --git a/ariadne/services/cluster_state.py b/ariadne/services/cluster_state.py
index 05967c0..a481a56 100644
--- a/ariadne/services/cluster_state.py
+++ b/ariadne/services/cluster_state.py
@@ -89,6 +89,7 @@ _PVC_PRESSURE_THRESHOLD = 80.0
 _ALERT_TOP_LIMIT = 10
 _POD_REASON_LIMIT = 10
 _POD_REASON_TREND_LIMIT = 10
+_NAMESPACE_ISSUE_LIMIT = 8
 _POD_TERMINATED_REASONS = {
     "oom_killed": "OOMKilled",
     "error": "Error",
@@ -1700,6 +1701,12 @@ def _pod_reason_entries(expr: str, limit: int) -> list[dict[str, Any]]:
     return output
 
 
+def _namespace_reason_entries(expr: str, limit: int) -> list[dict[str, Any]]:
+    entries = _vm_vector(f"topk({limit}, sum by (namespace) ({expr}))")
+    entries = _filter_namespace_vector(entries)
+    return _vector_to_named(entries, "namespace", "namespace")
+
+
 def _pod_waiting_now() -> dict[str, list[dict[str, Any]]]:
     output: dict[str, list[dict[str, Any]]] = {}
     for key, reason in _POD_WAITING_REASONS.items():
@@ -2215,6 +2222,40 @@ def _collect_trend_metrics(metrics: dict[str, Any], errors: list[str]) -> None:
         errors.append(f"trends: {exc}")
 
 
+def _collect_issue_metrics(metrics: dict[str, Any], errors: list[str]) -> None:
+    try:
+        waiting_series = "kube_pod_container_status_waiting_reason"
+        terminated_series = "kube_pod_container_status_terminated_reason"
+        metrics["namespace_issue_top"] = {
+            "crash_loop": _namespace_reason_entries(
+                f'{waiting_series}{{reason="CrashLoopBackOff"}}',
+                _NAMESPACE_ISSUE_LIMIT,
+            ),
+            "image_pull": _namespace_reason_entries(
+                f'{waiting_series}{{reason="ImagePullBackOff"}}',
+                _NAMESPACE_ISSUE_LIMIT,
+            ),
+            "err_image_pull": _namespace_reason_entries(
+                f'{waiting_series}{{reason="ErrImagePull"}}',
+                _NAMESPACE_ISSUE_LIMIT,
+            ),
+            "config_error": _namespace_reason_entries(
+                f'{waiting_series}{{reason="CreateContainerConfigError"}}',
+                _NAMESPACE_ISSUE_LIMIT,
+            ),
+            "oom_killed": _namespace_reason_entries(
+                f'{terminated_series}{{reason="OOMKilled"}}',
+                _NAMESPACE_ISSUE_LIMIT,
+            ),
+            "terminated_error": _namespace_reason_entries(
+                f'{terminated_series}{{reason="Error"}}',
+                _NAMESPACE_ISSUE_LIMIT,
+            ),
+        }
+    except Exception as exc:
+        errors.append(f"issues: {exc}")
+
+
 def _collect_alert_metrics(metrics: dict[str, Any], errors: list[str]) -> None:
     try:
         vm_now = _vm_alerts_now()
@@ -2347,6 +2388,7 @@ def _summarize_metrics(errors: list[str]) -> dict[str, Any]:
     _collect_trend_metrics(metrics, errors)
     _collect_alert_metrics(metrics, errors)
     _collect_namespace_metrics(metrics, errors)
+    _collect_issue_metrics(metrics, errors)
     metrics["pvc_usage_top"] = _pvc_usage(errors)
     metrics["trend_summary"] = _trend_summary(metrics)
     _finalize_metrics(metrics)
@@ -2389,6 +2431,7 @@ def _build_offenders(metrics: dict[str, Any]) -> dict[str, Any]:
     offenders["pod_terminated_now"] = metrics.get("pod_terminated_now") or {}
     offenders["job_failures_24h"] = metrics.get("job_failures_24h") or []
     offenders["pvc_pressure"] = _pvc_pressure_entries(metrics)
+    offenders["namespace_issues"] = metrics.get("namespace_issue_top") or {}
     return offenders
 
 