cluster state: add issue hot spots and trends

This commit is contained in:
Brad Stein 2026-01-31 13:40:05 -03:00
parent 8571ef6f4d
commit a5b35848d0

View File

@ -89,6 +89,7 @@ _PVC_PRESSURE_THRESHOLD = 80.0
_ALERT_TOP_LIMIT = 10
_POD_REASON_LIMIT = 10
_POD_REASON_TREND_LIMIT = 10
# topk() size handed to _namespace_reason_entries for every issue reason
# collected by _collect_issue_metrics.
_NAMESPACE_ISSUE_LIMIT = 8
_POD_TERMINATED_REASONS = {
"oom_killed": "OOMKilled",
"error": "Error",
@ -1700,6 +1701,12 @@ def _pod_reason_entries(expr: str, limit: int) -> list[dict[str, Any]]:
return output
def _namespace_reason_entries(expr: str, limit: int) -> list[dict[str, Any]]:
    """Rank namespaces by the summed value of ``expr``.

    Wraps ``expr`` in ``topk(limit, sum by (namespace) (...))``, runs the
    query through ``_vm_vector``, passes the samples through
    ``_filter_namespace_vector`` and converts what remains with
    ``_vector_to_named`` keyed on the ``namespace`` label.

    Args:
        expr: Query expression (selector or sub-expression) to aggregate.
        limit: ``k`` argument interpolated into ``topk``.

    Returns:
        Whatever ``_vector_to_named`` produces for the filtered samples.
    """
    # NOTE(review): filtering runs after topk, so if the filter drops any
    # sample the result presumably holds fewer than ``limit`` entries --
    # confirm that this is intended.
    query = f"topk({limit}, sum by (namespace) ({expr}))"
    ranked = _filter_namespace_vector(_vm_vector(query))
    return _vector_to_named(ranked, "namespace", "namespace")
def _pod_waiting_now() -> dict[str, list[dict[str, Any]]]:
output: dict[str, list[dict[str, Any]]] = {}
for key, reason in _POD_WAITING_REASONS.items():
@ -2215,6 +2222,40 @@ def _collect_trend_metrics(metrics: dict[str, Any], errors: list[str]) -> None:
errors.append(f"trends: {exc}")
def _collect_issue_metrics(metrics: dict[str, Any], errors: list[str]) -> None:
    """Store per-reason namespace rankings under ``metrics["namespace_issue_top"]``.

    For each container waiting/terminated reason listed below, calls
    ``_namespace_reason_entries`` with a ``{reason="..."}`` selector on the
    matching series and ``_NAMESPACE_ISSUE_LIMIT`` as the limit.

    Args:
        metrics: Mapping updated in place with the ``namespace_issue_top`` key.
        errors: List that receives an ``"issues: ..."`` message on failure.

    The mapping is assigned only after every query has succeeded; if any
    lookup raises, ``metrics`` is left without a new value for this key and
    the exception text is appended to ``errors`` instead of propagating.
    """
    waiting = "kube_pod_container_status_waiting_reason"
    terminated = "kube_pod_container_status_terminated_reason"
    # (output key, metric series, value of the ``reason`` label), listed in
    # the order the keys appear in the resulting mapping.
    reason_specs = (
        ("crash_loop", waiting, "CrashLoopBackOff"),
        ("image_pull", waiting, "ImagePullBackOff"),
        ("err_image_pull", waiting, "ErrImagePull"),
        ("config_error", waiting, "CreateContainerConfigError"),
        ("oom_killed", terminated, "OOMKilled"),
        ("terminated_error", terminated, "Error"),
    )
    try:
        metrics["namespace_issue_top"] = {
            key: _namespace_reason_entries(
                f'{series}{{reason="{reason}"}}',
                _NAMESPACE_ISSUE_LIMIT,
            )
            for key, series, reason in reason_specs
        }
    except Exception as exc:
        errors.append(f"issues: {exc}")
def _collect_alert_metrics(metrics: dict[str, Any], errors: list[str]) -> None:
try:
vm_now = _vm_alerts_now()
@ -2347,6 +2388,7 @@ def _summarize_metrics(errors: list[str]) -> dict[str, Any]:
_collect_trend_metrics(metrics, errors)
_collect_alert_metrics(metrics, errors)
_collect_namespace_metrics(metrics, errors)
_collect_issue_metrics(metrics, errors)
metrics["pvc_usage_top"] = _pvc_usage(errors)
metrics["trend_summary"] = _trend_summary(metrics)
_finalize_metrics(metrics)
@ -2389,6 +2431,7 @@ def _build_offenders(metrics: dict[str, Any]) -> dict[str, Any]:
offenders["pod_terminated_now"] = metrics.get("pod_terminated_now") or {}
offenders["job_failures_24h"] = metrics.get("job_failures_24h") or []
offenders["pvc_pressure"] = _pvc_pressure_entries(metrics)
offenders["namespace_issues"] = metrics.get("namespace_issue_top") or {}
return offenders