cluster state: add issue hot spots and trends
This commit is contained in:
parent
8571ef6f4d
commit
a5b35848d0
@ -89,6 +89,7 @@ _PVC_PRESSURE_THRESHOLD = 80.0
|
||||
_ALERT_TOP_LIMIT = 10
|
||||
_POD_REASON_LIMIT = 10
|
||||
_POD_REASON_TREND_LIMIT = 10
|
||||
# Limit handed to _namespace_reason_entries by _collect_issue_metrics; it ends
# up as the ``k`` of the ``topk(...)`` query built for each issue reason.
_NAMESPACE_ISSUE_LIMIT = 8
|
||||
_POD_TERMINATED_REASONS = {
|
||||
"oom_killed": "OOMKilled",
|
||||
"error": "Error",
|
||||
@ -1700,6 +1701,12 @@ def _pod_reason_entries(expr: str, limit: int) -> list[dict[str, Any]]:
|
||||
return output
|
||||
|
||||
|
||||
def _namespace_reason_entries(expr: str, limit: int) -> list[dict[str, Any]]:
    """Rank namespaces by the per-namespace sum of ``expr``.

    Args:
        expr: Metric selector/expression, embedded verbatim into the query.
        limit: The ``k`` passed to ``topk``.

    Returns:
        Whatever ``_vector_to_named`` produces for the filtered vector, using
        ``"namespace"`` for both of its name arguments.
    """
    query = f"topk({limit}, sum by (namespace) ({expr}))"
    # Namespace filtering is applied to the already-truncated topk result.
    filtered = _filter_namespace_vector(_vm_vector(query))
    return _vector_to_named(filtered, "namespace", "namespace")
|
||||
|
||||
|
||||
def _pod_waiting_now() -> dict[str, list[dict[str, Any]]]:
|
||||
output: dict[str, list[dict[str, Any]]] = {}
|
||||
for key, reason in _POD_WAITING_REASONS.items():
|
||||
@ -2215,6 +2222,40 @@ def _collect_trend_metrics(metrics: dict[str, Any], errors: list[str]) -> None:
|
||||
errors.append(f"trends: {exc}")
|
||||
|
||||
|
||||
def _collect_issue_metrics(metrics: dict[str, Any], errors: list[str]) -> None:
    """Fill ``metrics["namespace_issue_top"]`` with per-reason namespace rankings.

    One ``_namespace_reason_entries`` lookup is made per issue reason, each
    capped at ``_NAMESPACE_ISSUE_LIMIT``. The result is stored as a dict keyed
    by a short issue name.

    Args:
        metrics: Mutated in place; receives the ``"namespace_issue_top"`` key
            only if every lookup succeeds.
        errors: Mutated in place; receives ``"issues: <exc>"`` when any lookup
            raises, in which case ``metrics`` is left untouched.
    """
    waiting = "kube_pod_container_status_waiting_reason"
    terminated = "kube_pod_container_status_terminated_reason"
    # (output key, metric series, value of the series' ``reason`` label)
    selectors = (
        ("crash_loop", waiting, "CrashLoopBackOff"),
        ("image_pull", waiting, "ImagePullBackOff"),
        ("err_image_pull", waiting, "ErrImagePull"),
        ("config_error", waiting, "CreateContainerConfigError"),
        ("oom_killed", terminated, "OOMKilled"),
        ("terminated_error", terminated, "Error"),
    )
    try:
        # Built in full before assignment so a failure leaves no partial result.
        metrics["namespace_issue_top"] = {
            key: _namespace_reason_entries(
                f'{series}{{reason="{reason}"}}',
                _NAMESPACE_ISSUE_LIMIT,
            )
            for key, series, reason in selectors
        }
    except Exception as exc:
        errors.append(f"issues: {exc}")
|
||||
|
||||
|
||||
def _collect_alert_metrics(metrics: dict[str, Any], errors: list[str]) -> None:
|
||||
try:
|
||||
vm_now = _vm_alerts_now()
|
||||
@ -2347,6 +2388,7 @@ def _summarize_metrics(errors: list[str]) -> dict[str, Any]:
|
||||
_collect_trend_metrics(metrics, errors)
|
||||
_collect_alert_metrics(metrics, errors)
|
||||
_collect_namespace_metrics(metrics, errors)
|
||||
_collect_issue_metrics(metrics, errors)
|
||||
metrics["pvc_usage_top"] = _pvc_usage(errors)
|
||||
metrics["trend_summary"] = _trend_summary(metrics)
|
||||
_finalize_metrics(metrics)
|
||||
@ -2389,6 +2431,7 @@ def _build_offenders(metrics: dict[str, Any]) -> dict[str, Any]:
|
||||
offenders["pod_terminated_now"] = metrics.get("pod_terminated_now") or {}
|
||||
offenders["job_failures_24h"] = metrics.get("job_failures_24h") or []
|
||||
offenders["pvc_pressure"] = _pvc_pressure_entries(metrics)
|
||||
offenders["namespace_issues"] = metrics.get("namespace_issue_top") or {}
|
||||
return offenders
|
||||
|
||||
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user