cluster state: add issue hot spots and trends
This commit is contained in:
parent
8571ef6f4d
commit
a5b35848d0
@ -89,6 +89,7 @@ _PVC_PRESSURE_THRESHOLD = 80.0
|
|||||||
_ALERT_TOP_LIMIT = 10
|
_ALERT_TOP_LIMIT = 10
|
||||||
_POD_REASON_LIMIT = 10
|
_POD_REASON_LIMIT = 10
|
||||||
_POD_REASON_TREND_LIMIT = 10
|
_POD_REASON_TREND_LIMIT = 10
|
||||||
|
# Number of namespaces requested per issue category; passed as the topk() limit
# when building the per-namespace issue queries.
_NAMESPACE_ISSUE_LIMIT = 8
|
||||||
_POD_TERMINATED_REASONS = {
|
_POD_TERMINATED_REASONS = {
|
||||||
"oom_killed": "OOMKilled",
|
"oom_killed": "OOMKilled",
|
||||||
"error": "Error",
|
"error": "Error",
|
||||||
@ -1700,6 +1701,12 @@ def _pod_reason_entries(expr: str, limit: int) -> list[dict[str, Any]]:
|
|||||||
return output
|
return output
|
||||||
|
|
||||||
|
|
||||||
|
def _namespace_reason_entries(expr: str, limit: int) -> list[dict[str, Any]]:
    """Return the top namespaces for *expr*, aggregated per namespace.

    Wraps *expr* in ``topk(limit, sum by (namespace) (...))``, runs the query
    through ``_vm_vector``, passes the result through
    ``_filter_namespace_vector`` and converts it with ``_vector_to_named``.

    The namespace filter is applied after ``topk``, so any entries it drops
    are not back-filled and the result may hold fewer than *limit* items.
    """
    query = f"topk({limit}, sum by (namespace) ({expr}))"
    ranked = _filter_namespace_vector(_vm_vector(query))
    return _vector_to_named(ranked, "namespace", "namespace")
|
||||||
|
|
||||||
|
|
||||||
def _pod_waiting_now() -> dict[str, list[dict[str, Any]]]:
|
def _pod_waiting_now() -> dict[str, list[dict[str, Any]]]:
|
||||||
output: dict[str, list[dict[str, Any]]] = {}
|
output: dict[str, list[dict[str, Any]]] = {}
|
||||||
for key, reason in _POD_WAITING_REASONS.items():
|
for key, reason in _POD_WAITING_REASONS.items():
|
||||||
@ -2215,6 +2222,40 @@ def _collect_trend_metrics(metrics: dict[str, Any], errors: list[str]) -> None:
|
|||||||
errors.append(f"trends: {exc}")
|
errors.append(f"trends: {exc}")
|
||||||
|
|
||||||
|
|
||||||
|
def _collect_issue_metrics(metrics: dict[str, Any], errors: list[str]) -> None:
    """Fill ``metrics["namespace_issue_top"]`` with per-namespace issue entries.

    For each issue category, queries the matching container status series
    filtered by its ``reason`` label via ``_namespace_reason_entries``, asking
    for ``_NAMESPACE_ISSUE_LIMIT`` namespaces.

    The mapping is assigned only when every query succeeds. If any query
    raises, ``metrics`` is left untouched and an ``"issues: ..."`` message is
    appended to *errors* instead of propagating the exception.
    """
    waiting_series = "kube_pod_container_status_waiting_reason"
    terminated_series = "kube_pod_container_status_terminated_reason"
    # (output key, metric series, reason label value), in output order.
    issue_specs = (
        ("crash_loop", waiting_series, "CrashLoopBackOff"),
        ("image_pull", waiting_series, "ImagePullBackOff"),
        ("err_image_pull", waiting_series, "ErrImagePull"),
        ("config_error", waiting_series, "CreateContainerConfigError"),
        ("oom_killed", terminated_series, "OOMKilled"),
        ("terminated_error", terminated_series, "Error"),
    )
    try:
        # Build the whole mapping before assigning so a failure part-way
        # through never leaves a partially populated entry in ``metrics``.
        metrics["namespace_issue_top"] = {
            key: _namespace_reason_entries(
                f'{series}{{reason="{reason}"}}',
                _NAMESPACE_ISSUE_LIMIT,
            )
            for key, series, reason in issue_specs
        }
    except Exception as exc:
        errors.append(f"issues: {exc}")
|
||||||
|
|
||||||
|
|
||||||
def _collect_alert_metrics(metrics: dict[str, Any], errors: list[str]) -> None:
|
def _collect_alert_metrics(metrics: dict[str, Any], errors: list[str]) -> None:
|
||||||
try:
|
try:
|
||||||
vm_now = _vm_alerts_now()
|
vm_now = _vm_alerts_now()
|
||||||
@ -2347,6 +2388,7 @@ def _summarize_metrics(errors: list[str]) -> dict[str, Any]:
|
|||||||
_collect_trend_metrics(metrics, errors)
|
_collect_trend_metrics(metrics, errors)
|
||||||
_collect_alert_metrics(metrics, errors)
|
_collect_alert_metrics(metrics, errors)
|
||||||
_collect_namespace_metrics(metrics, errors)
|
_collect_namespace_metrics(metrics, errors)
|
||||||
|
_collect_issue_metrics(metrics, errors)
|
||||||
metrics["pvc_usage_top"] = _pvc_usage(errors)
|
metrics["pvc_usage_top"] = _pvc_usage(errors)
|
||||||
metrics["trend_summary"] = _trend_summary(metrics)
|
metrics["trend_summary"] = _trend_summary(metrics)
|
||||||
_finalize_metrics(metrics)
|
_finalize_metrics(metrics)
|
||||||
@ -2389,6 +2431,7 @@ def _build_offenders(metrics: dict[str, Any]) -> dict[str, Any]:
|
|||||||
offenders["pod_terminated_now"] = metrics.get("pod_terminated_now") or {}
|
offenders["pod_terminated_now"] = metrics.get("pod_terminated_now") or {}
|
||||||
offenders["job_failures_24h"] = metrics.get("job_failures_24h") or []
|
offenders["job_failures_24h"] = metrics.get("job_failures_24h") or []
|
||||||
offenders["pvc_pressure"] = _pvc_pressure_entries(metrics)
|
offenders["pvc_pressure"] = _pvc_pressure_entries(metrics)
|
||||||
|
offenders["namespace_issues"] = metrics.get("namespace_issue_top") or {}
|
||||||
return offenders
|
return offenders
|
||||||
|
|
||||||
|
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user