181 lines
6.2 KiB
Python
181 lines
6.2 KiB
Python
from __future__ import annotations
|
|
|
|
from typing import Any
|
|
|
|
from .cluster_state_contract import *
|
|
from .cluster_state_relationships import *
|
|
|
|
def _severity_rank(value: Any) -> int:
|
|
if value == "critical":
|
|
return 0
|
|
if value == "warning":
|
|
return 1
|
|
return 2
|
|
|
|
|
|
def _pvc_pressure_signals(metrics: dict[str, Any]) -> list[dict[str, Any]]:
    """Build one signal per PVC whose usage is at or above the pressure threshold.

    Args:
        metrics: Metrics mapping; only its ``"pvc_usage_top"`` entry is read.
            A missing or ``None`` entry is treated as an empty list.

    Returns:
        A list of dicts with the keys ``scope``, ``target``, ``metric``,
        ``current`` and ``severity``, in the order returned by ``_pvc_top``.
        ``severity`` is ``"critical"`` once usage reaches
        ``_PVC_CRITICAL_THRESHOLD`` and ``"warning"`` below it. The list is
        empty when nothing qualifies.
    """
    # `or []` (rather than a .get default) also covers an explicit None value,
    # matching how _pvc_pressure_entries reads the same key.
    pvc_top = _pvc_top(metrics.get("pvc_usage_top") or [])
    if not pvc_top:
        return []

    signals: list[dict[str, Any]] = []
    for entry in pvc_top:
        # Skip malformed rows instead of failing on `.get`.
        if not isinstance(entry, dict):
            continue
        used = entry.get("used_percent")
        if not isinstance(used, (int, float)):
            continue
        # Written as `not >=` so that NaN (which fails every comparison) is
        # dropped here instead of falling through and being labelled critical.
        if not used >= _PVC_PRESSURE_THRESHOLD:
            continue
        signals.append(
            {
                "scope": "pvc",
                "target": f"{entry.get('namespace')}/{entry.get('pvc')}",
                "metric": "used_percent",
                "current": used,
                "severity": "critical" if used >= _PVC_CRITICAL_THRESHOLD else "warning",
            }
        )
    return signals
|
|
|
|
|
|
def _build_anomalies(
    metrics: dict[str, Any],
    nodes_summary: dict[str, Any],
    workloads_health: dict[str, Any],
    kustomizations: dict[str, Any],
    events: dict[str, Any],
) -> list[dict[str, Any]]:
    """Run every anomaly collector and return the combined list.

    Each collector appends to the same list, so the result is ordered by
    collector: pods, workloads, Flux, job failures, PVCs, nodes, events.
    """
    collected: list[dict[str, Any]] = []
    # (collector, input) pairs; tuple order defines the output order.
    collectors = (
        (_append_pod_anomalies, metrics),
        (_append_workload_anomalies, workloads_health),
        (_append_flux_anomalies, kustomizations),
        (_append_job_failure_anomalies, metrics),
        (_append_pvc_anomalies, metrics),
        (_append_node_anomalies, nodes_summary),
        (_append_event_anomalies, events),
    )
    for collect, payload in collectors:
        collect(collected, payload)
    return collected
|
|
|
|
|
|
def _append_pod_anomalies(anomalies: list[dict[str, Any]], metrics: dict[str, Any]) -> None:
|
|
pods_pending = metrics.get("pods_pending") or 0
|
|
pods_failed = metrics.get("pods_failed") or 0
|
|
if pods_pending:
|
|
anomalies.append(
|
|
{
|
|
"kind": "pods_pending",
|
|
"severity": "warning",
|
|
"summary": f"{int(pods_pending)} pods pending",
|
|
}
|
|
)
|
|
if pods_failed:
|
|
anomalies.append(
|
|
{
|
|
"kind": "pods_failed",
|
|
"severity": "critical",
|
|
"summary": f"{int(pods_failed)} pods failed",
|
|
}
|
|
)
|
|
|
|
|
|
def _append_workload_anomalies(anomalies: list[dict[str, Any]], workloads_health: dict[str, Any]) -> None:
|
|
for key in ("deployments", "statefulsets", "daemonsets"):
|
|
entry = workloads_health.get(key) if isinstance(workloads_health.get(key), dict) else {}
|
|
not_ready = entry.get("not_ready") or 0
|
|
if not_ready:
|
|
anomalies.append(
|
|
{
|
|
"kind": f"{key}_not_ready",
|
|
"severity": "warning",
|
|
"summary": f"{int(not_ready)} {key} not ready",
|
|
"items": entry.get("items"),
|
|
}
|
|
)
|
|
|
|
|
|
def _append_flux_anomalies(anomalies: list[dict[str, Any]], kustomizations: dict[str, Any]) -> None:
|
|
flux_not_ready = (kustomizations or {}).get("not_ready") or 0
|
|
if flux_not_ready:
|
|
anomalies.append(
|
|
{
|
|
"kind": "flux_not_ready",
|
|
"severity": "warning",
|
|
"summary": f"{int(flux_not_ready)} Flux kustomizations not ready",
|
|
"items": (kustomizations or {}).get("items"),
|
|
}
|
|
)
|
|
|
|
|
|
def _append_job_failure_anomalies(anomalies: list[dict[str, Any]], metrics: dict[str, Any]) -> None:
|
|
job_failures = metrics.get("job_failures_24h") or []
|
|
job_failures = [
|
|
entry for entry in job_failures if isinstance(entry, dict) and (entry.get("value") or 0) > 0
|
|
]
|
|
if job_failures:
|
|
anomalies.append(
|
|
{
|
|
"kind": "job_failures_24h",
|
|
"severity": "warning",
|
|
"summary": "Job failures in last 24h",
|
|
"items": job_failures[:5],
|
|
}
|
|
)
|
|
|
|
|
|
def _append_pvc_anomalies(anomalies: list[dict[str, Any]], metrics: dict[str, Any]) -> None:
    """Append a single warning when any PVC is at or above the pressure threshold.

    The qualifying entries come from ``_pvc_pressure_entries(metrics)``; at
    most the first five are attached as ``"items"``.
    """
    over_threshold = _pvc_pressure_entries(metrics)
    if not over_threshold:
        return

    headline = f"PVCs above {_PVC_PRESSURE_THRESHOLD:.0f}% usage"
    anomalies.append(
        {
            "kind": "pvc_pressure",
            "severity": "warning",
            "summary": headline,
            "items": over_threshold[:5],
        }
    )
|
|
|
|
|
|
def _pvc_pressure_entries(metrics: dict[str, Any]) -> list[dict[str, Any]]:
    """Return the ``_pvc_top`` entries whose ``used_percent`` meets the pressure threshold.

    Entries that are not dicts, or whose ``"used_percent"`` is not an int or
    float, are dropped. Order follows the output of ``_pvc_top``.
    """
    selected: list[dict[str, Any]] = []
    for candidate in _pvc_top(metrics.get("pvc_usage_top") or []):
        if not isinstance(candidate, dict):
            continue
        used = candidate.get("used_percent")
        if not isinstance(used, (int, float)):
            continue
        if float(used) >= _PVC_PRESSURE_THRESHOLD:
            selected.append(candidate)
    return selected
|
|
|
|
|
|
def _append_node_anomalies(anomalies: list[dict[str, Any]], nodes_summary: dict[str, Any]) -> None:
|
|
if not nodes_summary:
|
|
return
|
|
pressure_nodes = nodes_summary.get("pressure_nodes") or {}
|
|
flagged = [
|
|
name for names in pressure_nodes.values() if isinstance(names, list) for name in names if name
|
|
]
|
|
if flagged:
|
|
anomalies.append(
|
|
{
|
|
"kind": "node_pressure",
|
|
"severity": "warning",
|
|
"summary": f"{len(flagged)} nodes report pressure",
|
|
"items": sorted(set(flagged)),
|
|
}
|
|
)
|
|
unschedulable = nodes_summary.get("unschedulable_nodes") or []
|
|
if unschedulable:
|
|
anomalies.append(
|
|
{
|
|
"kind": "unschedulable_nodes",
|
|
"severity": "info",
|
|
"summary": f"{len(unschedulable)} nodes unschedulable",
|
|
"items": unschedulable,
|
|
}
|
|
)
|
|
|
|
|
|
def _append_event_anomalies(anomalies: list[dict[str, Any]], events: dict[str, Any]) -> None:
|
|
if not events:
|
|
return
|
|
warnings = events.get("warnings_total") or 0
|
|
if warnings:
|
|
anomalies.append(
|
|
{
|
|
"kind": "event_warnings",
|
|
"severity": "info",
|
|
"summary": f"{int(warnings)} warning events",
|
|
"items": events.get("warnings") or [],
|
|
}
|
|
)
|
|
|
|
# Export list computed from the module namespace as it stands at this point:
# every name with exactly one leading underscore (the helpers defined in this
# file, plus any such names brought in by the star imports at the top), and
# additionally "ClusterStateSummary" and "SignalContext" if they are present.
# Dunder names are excluded.
# NOTE(review): presumably this exists so `from <this module> import *` in a
# sibling module forwards the private helpers, which a star import would
# otherwise skip — confirm against the importing modules.
# NOTE(review): this must stay the last statement; names defined after it are
# not picked up.
__all__ = [name for name in globals() if (name.startswith("_") and not name.startswith("__")) or name in {"ClusterStateSummary", "SignalContext"}]
|