diff --git a/ariadne/services/cluster_state.py b/ariadne/services/cluster_state.py index 669a210..0f16e44 100644 --- a/ariadne/services/cluster_state.py +++ b/ariadne/services/cluster_state.py @@ -64,6 +64,7 @@ _PHASE_SEVERITY = { _PENDING_15M_HOURS = 0.25 _LOAD_TOP_COUNT = 5 _NAMESPACE_TOP_COUNT = 5 +_PVC_PRESSURE_THRESHOLD = 80.0 def _node_usage_by_hardware(node_load: list[dict[str, Any]], node_details: list[dict[str, Any]]) -> list[dict[str, Any]]: @@ -1939,6 +1940,17 @@ def _build_anomalies( events: dict[str, Any], ) -> list[dict[str, Any]]: anomalies: list[dict[str, Any]] = [] + _append_pod_anomalies(anomalies, metrics) + _append_workload_anomalies(anomalies, workloads_health) + _append_flux_anomalies(anomalies, kustomizations) + _append_job_failure_anomalies(anomalies, metrics) + _append_pvc_anomalies(anomalies, metrics) + _append_node_anomalies(anomalies, nodes_summary) + _append_event_anomalies(anomalies, events) + return anomalies + + +def _append_pod_anomalies(anomalies: list[dict[str, Any]], metrics: dict[str, Any]) -> None: pods_pending = metrics.get("pods_pending") or 0 pods_failed = metrics.get("pods_failed") or 0 if pods_pending: @@ -1957,6 +1969,11 @@ def _build_anomalies( "summary": f"{int(pods_failed)} pods failed", } ) + + +def _append_workload_anomalies( + anomalies: list[dict[str, Any]], workloads_health: dict[str, Any] +) -> None: for key in ("deployments", "statefulsets", "daemonsets"): entry = workloads_health.get(key) if isinstance(workloads_health.get(key), dict) else {} not_ready = entry.get("not_ready") or 0 @@ -1969,6 +1986,9 @@ def _build_anomalies( "items": entry.get("items"), } ) + + +def _append_flux_anomalies(anomalies: list[dict[str, Any]], kustomizations: dict[str, Any]) -> None: flux_not_ready = (kustomizations or {}).get("not_ready") or 0 if flux_not_ready: anomalies.append( @@ -1979,6 +1999,9 @@ def _build_anomalies( "items": (kustomizations or {}).get("items"), } ) + + +def _append_job_failure_anomalies(anomalies: list[dict[str, Any]], metrics: dict[str, Any]) -> None: job_failures = metrics.get("job_failures_24h") or [] job_failures = [ entry for entry in job_failures if isinstance(entry, dict) and (entry.get("value") or 0) > 0 @@ -1992,53 +2015,73 @@ def _build_anomalies( "items": job_failures[:5], } ) - pvc_top = _pvc_top(metrics.get("pvc_usage_top") or []) - pvc_pressure = [entry for entry in pvc_top if (entry.get("used_percent") or 0) >= 80] + + +def _append_pvc_anomalies(anomalies: list[dict[str, Any]], metrics: dict[str, Any]) -> None: + pvc_pressure = _pvc_pressure_entries(metrics) if pvc_pressure: anomalies.append( { "kind": "pvc_pressure", "severity": "warning", - "summary": "PVCs above 80% usage", + "summary": f"PVCs above {_PVC_PRESSURE_THRESHOLD:.0f}% usage", "items": pvc_pressure[:5], } ) - if nodes_summary: - pressure_nodes = nodes_summary.get("pressure_nodes") or {} - flagged = [ - name for names in pressure_nodes.values() if isinstance(names, list) for name in names if name - ] - if flagged: - anomalies.append( - { - "kind": "node_pressure", - "severity": "warning", - "summary": f"{len(flagged)} nodes report pressure", - "items": sorted(set(flagged)), - } - ) - unschedulable = nodes_summary.get("unschedulable_nodes") or [] - if unschedulable: - anomalies.append( - { - "kind": "unschedulable_nodes", - "severity": "info", - "summary": f"{len(unschedulable)} nodes unschedulable", - "items": unschedulable, - } - ) - if events: - warnings = events.get("warnings_total") or 0 - if warnings: - anomalies.append( - { - "kind": "event_warnings", - "severity": "info", - "summary": f"{int(warnings)} warning events", - "items": events.get("warnings") or [], - } - ) - return anomalies + + +def _pvc_pressure_entries(metrics: dict[str, Any]) -> list[dict[str, Any]]: + pvc_top = _pvc_top(metrics.get("pvc_usage_top") or []) + return [ + entry + for entry in pvc_top + if isinstance(entry, dict) + and isinstance(entry.get("used_percent"), (int, float)) + and float(entry.get("used_percent") or 0) >= _PVC_PRESSURE_THRESHOLD + ] + + +def _append_node_anomalies(anomalies: list[dict[str, Any]], nodes_summary: dict[str, Any]) -> None: + if not nodes_summary: + return + pressure_nodes = nodes_summary.get("pressure_nodes") or {} + flagged = [ + name for names in pressure_nodes.values() if isinstance(names, list) for name in names if name + ] + if flagged: + anomalies.append( + { + "kind": "node_pressure", + "severity": "warning", + "summary": f"{len(flagged)} nodes report pressure", + "items": sorted(set(flagged)), + } + ) + unschedulable = nodes_summary.get("unschedulable_nodes") or [] + if unschedulable: + anomalies.append( + { + "kind": "unschedulable_nodes", + "severity": "info", + "summary": f"{len(unschedulable)} nodes unschedulable", + "items": unschedulable, + } + ) + + +def _append_event_anomalies(anomalies: list[dict[str, Any]], events: dict[str, Any]) -> None: + if not events: + return + warnings = events.get("warnings_total") or 0 + if warnings: + anomalies.append( + { + "kind": "event_warnings", + "severity": "info", + "summary": f"{int(warnings)} warning events", + "items": events.get("warnings") or [], + } + ) def _health_bullets(