From e73c1a4e1ca4fffa8d2d6cb4de972f8aeb8005f2 Mon Sep 17 00:00:00 2001 From: Brad Stein Date: Thu, 29 Jan 2026 03:00:00 -0300 Subject: [PATCH] atlasbot: add job failure metrics to snapshot --- ariadne/services/cluster_state.py | 4 ++++ tests/test_cluster_state.py | 1 + 2 files changed, 5 insertions(+) diff --git a/ariadne/services/cluster_state.py b/ariadne/services/cluster_state.py index 9f9c0ce..8df5b46 100644 --- a/ariadne/services/cluster_state.py +++ b/ariadne/services/cluster_state.py @@ -1083,6 +1083,9 @@ def _summarize_metrics(errors: list[str]) -> dict[str, Any]: "topk(5, sum by (namespace,pod) (container_memory_working_set_bytes{namespace!=\"\"}))" ) ) + metrics["job_failures_24h"] = _vm_vector( + "topk(5, sum by (namespace,job_name) (increase(kube_job_status_failed[24h])))" + ) except Exception as exc: errors.append(f"vm: {exc}") metrics["postgres_connections"] = _postgres_connections(errors) @@ -1118,6 +1121,7 @@ def _summarize_metrics(errors: list[str]) -> dict[str, Any]: "restarts": "count", "pod_cpu": "cores", "pod_mem": "bytes", + "job_failures_24h": "count", "namespace_cpu": "cores", "namespace_mem": "bytes", "pvc_used_percent": "percent", diff --git a/tests/test_cluster_state.py b/tests/test_cluster_state.py index 42f5a4a..b3126cf 100644 --- a/tests/test_cluster_state.py +++ b/tests/test_cluster_state.py @@ -126,6 +126,7 @@ def test_collect_cluster_state(monkeypatch) -> None: assert snapshot["metrics"]["namespace_mem_top"] == [] assert snapshot["metrics"]["pod_cpu_top"] == [] assert snapshot["metrics"]["pod_mem_top"] == [] + assert snapshot["metrics"]["job_failures_24h"] == [] assert snapshot["metrics"]["pvc_usage_top"] == [] assert summary.nodes_total == 2 assert summary.nodes_ready == 1