atlasbot: add job failure metrics to snapshot

This commit is contained in:
Brad Stein 2026-01-29 03:00:00 -03:00
parent 281118b810
commit e73c1a4e1c
2 changed files with 5 additions and 0 deletions

View File

@@ -1083,6 +1083,9 @@ def _summarize_metrics(errors: list[str]) -> dict[str, Any]:
"topk(5, sum by (namespace,pod) (container_memory_working_set_bytes{namespace!=\"\"}))"
)
)
metrics["job_failures_24h"] = _vm_vector(
"topk(5, sum by (namespace,job_name) (increase(kube_job_status_failed[24h])))"
)
except Exception as exc:
errors.append(f"vm: {exc}")
metrics["postgres_connections"] = _postgres_connections(errors)
@@ -1118,6 +1121,7 @@ def _summarize_metrics(errors: list[str]) -> dict[str, Any]:
"restarts": "count",
"pod_cpu": "cores",
"pod_mem": "bytes",
"job_failures_24h": "count",
"namespace_cpu": "cores",
"namespace_mem": "bytes",
"pvc_used_percent": "percent",

View File

@@ -126,6 +126,7 @@ def test_collect_cluster_state(monkeypatch) -> None:
assert snapshot["metrics"]["namespace_mem_top"] == []
assert snapshot["metrics"]["pod_cpu_top"] == []
assert snapshot["metrics"]["pod_mem_top"] == []
assert snapshot["metrics"]["job_failures_24h"] == []
assert snapshot["metrics"]["pvc_usage_top"] == []
assert summary.nodes_total == 2
assert summary.nodes_ready == 1