atlasbot: add job failure metrics to snapshot
This commit is contained in:
parent
281118b810
commit
e73c1a4e1c
@@ -1083,6 +1083,9 @@ def _summarize_metrics(errors: list[str]) -> dict[str, Any]:
|
||||
"topk(5, sum by (namespace,pod) (container_memory_working_set_bytes{namespace!=\"\"}))"
|
||||
)
|
||||
)
|
||||
metrics["job_failures_24h"] = _vm_vector(
|
||||
"topk(5, sum by (namespace,job_name) (increase(kube_job_status_failed[24h])))"
|
||||
)
|
||||
except Exception as exc:
|
||||
errors.append(f"vm: {exc}")
|
||||
metrics["postgres_connections"] = _postgres_connections(errors)
|
||||
@@ -1118,6 +1121,7 @@ def _summarize_metrics(errors: list[str]) -> dict[str, Any]:
|
||||
"restarts": "count",
|
||||
"pod_cpu": "cores",
|
||||
"pod_mem": "bytes",
|
||||
"job_failures_24h": "count",
|
||||
"namespace_cpu": "cores",
|
||||
"namespace_mem": "bytes",
|
||||
"pvc_used_percent": "percent",
|
||||
|
||||
@@ -126,6 +126,7 @@ def test_collect_cluster_state(monkeypatch) -> None:
|
||||
assert snapshot["metrics"]["namespace_mem_top"] == []
|
||||
assert snapshot["metrics"]["pod_cpu_top"] == []
|
||||
assert snapshot["metrics"]["pod_mem_top"] == []
|
||||
assert snapshot["metrics"]["job_failures_24h"] == []
|
||||
assert snapshot["metrics"]["pvc_usage_top"] == []
|
||||
assert summary.nodes_total == 2
|
||||
assert summary.nodes_ready == 1
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user