atlasbot: add job failure metrics to snapshot
This commit is contained in:
parent
281118b810
commit
e73c1a4e1c
@@ -1083,6 +1083,9 @@ def _summarize_metrics(errors: list[str]) -> dict[str, Any]:
|
|||||||
"topk(5, sum by (namespace,pod) (container_memory_working_set_bytes{namespace!=\"\"}))"
|
"topk(5, sum by (namespace,pod) (container_memory_working_set_bytes{namespace!=\"\"}))"
|
||||||
)
|
)
|
||||||
)
|
)
|
||||||
|
metrics["job_failures_24h"] = _vm_vector(
|
||||||
|
"topk(5, sum by (namespace,job_name) (increase(kube_job_status_failed[24h])))"
|
||||||
|
)
|
||||||
except Exception as exc:
|
except Exception as exc:
|
||||||
errors.append(f"vm: {exc}")
|
errors.append(f"vm: {exc}")
|
||||||
metrics["postgres_connections"] = _postgres_connections(errors)
|
metrics["postgres_connections"] = _postgres_connections(errors)
|
||||||
@@ -1118,6 +1121,7 @@ def _summarize_metrics(errors: list[str]) -> dict[str, Any]:
|
|||||||
"restarts": "count",
|
"restarts": "count",
|
||||||
"pod_cpu": "cores",
|
"pod_cpu": "cores",
|
||||||
"pod_mem": "bytes",
|
"pod_mem": "bytes",
|
||||||
|
"job_failures_24h": "count",
|
||||||
"namespace_cpu": "cores",
|
"namespace_cpu": "cores",
|
||||||
"namespace_mem": "bytes",
|
"namespace_mem": "bytes",
|
||||||
"pvc_used_percent": "percent",
|
"pvc_used_percent": "percent",
|
||||||
|
|||||||
@@ -126,6 +126,7 @@ def test_collect_cluster_state(monkeypatch) -> None:
|
|||||||
assert snapshot["metrics"]["namespace_mem_top"] == []
|
assert snapshot["metrics"]["namespace_mem_top"] == []
|
||||||
assert snapshot["metrics"]["pod_cpu_top"] == []
|
assert snapshot["metrics"]["pod_cpu_top"] == []
|
||||||
assert snapshot["metrics"]["pod_mem_top"] == []
|
assert snapshot["metrics"]["pod_mem_top"] == []
|
||||||
|
assert snapshot["metrics"]["job_failures_24h"] == []
|
||||||
assert snapshot["metrics"]["pvc_usage_top"] == []
|
assert snapshot["metrics"]["pvc_usage_top"] == []
|
||||||
assert summary.nodes_total == 2
|
assert summary.nodes_total == 2
|
||||||
assert summary.nodes_ready == 1
|
assert summary.nodes_ready == 1
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user