atlasbot: expose job failure metrics

This commit is contained in:
Brad Stein 2026-01-29 03:00:22 -03:00
parent 2e86985fe4
commit 46855343ce

View File

@ -660,6 +660,23 @@ def _append_restarts(lines: list[str], summary: dict[str, Any]) -> None:
lines.append("restarts_1h_top: " + "; ".join(parts)) lines.append("restarts_1h_top: " + "; ".join(parts))
def _append_job_failures(lines: list[str], summary: dict[str, Any]) -> None:
metrics = summary.get("metrics") if isinstance(summary.get("metrics"), dict) else {}
failures = metrics.get("job_failures_24h") if isinstance(metrics.get("job_failures_24h"), list) else []
if not failures:
return
parts = []
for entry in failures:
metric = entry.get("metric") if isinstance(entry, dict) else {}
namespace = metric.get("namespace")
job_name = metric.get("job_name") or metric.get("job")
value = entry.get("value")
if namespace and job_name and value is not None:
parts.append(f"{namespace}/{job_name}={_format_float(value)}")
if parts:
lines.append("job_failures_24h: " + "; ".join(parts))
def _append_postgres(lines: list[str], summary: dict[str, Any]) -> None: def _append_postgres(lines: list[str], summary: dict[str, Any]) -> None:
postgres = summary.get("postgres") if isinstance(summary.get("postgres"), dict) else {} postgres = summary.get("postgres") if isinstance(summary.get("postgres"), dict) else {}
if not postgres: if not postgres:
@ -762,6 +779,7 @@ def summary_text(snapshot: dict[str, Any] | None) -> str:
_append_namespace_usage(lines, summary) _append_namespace_usage(lines, summary)
_append_pod_usage(lines, summary) _append_pod_usage(lines, summary)
_append_restarts(lines, summary) _append_restarts(lines, summary)
_append_job_failures(lines, summary)
_append_postgres(lines, summary) _append_postgres(lines, summary)
_append_hottest(lines, summary) _append_hottest(lines, summary)
_append_pvc_usage(lines, summary) _append_pvc_usage(lines, summary)