titan-iac/services/monitoring/scripts/platform_quality_suite_probe.sh

87 lines
2.5 KiB
Bash
Raw Permalink Normal View History

#!/usr/bin/env sh
# Probes a fixed list of in-cluster HTTP health endpoints and publishes
# per-suite ok/failed run counters to a Pushgateway.
#
# Optional environment:
#   PUSHGATEWAY_URL       base URL of the Pushgateway
#   HTTP_TIMEOUT_SECONDS  per-request curl timeout, in seconds
set -eu
# ":=" assigns the default only when the variable is unset or empty.
: "${PUSHGATEWAY_URL:=http://platform-quality-gateway.monitoring.svc.cluster.local:9091}"
: "${HTTP_TIMEOUT_SECONDS:=12}"
# Print the current value of platform_quality_gate_runs_total for one
# suite/status pair, as exposed on the Pushgateway /metrics page.
# Globals:   PUSHGATEWAY_URL (read), HTTP_TIMEOUT_SECONDS (read)
# Arguments: $1 - suite label value
#            $2 - status label value ("ok" or "failed")
# Outputs:   the value of the first sample line whose text contains both
#            labels, or 0 when the fetch fails or no sample matches
# Returns:   0
fetch_counter() {
  # -m bounds the request so an unresponsive Pushgateway cannot hang the probe.
  # curl's stderr is discarded on purpose: an unreachable gateway is treated as
  # "no previous value" rather than as an error.
  curl -fsS -m "${HTTP_TIMEOUT_SECONDS}" "${PUSHGATEWAY_URL}/metrics" 2>/dev/null |
    awk -v suite="$1" -v status="$2" '
      index($0, "platform_quality_gate_runs_total{") == 1 &&
      index($0, "suite=\"" suite "\"") &&
      index($0, "status=\"" status "\"") {
        # "+ 0" forces numeric conversion, so 1e+06 prints as 1000000.
        value = $2 + 0
        found = 1
        exit
      }
      END { print (found ? value : 0) }
    '
}
# Read the current ok/failed counters for a suite, increment the one that
# matches this run's outcome, and PUT both back to the Pushgateway.
# Globals:   PUSHGATEWAY_URL (read), HTTP_TIMEOUT_SECONDS (read)
# Arguments: $1 - suite label value (also used in the grouping-key URL)
#            $2 - outcome; "ok" bumps the ok counter, anything else bumps failed
# Returns:   curl's exit status for the PUT
push_suite_counters() {
  suite="$1"
  outcome="$2"
  ok_count="$(fetch_counter "${suite}" "ok")"
  failed_count="$(fetch_counter "${suite}" "failed")"
  # $(( )) treats a non-integer operand as a fatal error in some shells, which
  # would abort the whole probe; restart the counter from 0 instead.
  case "${ok_count}" in
    '' | *[!0-9]*) ok_count=0 ;;
  esac
  case "${failed_count}" in
    '' | *[!0-9]*) failed_count=0 ;;
  esac
  if [ "${outcome}" = "ok" ]; then
    ok_count=$((ok_count + 1))
  else
    failed_count=$((failed_count + 1))
  fi
  # -m bounds the request so an unresponsive Pushgateway cannot hang the probe.
  # Both samples are sent together because the PUT carries the whole group.
  curl -fsS -m "${HTTP_TIMEOUT_SECONDS}" -X PUT --data-binary @- \
    "${PUSHGATEWAY_URL}/metrics/job/platform-quality-suite-probe/suite/${suite}" \
    >/dev/null <<METRICS
# TYPE platform_quality_gate_runs_total counter
platform_quality_gate_runs_total{suite="${suite}",status="ok"} ${ok_count}
platform_quality_gate_runs_total{suite="${suite}",status="failed"} ${failed_count}
METRICS
}
# Probe one HTTP endpoint, record the result via push_suite_counters, and log
# a one-line summary.
# Globals:   HTTP_TIMEOUT_SECONDS (read)
# Arguments: $1 - suite label used for the metric and the log line
#            $2 - URL to request
#            $3 - HTTP status code that counts as healthy
#            $4 - optional grep pattern that must match the response body
# Outputs:   success line on stdout, failure line on stderr
# Returns:   0 when the suite passed, 1 otherwise
check_http_suite() {
  probe_suite="$1"
  probe_url="$2"
  want_code="$3"
  want_body="${4:-}"
  response_file="$(mktemp)"
  # curl prints only the status code on stdout (-w); "|| true" keeps a
  # transport error from aborting the script, so it is scored as a failed probe.
  got_code="$(curl -ksS -m "${HTTP_TIMEOUT_SECONDS}" -o "${response_file}" -w '%{http_code}' "${probe_url}" || true)"
  probe_result="failed"
  # The status code must match; the body is only inspected when a pattern was
  # supplied.
  if [ "${got_code}" = "${want_code}" ] &&
    { [ -z "${want_body}" ] || grep -q -- "${want_body}" "${response_file}"; }; then
    probe_result="ok"
  fi
  rm -f "${response_file}"
  push_suite_counters "${probe_suite}" "${probe_result}"
  if [ "${probe_result}" != "ok" ]; then
    printf '[probe] suite=%s outcome=failed url=%s code=%s\n' "${probe_suite}" "${probe_url}" "${got_code}" >&2
    return 1
  fi
  printf '[probe] suite=%s outcome=ok url=%s\n' "${probe_suite}" "${probe_url}"
  return 0
}
# Run every suite probe. A failing probe only bumps the tally; using the call
# as an `if` condition keeps `set -e` from aborting on its non-zero status.
failures=0
if ! check_http_suite "atlasbot" "http://atlasbot.comms.svc.cluster.local:8090/health" "200" '"status": "ok"'; then
  failures=$((failures + 1))
fi
if ! check_http_suite "pegasus" "http://pegasus.jellyfin.svc.cluster.local/healthz" "200"; then
  failures=$((failures + 1))
fi
if ! check_http_suite "bstein_home" "http://bstein-dev-home-backend.bstein-dev-home.svc.cluster.local/api/healthz" "200"; then
  failures=$((failures + 1))
fi
if [ "${failures}" -eq 0 ]; then
  printf '[probe] completed with all suites passing\n'
else
  printf '[probe] completed with %s suite failure(s)\n' "${failures}" >&2
fi
# Suite failures are reported through the pushed metrics; the script always
# exits 0 so a failed suite does not trigger Job failure retries.
exit 0