#!/usr/bin/env sh
# Probe a fixed set of HTTP health endpoints ("suites") and record one ok/failed
# run per suite as platform_quality_gate_runs_total counters in a Prometheus
# Pushgateway. The script always exits 0; failures are surfaced via the metrics
# and stderr log lines only.
#
# Environment:
#   PUSHGATEWAY_URL       Base URL of the Pushgateway.
#   PUSHGATEWAY_JOB       Job label used in the push grouping key.
#   HTTP_TIMEOUT_SECONDS  Max seconds for each curl request (default 12).
set -eu

PUSHGATEWAY_URL="${PUSHGATEWAY_URL:-http://platform-quality-gateway.monitoring.svc.cluster.local:9091}"
# NOTE(review): the original push command was lost when this file was mangled
# (only "cat </dev/null" survived), so the job name below is a reconstruction.
# Confirm it against the dashboards/alerts that read these counters.
PUSHGATEWAY_JOB="${PUSHGATEWAY_JOB:-platform-quality-suite-probe}"
HTTP_TIMEOUT_SECONDS="${HTTP_TIMEOUT_SECONDS:-12}"

# Print the current value of platform_quality_gate_runs_total for one
# suite/status pair, as read from the Pushgateway's /metrics page.
# Arguments: $1 - suite label value, $2 - status label value ("ok"/"failed")
# Outputs:   the counter value on stdout; 0 when the series does not exist or
#            the Pushgateway cannot be reached (best-effort by design).
# Note: plain POSIX sh has no `local`; callers invoke this inside $(...) so the
# variables set here do not leak into the calling shell.
fetch_counter() {
  suite="$1"
  status="$2"
  # -m bounds the request so an unresponsive Pushgateway cannot hang the job.
  # Errors are deliberately swallowed: a missing reading is treated as 0.
  line="$(curl -fsS -m "${HTTP_TIMEOUT_SECONDS}" "${PUSHGATEWAY_URL}/metrics" 2>/dev/null | awk -v suite="$suite" -v status="$status" '
    /^platform_quality_gate_runs_total\{/ {
      if (index($0, "suite=\"" suite "\"") && index($0, "status=\"" status "\"")) {
        print $0
        exit
      }
    }
  ' || true)"
  if [ -z "${line}" ]; then
    printf '0\n'
    return 0
  fi
  # Second whitespace-separated field of an exposition line is the sample
  # value; "+ 0" coerces it to a number.
  printf '%s\n' "${line}" | awk '{print $2 + 0}'
}

# Increment the ok or failed counter for a suite and push both counters back
# to the Pushgateway (read-modify-write; not atomic across concurrent runs).
# Arguments: $1 - suite name, $2 - outcome ("ok" increments ok, anything else
#            increments failed)
# Returns:   curl's exit status for the push.
push_suite_counters() {
  suite="$1"
  outcome="$2"
  ok_count="$(fetch_counter "${suite}" "ok")"
  failed_count="$(fetch_counter "${suite}" "failed")"
  if [ "${outcome}" = "ok" ]; then
    ok_count=$((ok_count + 1))
  else
    failed_count=$((failed_count + 1))
  fi
  # The suite is part of the grouping key so that pushing one suite's counters
  # does not replace another suite's series of the same metric name.
  cat <<METRICS | curl -fsS -m "${HTTP_TIMEOUT_SECONDS}" --data-binary @- "${PUSHGATEWAY_URL}/metrics/job/${PUSHGATEWAY_JOB}/suite/${suite}" >/dev/null
# TYPE platform_quality_gate_runs_total counter
platform_quality_gate_runs_total{suite="${suite}",status="ok"} ${ok_count}
platform_quality_gate_runs_total{suite="${suite}",status="failed"} ${failed_count}
METRICS
}

# Probe one HTTP endpoint, record the outcome in the Pushgateway and log it.
# Arguments: $1 - suite name
#            $2 - URL to request
#            $3 - expected HTTP status code
#            $4 - optional grep pattern (basic regex) the body must match
# Outputs:   one "[probe]" line; stdout on success, stderr on failure
# Returns:   0 when the status code (and body pattern, if given) match, else 1
check_http_suite() {
  suite="$1"
  url="$2"
  expected_code="$3"
  body_match="${4:-}"
  # Callers use "check_http_suite ... || ...", which disables errexit inside
  # this function, so a mktemp failure has to be handled explicitly.
  body_file="$(mktemp)" || {
    printf '[probe] suite=%s could not create temp file\n' "${suite}" >&2
    return 1
  }
  # "|| true": a transport error leaves code as curl's "000", which simply
  # fails the comparison below instead of aborting the script.
  code="$(curl -ksS -m "${HTTP_TIMEOUT_SECONDS}" -o "${body_file}" -w '%{http_code}' "${url}" || true)"
  outcome="failed"
  if [ "${code}" = "${expected_code}" ]; then
    if [ -z "${body_match}" ] || grep -q -- "${body_match}" "${body_file}"; then
      outcome="ok"
    fi
  fi
  rm -f "${body_file}"
  # Metrics delivery is best-effort: warn, but still report the probe result.
  push_suite_counters "${suite}" "${outcome}" ||
    printf '[probe] suite=%s warning: pushing counters failed\n' "${suite}" >&2
  if [ "${outcome}" = "ok" ]; then
    printf '[probe] suite=%s outcome=ok url=%s\n' "${suite}" "${url}"
    return 0
  fi
  printf '[probe] suite=%s outcome=failed url=%s code=%s\n' "${suite}" "${url}" "${code}" >&2
  return 1
}

failures=0
check_http_suite "atlasbot" "http://atlasbot.comms.svc.cluster.local:8090/health" "200" '"status": "ok"' || failures=$((failures + 1))
check_http_suite "pegasus" "http://pegasus.jellyfin.svc.cluster.local/healthz" "200" || failures=$((failures + 1))
check_http_suite "bstein_home" "http://bstein-dev-home-backend.bstein-dev-home.svc.cluster.local/api/healthz" "200" || failures=$((failures + 1))

if [ "${failures}" -gt 0 ]; then
  printf '[probe] completed with %s suite failure(s)\n' "${failures}" >&2
else
  printf '[probe] completed with all suites passing\n'
fi

# Report failures through metrics, not Job failure retries.
exit 0