From 530f4406790cd06292b046fbb984ac3f2751cce1 Mon Sep 17 00:00:00 2001 From: Brad Stein Date: Thu, 9 Apr 2026 20:10:52 -0300 Subject: [PATCH] monitoring: add suite probe metrics and align fan labels --- scripts/dashboards_render_atlas.py | 34 ++++---- .../monitoring/dashboards/atlas-overview.json | 16 ++-- .../monitoring/dashboards/atlas-power.json | 18 ++-- .../grafana-dashboard-overview.yaml | 16 ++-- .../monitoring/grafana-dashboard-power.yaml | 18 ++-- services/monitoring/kustomization.yaml | 7 ++ .../platform-quality-suite-probe-cronjob.yaml | 39 +++++++++ .../scripts/platform_quality_suite_probe.sh | 86 +++++++++++++++++++ 8 files changed, 183 insertions(+), 51 deletions(-) create mode 100644 services/monitoring/platform-quality-suite-probe-cronjob.yaml create mode 100755 services/monitoring/scripts/platform_quality_suite_probe.sh diff --git a/scripts/dashboards_render_atlas.py b/scripts/dashboards_render_atlas.py index 0cf7f466..4438fb4a 100644 --- a/scripts/dashboards_render_atlas.py +++ b/scripts/dashboards_render_atlas.py @@ -1468,10 +1468,10 @@ def build_overview(): decimals=0, text_mode="name_and_value", targets=[ - {"refId": "A", "expr": f"round({CLIMATE_FAN_OUTLET_CURRENT})", "legendFormat": "Outlet", "instant": True}, - {"refId": "B", "expr": f"round({CLIMATE_FAN_INSIDE_INLET_CURRENT})", "legendFormat": "In Inlet", "instant": True}, - {"refId": "C", "expr": f"round({CLIMATE_FAN_OUTSIDE_INLET_CURRENT})", "legendFormat": "Out Inlet", "instant": True}, - {"refId": "D", "expr": f"round({CLIMATE_FAN_INTERIOR_CURRENT})", "legendFormat": "Interior", "instant": True}, + {"refId": "A", "expr": f"round({CLIMATE_FAN_OUTLET_CURRENT})", "legendFormat": "Inside Outlet", "instant": True}, + {"refId": "B", "expr": f"round({CLIMATE_FAN_INSIDE_INLET_CURRENT})", "legendFormat": "Inside Inlet", "instant": True}, + {"refId": "C", "expr": f"round({CLIMATE_FAN_OUTSIDE_INLET_CURRENT})", "legendFormat": "Outside Inlet", "instant": True}, + {"refId": "D", "expr": 
f"round({CLIMATE_FAN_INTERIOR_CURRENT})", "legendFormat": "Interior Fans", "instant": True}, ], thresholds={ "mode": "absolute", @@ -1493,10 +1493,10 @@ def build_overview(): unit="none", max_value=10, targets=[ - {"refId": "A", "expr": CLIMATE_FAN_OUTLET_SERIES, "legendFormat": "Outlet"}, - {"refId": "B", "expr": CLIMATE_FAN_INSIDE_INLET_SERIES, "legendFormat": "In Inlet"}, - {"refId": "C", "expr": CLIMATE_FAN_OUTSIDE_INLET_SERIES, "legendFormat": "Out Inlet"}, - {"refId": "D", "expr": CLIMATE_FAN_INTERIOR_SERIES, "legendFormat": "Interior"}, + {"refId": "A", "expr": CLIMATE_FAN_OUTLET_SERIES, "legendFormat": "Inside Outlet"}, + {"refId": "B", "expr": CLIMATE_FAN_INSIDE_INLET_SERIES, "legendFormat": "Inside Inlet"}, + {"refId": "C", "expr": CLIMATE_FAN_OUTSIDE_INLET_SERIES, "legendFormat": "Outside Inlet"}, + {"refId": "D", "expr": CLIMATE_FAN_INTERIOR_SERIES, "legendFormat": "Interior Fans"}, ], legend_display="list", legend_placement="bottom", @@ -3223,10 +3223,10 @@ def build_power_dashboard(): decimals=0, text_mode="name_and_value", targets=[ - {"refId": "A", "expr": f"round({CLIMATE_FAN_OUTLET_CURRENT})", "legendFormat": "Outlet", "instant": True}, - {"refId": "B", "expr": f"round({CLIMATE_FAN_INSIDE_INLET_CURRENT})", "legendFormat": "In Inlet", "instant": True}, - {"refId": "C", "expr": f"round({CLIMATE_FAN_OUTSIDE_INLET_CURRENT})", "legendFormat": "Out Inlet", "instant": True}, - {"refId": "D", "expr": f"round({CLIMATE_FAN_INTERIOR_CURRENT})", "legendFormat": "Interior", "instant": True}, + {"refId": "A", "expr": f"round({CLIMATE_FAN_OUTLET_CURRENT})", "legendFormat": "Inside Outlet", "instant": True}, + {"refId": "B", "expr": f"round({CLIMATE_FAN_INSIDE_INLET_CURRENT})", "legendFormat": "Inside Inlet", "instant": True}, + {"refId": "C", "expr": f"round({CLIMATE_FAN_OUTSIDE_INLET_CURRENT})", "legendFormat": "Outside Inlet", "instant": True}, + {"refId": "D", "expr": f"round({CLIMATE_FAN_INTERIOR_CURRENT})", "legendFormat": "Interior Fans", "instant": True}, 
], thresholds={ "mode": "absolute", @@ -3236,7 +3236,7 @@ def build_power_dashboard(): {"color": "red", "value": 9}, ], }, - description="Current fan activity levels (0-10): outlet, inside inlet, outside inlet, and interior.", + description="Current fan activity levels (0-10): inside outlet, inside inlet, outside inlet, and interior fans.", ) ) panels.append( @@ -3248,10 +3248,10 @@ def build_power_dashboard(): unit="none", max_value=10, targets=[ - {"refId": "A", "expr": CLIMATE_FAN_OUTLET_SERIES, "legendFormat": "Outlet"}, - {"refId": "B", "expr": CLIMATE_FAN_INSIDE_INLET_SERIES, "legendFormat": "In Inlet"}, - {"refId": "C", "expr": CLIMATE_FAN_OUTSIDE_INLET_SERIES, "legendFormat": "Out Inlet"}, - {"refId": "D", "expr": CLIMATE_FAN_INTERIOR_SERIES, "legendFormat": "Interior"}, + {"refId": "A", "expr": CLIMATE_FAN_OUTLET_SERIES, "legendFormat": "Inside Outlet"}, + {"refId": "B", "expr": CLIMATE_FAN_INSIDE_INLET_SERIES, "legendFormat": "Inside Inlet"}, + {"refId": "C", "expr": CLIMATE_FAN_OUTSIDE_INLET_SERIES, "legendFormat": "Outside Inlet"}, + {"refId": "D", "expr": CLIMATE_FAN_INTERIOR_SERIES, "legendFormat": "Interior Fans"}, ], legend_display="table", legend_placement="right", diff --git a/services/monitoring/dashboards/atlas-overview.json b/services/monitoring/dashboards/atlas-overview.json index 72dedcf4..a9a925d8 100644 --- a/services/monitoring/dashboards/atlas-overview.json +++ b/services/monitoring/dashboards/atlas-overview.json @@ -1542,25 +1542,25 @@ { "refId": "A", "expr": "round(max(atlas_climate_fan_activity_level{fan_group=\"outlet\"}) or max(atlas_climate_fan_activity_level{position=\"outlet\"}) or on() vector(0))", - "legendFormat": "Outlet", + "legendFormat": "Inside Outlet", "instant": true }, { "refId": "B", "expr": "round(max(atlas_climate_fan_activity_level{fan_group=\"inside_inlet\"}) or max(atlas_climate_fan_activity_level{position=\"inside_inlet\"}) or on() vector(0))", - "legendFormat": "In Inlet", + "legendFormat": "Inside Inlet", 
"instant": true }, { "refId": "C", "expr": "round(max(atlas_climate_fan_activity_level{fan_group=\"outside_inlet\"}) or max(atlas_climate_fan_activity_level{position=\"outside_inlet\"}) or on() vector(0))", - "legendFormat": "Out Inlet", + "legendFormat": "Outside Inlet", "instant": true }, { "refId": "D", "expr": "round(max(atlas_climate_fan_activity_level{fan_group=\"interior\"}) or max(atlas_climate_fan_activity_level{position=\"interior\"}) or on() vector(0))", - "legendFormat": "Interior", + "legendFormat": "Interior Fans", "instant": true } ], @@ -1634,22 +1634,22 @@ { "refId": "A", "expr": "(atlas_climate_fan_activity_level{fan_group=\"outlet\"} or atlas_climate_fan_activity_level{position=\"outlet\"})", - "legendFormat": "Outlet" + "legendFormat": "Inside Outlet" }, { "refId": "B", "expr": "(atlas_climate_fan_activity_level{fan_group=\"inside_inlet\"} or atlas_climate_fan_activity_level{position=\"inside_inlet\"})", - "legendFormat": "In Inlet" + "legendFormat": "Inside Inlet" }, { "refId": "C", "expr": "(atlas_climate_fan_activity_level{fan_group=\"outside_inlet\"} or atlas_climate_fan_activity_level{position=\"outside_inlet\"})", - "legendFormat": "Out Inlet" + "legendFormat": "Outside Inlet" }, { "refId": "D", "expr": "(atlas_climate_fan_activity_level{fan_group=\"interior\"} or atlas_climate_fan_activity_level{position=\"interior\"})", - "legendFormat": "Interior" + "legendFormat": "Interior Fans" } ], "fieldConfig": { diff --git a/services/monitoring/dashboards/atlas-power.json b/services/monitoring/dashboards/atlas-power.json index c094cef3..9a8cceeb 100644 --- a/services/monitoring/dashboards/atlas-power.json +++ b/services/monitoring/dashboards/atlas-power.json @@ -448,25 +448,25 @@ { "refId": "A", "expr": "round(max(atlas_climate_fan_activity_level{fan_group=\"outlet\"}) or max(atlas_climate_fan_activity_level{position=\"outlet\"}) or on() vector(0))", - "legendFormat": "Outlet", + "legendFormat": "Inside Outlet", "instant": true }, { "refId": "B", 
"expr": "round(max(atlas_climate_fan_activity_level{fan_group=\"inside_inlet\"}) or max(atlas_climate_fan_activity_level{position=\"inside_inlet\"}) or on() vector(0))", - "legendFormat": "In Inlet", + "legendFormat": "Inside Inlet", "instant": true }, { "refId": "C", "expr": "round(max(atlas_climate_fan_activity_level{fan_group=\"outside_inlet\"}) or max(atlas_climate_fan_activity_level{position=\"outside_inlet\"}) or on() vector(0))", - "legendFormat": "Out Inlet", + "legendFormat": "Outside Inlet", "instant": true }, { "refId": "D", "expr": "round(max(atlas_climate_fan_activity_level{fan_group=\"interior\"}) or max(atlas_climate_fan_activity_level{position=\"interior\"}) or on() vector(0))", - "legendFormat": "Interior", + "legendFormat": "Interior Fans", "instant": true } ], @@ -514,7 +514,7 @@ }, "textMode": "name_and_value" }, - "description": "Current fan activity levels (0-10): outlet, inside inlet, outside inlet, and interior." + "description": "Current fan activity levels (0-10): inside outlet, inside inlet, outside inlet, and interior fans." 
}, { "id": 6, @@ -534,22 +534,22 @@ { "refId": "A", "expr": "(atlas_climate_fan_activity_level{fan_group=\"outlet\"} or atlas_climate_fan_activity_level{position=\"outlet\"})", - "legendFormat": "Outlet" + "legendFormat": "Inside Outlet" }, { "refId": "B", "expr": "(atlas_climate_fan_activity_level{fan_group=\"inside_inlet\"} or atlas_climate_fan_activity_level{position=\"inside_inlet\"})", - "legendFormat": "In Inlet" + "legendFormat": "Inside Inlet" }, { "refId": "C", "expr": "(atlas_climate_fan_activity_level{fan_group=\"outside_inlet\"} or atlas_climate_fan_activity_level{position=\"outside_inlet\"})", - "legendFormat": "Out Inlet" + "legendFormat": "Outside Inlet" }, { "refId": "D", "expr": "(atlas_climate_fan_activity_level{fan_group=\"interior\"} or atlas_climate_fan_activity_level{position=\"interior\"})", - "legendFormat": "Interior" + "legendFormat": "Interior Fans" } ], "fieldConfig": { diff --git a/services/monitoring/grafana-dashboard-overview.yaml b/services/monitoring/grafana-dashboard-overview.yaml index a72b37f8..673347d5 100644 --- a/services/monitoring/grafana-dashboard-overview.yaml +++ b/services/monitoring/grafana-dashboard-overview.yaml @@ -1551,25 +1551,25 @@ data: { "refId": "A", "expr": "round(max(atlas_climate_fan_activity_level{fan_group=\"outlet\"}) or max(atlas_climate_fan_activity_level{position=\"outlet\"}) or on() vector(0))", - "legendFormat": "Outlet", + "legendFormat": "Inside Outlet", "instant": true }, { "refId": "B", "expr": "round(max(atlas_climate_fan_activity_level{fan_group=\"inside_inlet\"}) or max(atlas_climate_fan_activity_level{position=\"inside_inlet\"}) or on() vector(0))", - "legendFormat": "In Inlet", + "legendFormat": "Inside Inlet", "instant": true }, { "refId": "C", "expr": "round(max(atlas_climate_fan_activity_level{fan_group=\"outside_inlet\"}) or max(atlas_climate_fan_activity_level{position=\"outside_inlet\"}) or on() vector(0))", - "legendFormat": "Out Inlet", + "legendFormat": "Outside Inlet", "instant": 
true }, { "refId": "D", "expr": "round(max(atlas_climate_fan_activity_level{fan_group=\"interior\"}) or max(atlas_climate_fan_activity_level{position=\"interior\"}) or on() vector(0))", - "legendFormat": "Interior", + "legendFormat": "Interior Fans", "instant": true } ], @@ -1643,22 +1643,22 @@ data: { "refId": "A", "expr": "(atlas_climate_fan_activity_level{fan_group=\"outlet\"} or atlas_climate_fan_activity_level{position=\"outlet\"})", - "legendFormat": "Outlet" + "legendFormat": "Inside Outlet" }, { "refId": "B", "expr": "(atlas_climate_fan_activity_level{fan_group=\"inside_inlet\"} or atlas_climate_fan_activity_level{position=\"inside_inlet\"})", - "legendFormat": "In Inlet" + "legendFormat": "Inside Inlet" }, { "refId": "C", "expr": "(atlas_climate_fan_activity_level{fan_group=\"outside_inlet\"} or atlas_climate_fan_activity_level{position=\"outside_inlet\"})", - "legendFormat": "Out Inlet" + "legendFormat": "Outside Inlet" }, { "refId": "D", "expr": "(atlas_climate_fan_activity_level{fan_group=\"interior\"} or atlas_climate_fan_activity_level{position=\"interior\"})", - "legendFormat": "Interior" + "legendFormat": "Interior Fans" } ], "fieldConfig": { diff --git a/services/monitoring/grafana-dashboard-power.yaml b/services/monitoring/grafana-dashboard-power.yaml index 99884d29..3882c4bc 100644 --- a/services/monitoring/grafana-dashboard-power.yaml +++ b/services/monitoring/grafana-dashboard-power.yaml @@ -457,25 +457,25 @@ data: { "refId": "A", "expr": "round(max(atlas_climate_fan_activity_level{fan_group=\"outlet\"}) or max(atlas_climate_fan_activity_level{position=\"outlet\"}) or on() vector(0))", - "legendFormat": "Outlet", + "legendFormat": "Inside Outlet", "instant": true }, { "refId": "B", "expr": "round(max(atlas_climate_fan_activity_level{fan_group=\"inside_inlet\"}) or max(atlas_climate_fan_activity_level{position=\"inside_inlet\"}) or on() vector(0))", - "legendFormat": "In Inlet", + "legendFormat": "Inside Inlet", "instant": true }, { "refId": 
"C", "expr": "round(max(atlas_climate_fan_activity_level{fan_group=\"outside_inlet\"}) or max(atlas_climate_fan_activity_level{position=\"outside_inlet\"}) or on() vector(0))", - "legendFormat": "Out Inlet", + "legendFormat": "Outside Inlet", "instant": true }, { "refId": "D", "expr": "round(max(atlas_climate_fan_activity_level{fan_group=\"interior\"}) or max(atlas_climate_fan_activity_level{position=\"interior\"}) or on() vector(0))", - "legendFormat": "Interior", + "legendFormat": "Interior Fans", "instant": true } ], @@ -523,7 +523,7 @@ data: }, "textMode": "name_and_value" }, - "description": "Current fan activity levels (0-10): outlet, inside inlet, outside inlet, and interior." + "description": "Current fan activity levels (0-10): inside outlet, inside inlet, outside inlet, and interior fans." }, { "id": 6, @@ -543,22 +543,22 @@ data: { "refId": "A", "expr": "(atlas_climate_fan_activity_level{fan_group=\"outlet\"} or atlas_climate_fan_activity_level{position=\"outlet\"})", - "legendFormat": "Outlet" + "legendFormat": "Inside Outlet" }, { "refId": "B", "expr": "(atlas_climate_fan_activity_level{fan_group=\"inside_inlet\"} or atlas_climate_fan_activity_level{position=\"inside_inlet\"})", - "legendFormat": "In Inlet" + "legendFormat": "Inside Inlet" }, { "refId": "C", "expr": "(atlas_climate_fan_activity_level{fan_group=\"outside_inlet\"} or atlas_climate_fan_activity_level{position=\"outside_inlet\"})", - "legendFormat": "Out Inlet" + "legendFormat": "Outside Inlet" }, { "refId": "D", "expr": "(atlas_climate_fan_activity_level{fan_group=\"interior\"} or atlas_climate_fan_activity_level{position=\"interior\"})", - "legendFormat": "Interior" + "legendFormat": "Interior Fans" } ], "fieldConfig": { diff --git a/services/monitoring/kustomization.yaml b/services/monitoring/kustomization.yaml index ab81b768..67580f60 100644 --- a/services/monitoring/kustomization.yaml +++ b/services/monitoring/kustomization.yaml @@ -23,6 +23,7 @@ resources: - 
platform-quality-gateway-pvc.yaml - platform-quality-gateway-service.yaml - platform-quality-gateway-deployment.yaml + - platform-quality-suite-probe-cronjob.yaml - vault-sync-deployment.yaml - grafana-alerting-config.yaml - grafana-folders.yaml @@ -49,3 +50,9 @@ configMapGenerator: - scripts/vault-entrypoint.sh options: disableNameSuffixHash: true + - name: platform-quality-suite-probe-script + namespace: monitoring + files: + - platform_quality_suite_probe.sh=scripts/platform_quality_suite_probe.sh + options: + disableNameSuffixHash: true diff --git a/services/monitoring/platform-quality-suite-probe-cronjob.yaml b/services/monitoring/platform-quality-suite-probe-cronjob.yaml new file mode 100644 index 00000000..8685ef11 --- /dev/null +++ b/services/monitoring/platform-quality-suite-probe-cronjob.yaml @@ -0,0 +1,39 @@ +# services/monitoring/platform-quality-suite-probe-cronjob.yaml +apiVersion: batch/v1 +kind: CronJob +metadata: + name: platform-quality-suite-probe + namespace: monitoring +spec: + schedule: "*/15 * * * *" + concurrencyPolicy: Forbid + successfulJobsHistoryLimit: 2 + failedJobsHistoryLimit: 2 + jobTemplate: + spec: + backoffLimit: 0 + template: + metadata: + labels: + app: platform-quality-suite-probe + spec: + restartPolicy: Never + containers: + - name: probe + image: curlimages/curl:8.12.1 + imagePullPolicy: IfNotPresent + command: ["/bin/sh", "/scripts/platform_quality_suite_probe.sh"] + env: + - name: PUSHGATEWAY_URL + value: http://platform-quality-gateway.monitoring.svc.cluster.local:9091 + - name: HTTP_TIMEOUT_SECONDS + value: "12" + volumeMounts: + - name: probe-script + mountPath: /scripts + readOnly: true + volumes: + - name: probe-script + configMap: + name: platform-quality-suite-probe-script + defaultMode: 0555 diff --git a/services/monitoring/scripts/platform_quality_suite_probe.sh b/services/monitoring/scripts/platform_quality_suite_probe.sh new file mode 100755 index 00000000..0991d676 --- /dev/null +++ 
b/services/monitoring/scripts/platform_quality_suite_probe.sh
@@ -0,0 +1,105 @@
+#!/usr/bin/env sh
+# Probe in-cluster suite health endpoints and record each result as a
+# platform_quality_gate_runs_total counter on the Pushgateway.
+set -eu
+
+PUSHGATEWAY_URL="${PUSHGATEWAY_URL:-http://platform-quality-gateway.monitoring.svc.cluster.local:9091}"
+HTTP_TIMEOUT_SECONDS="${HTTP_TIMEOUT_SECONDS:-12}"
+# Job segment of the Pushgateway grouping key used for the probe counters.
+# NOTE(review): the default below is an assumption; confirm it matches the job
+# label the dashboards and alerts select on.
+PUSHGATEWAY_JOB="${PUSHGATEWAY_JOB:-platform-quality-suite-probe}"
+
+# fetch_counter SUITE STATUS
+# Print the current value of platform_quality_gate_runs_total for the given
+# suite and status within PUSHGATEWAY_JOB, or 0 when no such series is found
+# (including when the Pushgateway cannot be read).
+fetch_counter() {
+  suite="$1"
+  status="$2"
+  # -m bounds the read so a hung gateway cannot stall the run; "|| true" keeps
+  # "set -e" from aborting when the pipeline fails. printf "%d" yields an
+  # integer that is safe for the shell arithmetic in push_suite_counters.
+  value="$(curl -fsS -m "${HTTP_TIMEOUT_SECONDS}" "${PUSHGATEWAY_URL}/metrics" 2>/dev/null | awk -v job="${PUSHGATEWAY_JOB}" -v suite="$suite" -v status="$status" '
+    /^platform_quality_gate_runs_total\{/ {
+      if (index($0, "job=\"" job "\"") && index($0, "suite=\"" suite "\"") && index($0, "status=\"" status "\"")) {
+        printf "%d\n", $NF
+        exit
+      }
+    }
+  ' || true)"
+  printf '%s\n' "${value:-0}"
+}
+
+# push_suite_counters SUITE OUTCOME
+# Add one to the suite's "ok" or "failed" counter (any OUTCOME other than "ok"
+# counts as failed) and push both series to the Pushgateway.
+push_suite_counters() {
+  suite="$1"
+  outcome="$2"
+
+  ok_count="$(fetch_counter "${suite}" "ok")"
+  failed_count="$(fetch_counter "${suite}" "failed")"
+
+  if [ "${outcome}" = "ok" ]; then
+    ok_count=$((ok_count + 1))
+  else
+    failed_count=$((failed_count + 1))
+  fi
+
+  # The heredoc is the exposition-format body for the push. A failed push is
+  # logged and otherwise ignored so the remaining suites are still probed.
+  cat <<METRICS | curl -fsS -m "${HTTP_TIMEOUT_SECONDS}" --data-binary @- "${PUSHGATEWAY_URL}/metrics/job/${PUSHGATEWAY_JOB}/suite/${suite}" >/dev/null || printf '[probe] suite=%s metrics push failed\n' "${suite}" >&2
+# TYPE platform_quality_gate_runs_total counter
+platform_quality_gate_runs_total{suite="${suite}",status="ok"} ${ok_count}
+platform_quality_gate_runs_total{suite="${suite}",status="failed"} ${failed_count}
+METRICS
+}
+
+# check_http_suite SUITE URL EXPECTED_CODE [BODY_MATCH]
+# GET URL and treat the suite as ok when the status code equals EXPECTED_CODE
+# and, if BODY_MATCH is given, the response body matches it (grep pattern).
+# Records the outcome via push_suite_counters; returns 0 on ok, 1 otherwise.
+check_http_suite() {
+  suite="$1"
+  url="$2"
+  expected_code="$3"
+  body_match="${4:-}"
+
+  body_file="$(mktemp)"
+  # "|| true": a transport error leaves a non-matching code instead of aborting.
+  code="$(curl -ksS -m "${HTTP_TIMEOUT_SECONDS}" -o "${body_file}" -w '%{http_code}' "${url}" || true)"
+
+  outcome="failed"
+  if [ "${code}" = "${expected_code}" ]; then
+    if [ -z "${body_match}" ] || grep -q -- "${body_match}" "${body_file}"; then
+      outcome="ok"
+    fi
+  fi
+
+  rm -f "${body_file}"
+  push_suite_counters "${suite}" "${outcome}"
+
+  if [ "${outcome}" = "ok" ]; then
+    printf '[probe] suite=%s outcome=ok url=%s\n' "${suite}" "${url}"
+    return 0
+  fi
+
+  printf '[probe] suite=%s outcome=failed url=%s code=%s\n' "${suite}" "${url}" "${code}" >&2
+  return 1
+}
+
+failures=0
+
+check_http_suite "atlasbot" "http://atlasbot.comms.svc.cluster.local:8090/health" "200" '"status": "ok"' || failures=$((failures + 1))
+check_http_suite "pegasus" "http://pegasus.jellyfin.svc.cluster.local/healthz" "200" || failures=$((failures + 1))
+check_http_suite "bstein-home" "http://bstein-dev-home-backend.bstein-dev-home.svc.cluster.local/api/healthz" "200" || failures=$((failures + 1))
+
+if [ "${failures}" -gt 0 ]; then
+  printf '[probe] completed with %s suite failure(s)\n' "${failures}" >&2
+else
+  printf '[probe] completed with all suites passing\n'
+fi
+
+# Report failures through metrics, not Job failure retries.
+exit 0