monitoring: add suite probe metrics and align fan labels

This commit is contained in:
Brad Stein 2026-04-09 20:10:52 -03:00
parent 5e3aadc640
commit 530f440679
8 changed files with 183 additions and 51 deletions

View File

@ -1468,10 +1468,10 @@ def build_overview():
decimals=0,
text_mode="name_and_value",
targets=[
{"refId": "A", "expr": f"round({CLIMATE_FAN_OUTLET_CURRENT})", "legendFormat": "Outlet", "instant": True},
{"refId": "B", "expr": f"round({CLIMATE_FAN_INSIDE_INLET_CURRENT})", "legendFormat": "In Inlet", "instant": True},
{"refId": "C", "expr": f"round({CLIMATE_FAN_OUTSIDE_INLET_CURRENT})", "legendFormat": "Out Inlet", "instant": True},
{"refId": "D", "expr": f"round({CLIMATE_FAN_INTERIOR_CURRENT})", "legendFormat": "Interior", "instant": True},
{"refId": "A", "expr": f"round({CLIMATE_FAN_OUTLET_CURRENT})", "legendFormat": "Inside Outlet", "instant": True},
{"refId": "B", "expr": f"round({CLIMATE_FAN_INSIDE_INLET_CURRENT})", "legendFormat": "Inside Inlet", "instant": True},
{"refId": "C", "expr": f"round({CLIMATE_FAN_OUTSIDE_INLET_CURRENT})", "legendFormat": "Outside Inlet", "instant": True},
{"refId": "D", "expr": f"round({CLIMATE_FAN_INTERIOR_CURRENT})", "legendFormat": "Interior Fans", "instant": True},
],
thresholds={
"mode": "absolute",
@ -1493,10 +1493,10 @@ def build_overview():
unit="none",
max_value=10,
targets=[
{"refId": "A", "expr": CLIMATE_FAN_OUTLET_SERIES, "legendFormat": "Outlet"},
{"refId": "B", "expr": CLIMATE_FAN_INSIDE_INLET_SERIES, "legendFormat": "In Inlet"},
{"refId": "C", "expr": CLIMATE_FAN_OUTSIDE_INLET_SERIES, "legendFormat": "Out Inlet"},
{"refId": "D", "expr": CLIMATE_FAN_INTERIOR_SERIES, "legendFormat": "Interior"},
{"refId": "A", "expr": CLIMATE_FAN_OUTLET_SERIES, "legendFormat": "Inside Outlet"},
{"refId": "B", "expr": CLIMATE_FAN_INSIDE_INLET_SERIES, "legendFormat": "Inside Inlet"},
{"refId": "C", "expr": CLIMATE_FAN_OUTSIDE_INLET_SERIES, "legendFormat": "Outside Inlet"},
{"refId": "D", "expr": CLIMATE_FAN_INTERIOR_SERIES, "legendFormat": "Interior Fans"},
],
legend_display="list",
legend_placement="bottom",
@ -3223,10 +3223,10 @@ def build_power_dashboard():
decimals=0,
text_mode="name_and_value",
targets=[
{"refId": "A", "expr": f"round({CLIMATE_FAN_OUTLET_CURRENT})", "legendFormat": "Outlet", "instant": True},
{"refId": "B", "expr": f"round({CLIMATE_FAN_INSIDE_INLET_CURRENT})", "legendFormat": "In Inlet", "instant": True},
{"refId": "C", "expr": f"round({CLIMATE_FAN_OUTSIDE_INLET_CURRENT})", "legendFormat": "Out Inlet", "instant": True},
{"refId": "D", "expr": f"round({CLIMATE_FAN_INTERIOR_CURRENT})", "legendFormat": "Interior", "instant": True},
{"refId": "A", "expr": f"round({CLIMATE_FAN_OUTLET_CURRENT})", "legendFormat": "Inside Outlet", "instant": True},
{"refId": "B", "expr": f"round({CLIMATE_FAN_INSIDE_INLET_CURRENT})", "legendFormat": "Inside Inlet", "instant": True},
{"refId": "C", "expr": f"round({CLIMATE_FAN_OUTSIDE_INLET_CURRENT})", "legendFormat": "Outside Inlet", "instant": True},
{"refId": "D", "expr": f"round({CLIMATE_FAN_INTERIOR_CURRENT})", "legendFormat": "Interior Fans", "instant": True},
],
thresholds={
"mode": "absolute",
@ -3236,7 +3236,7 @@ def build_power_dashboard():
{"color": "red", "value": 9},
],
},
description="Current fan activity levels (0-10): outlet, inside inlet, outside inlet, and interior.",
description="Current fan activity levels (0-10): inside outlet, inside inlet, outside inlet, and interior fans.",
)
)
panels.append(
@ -3248,10 +3248,10 @@ def build_power_dashboard():
unit="none",
max_value=10,
targets=[
{"refId": "A", "expr": CLIMATE_FAN_OUTLET_SERIES, "legendFormat": "Outlet"},
{"refId": "B", "expr": CLIMATE_FAN_INSIDE_INLET_SERIES, "legendFormat": "In Inlet"},
{"refId": "C", "expr": CLIMATE_FAN_OUTSIDE_INLET_SERIES, "legendFormat": "Out Inlet"},
{"refId": "D", "expr": CLIMATE_FAN_INTERIOR_SERIES, "legendFormat": "Interior"},
{"refId": "A", "expr": CLIMATE_FAN_OUTLET_SERIES, "legendFormat": "Inside Outlet"},
{"refId": "B", "expr": CLIMATE_FAN_INSIDE_INLET_SERIES, "legendFormat": "Inside Inlet"},
{"refId": "C", "expr": CLIMATE_FAN_OUTSIDE_INLET_SERIES, "legendFormat": "Outside Inlet"},
{"refId": "D", "expr": CLIMATE_FAN_INTERIOR_SERIES, "legendFormat": "Interior Fans"},
],
legend_display="table",
legend_placement="right",

View File

@ -1542,25 +1542,25 @@
{
"refId": "A",
"expr": "round(max(atlas_climate_fan_activity_level{fan_group=\"outlet\"}) or max(atlas_climate_fan_activity_level{position=\"outlet\"}) or on() vector(0))",
"legendFormat": "Outlet",
"legendFormat": "Inside Outlet",
"instant": true
},
{
"refId": "B",
"expr": "round(max(atlas_climate_fan_activity_level{fan_group=\"inside_inlet\"}) or max(atlas_climate_fan_activity_level{position=\"inside_inlet\"}) or on() vector(0))",
"legendFormat": "In Inlet",
"legendFormat": "Inside Inlet",
"instant": true
},
{
"refId": "C",
"expr": "round(max(atlas_climate_fan_activity_level{fan_group=\"outside_inlet\"}) or max(atlas_climate_fan_activity_level{position=\"outside_inlet\"}) or on() vector(0))",
"legendFormat": "Out Inlet",
"legendFormat": "Outside Inlet",
"instant": true
},
{
"refId": "D",
"expr": "round(max(atlas_climate_fan_activity_level{fan_group=\"interior\"}) or max(atlas_climate_fan_activity_level{position=\"interior\"}) or on() vector(0))",
"legendFormat": "Interior",
"legendFormat": "Interior Fans",
"instant": true
}
],
@ -1634,22 +1634,22 @@
{
"refId": "A",
"expr": "(atlas_climate_fan_activity_level{fan_group=\"outlet\"} or atlas_climate_fan_activity_level{position=\"outlet\"})",
"legendFormat": "Outlet"
"legendFormat": "Inside Outlet"
},
{
"refId": "B",
"expr": "(atlas_climate_fan_activity_level{fan_group=\"inside_inlet\"} or atlas_climate_fan_activity_level{position=\"inside_inlet\"})",
"legendFormat": "In Inlet"
"legendFormat": "Inside Inlet"
},
{
"refId": "C",
"expr": "(atlas_climate_fan_activity_level{fan_group=\"outside_inlet\"} or atlas_climate_fan_activity_level{position=\"outside_inlet\"})",
"legendFormat": "Out Inlet"
"legendFormat": "Outside Inlet"
},
{
"refId": "D",
"expr": "(atlas_climate_fan_activity_level{fan_group=\"interior\"} or atlas_climate_fan_activity_level{position=\"interior\"})",
"legendFormat": "Interior"
"legendFormat": "Interior Fans"
}
],
"fieldConfig": {

View File

@ -448,25 +448,25 @@
{
"refId": "A",
"expr": "round(max(atlas_climate_fan_activity_level{fan_group=\"outlet\"}) or max(atlas_climate_fan_activity_level{position=\"outlet\"}) or on() vector(0))",
"legendFormat": "Outlet",
"legendFormat": "Inside Outlet",
"instant": true
},
{
"refId": "B",
"expr": "round(max(atlas_climate_fan_activity_level{fan_group=\"inside_inlet\"}) or max(atlas_climate_fan_activity_level{position=\"inside_inlet\"}) or on() vector(0))",
"legendFormat": "In Inlet",
"legendFormat": "Inside Inlet",
"instant": true
},
{
"refId": "C",
"expr": "round(max(atlas_climate_fan_activity_level{fan_group=\"outside_inlet\"}) or max(atlas_climate_fan_activity_level{position=\"outside_inlet\"}) or on() vector(0))",
"legendFormat": "Out Inlet",
"legendFormat": "Outside Inlet",
"instant": true
},
{
"refId": "D",
"expr": "round(max(atlas_climate_fan_activity_level{fan_group=\"interior\"}) or max(atlas_climate_fan_activity_level{position=\"interior\"}) or on() vector(0))",
"legendFormat": "Interior",
"legendFormat": "Interior Fans",
"instant": true
}
],
@ -514,7 +514,7 @@
},
"textMode": "name_and_value"
},
"description": "Current fan activity levels (0-10): outlet, inside inlet, outside inlet, and interior."
"description": "Current fan activity levels (0-10): inside outlet, inside inlet, outside inlet, and interior fans."
},
{
"id": 6,
@ -534,22 +534,22 @@
{
"refId": "A",
"expr": "(atlas_climate_fan_activity_level{fan_group=\"outlet\"} or atlas_climate_fan_activity_level{position=\"outlet\"})",
"legendFormat": "Outlet"
"legendFormat": "Inside Outlet"
},
{
"refId": "B",
"expr": "(atlas_climate_fan_activity_level{fan_group=\"inside_inlet\"} or atlas_climate_fan_activity_level{position=\"inside_inlet\"})",
"legendFormat": "In Inlet"
"legendFormat": "Inside Inlet"
},
{
"refId": "C",
"expr": "(atlas_climate_fan_activity_level{fan_group=\"outside_inlet\"} or atlas_climate_fan_activity_level{position=\"outside_inlet\"})",
"legendFormat": "Out Inlet"
"legendFormat": "Outside Inlet"
},
{
"refId": "D",
"expr": "(atlas_climate_fan_activity_level{fan_group=\"interior\"} or atlas_climate_fan_activity_level{position=\"interior\"})",
"legendFormat": "Interior"
"legendFormat": "Interior Fans"
}
],
"fieldConfig": {

View File

@ -1551,25 +1551,25 @@ data:
{
"refId": "A",
"expr": "round(max(atlas_climate_fan_activity_level{fan_group=\"outlet\"}) or max(atlas_climate_fan_activity_level{position=\"outlet\"}) or on() vector(0))",
"legendFormat": "Outlet",
"legendFormat": "Inside Outlet",
"instant": true
},
{
"refId": "B",
"expr": "round(max(atlas_climate_fan_activity_level{fan_group=\"inside_inlet\"}) or max(atlas_climate_fan_activity_level{position=\"inside_inlet\"}) or on() vector(0))",
"legendFormat": "In Inlet",
"legendFormat": "Inside Inlet",
"instant": true
},
{
"refId": "C",
"expr": "round(max(atlas_climate_fan_activity_level{fan_group=\"outside_inlet\"}) or max(atlas_climate_fan_activity_level{position=\"outside_inlet\"}) or on() vector(0))",
"legendFormat": "Out Inlet",
"legendFormat": "Outside Inlet",
"instant": true
},
{
"refId": "D",
"expr": "round(max(atlas_climate_fan_activity_level{fan_group=\"interior\"}) or max(atlas_climate_fan_activity_level{position=\"interior\"}) or on() vector(0))",
"legendFormat": "Interior",
"legendFormat": "Interior Fans",
"instant": true
}
],
@ -1643,22 +1643,22 @@ data:
{
"refId": "A",
"expr": "(atlas_climate_fan_activity_level{fan_group=\"outlet\"} or atlas_climate_fan_activity_level{position=\"outlet\"})",
"legendFormat": "Outlet"
"legendFormat": "Inside Outlet"
},
{
"refId": "B",
"expr": "(atlas_climate_fan_activity_level{fan_group=\"inside_inlet\"} or atlas_climate_fan_activity_level{position=\"inside_inlet\"})",
"legendFormat": "In Inlet"
"legendFormat": "Inside Inlet"
},
{
"refId": "C",
"expr": "(atlas_climate_fan_activity_level{fan_group=\"outside_inlet\"} or atlas_climate_fan_activity_level{position=\"outside_inlet\"})",
"legendFormat": "Out Inlet"
"legendFormat": "Outside Inlet"
},
{
"refId": "D",
"expr": "(atlas_climate_fan_activity_level{fan_group=\"interior\"} or atlas_climate_fan_activity_level{position=\"interior\"})",
"legendFormat": "Interior"
"legendFormat": "Interior Fans"
}
],
"fieldConfig": {

View File

@ -457,25 +457,25 @@ data:
{
"refId": "A",
"expr": "round(max(atlas_climate_fan_activity_level{fan_group=\"outlet\"}) or max(atlas_climate_fan_activity_level{position=\"outlet\"}) or on() vector(0))",
"legendFormat": "Outlet",
"legendFormat": "Inside Outlet",
"instant": true
},
{
"refId": "B",
"expr": "round(max(atlas_climate_fan_activity_level{fan_group=\"inside_inlet\"}) or max(atlas_climate_fan_activity_level{position=\"inside_inlet\"}) or on() vector(0))",
"legendFormat": "In Inlet",
"legendFormat": "Inside Inlet",
"instant": true
},
{
"refId": "C",
"expr": "round(max(atlas_climate_fan_activity_level{fan_group=\"outside_inlet\"}) or max(atlas_climate_fan_activity_level{position=\"outside_inlet\"}) or on() vector(0))",
"legendFormat": "Out Inlet",
"legendFormat": "Outside Inlet",
"instant": true
},
{
"refId": "D",
"expr": "round(max(atlas_climate_fan_activity_level{fan_group=\"interior\"}) or max(atlas_climate_fan_activity_level{position=\"interior\"}) or on() vector(0))",
"legendFormat": "Interior",
"legendFormat": "Interior Fans",
"instant": true
}
],
@ -523,7 +523,7 @@ data:
},
"textMode": "name_and_value"
},
"description": "Current fan activity levels (0-10): outlet, inside inlet, outside inlet, and interior."
"description": "Current fan activity levels (0-10): inside outlet, inside inlet, outside inlet, and interior fans."
},
{
"id": 6,
@ -543,22 +543,22 @@ data:
{
"refId": "A",
"expr": "(atlas_climate_fan_activity_level{fan_group=\"outlet\"} or atlas_climate_fan_activity_level{position=\"outlet\"})",
"legendFormat": "Outlet"
"legendFormat": "Inside Outlet"
},
{
"refId": "B",
"expr": "(atlas_climate_fan_activity_level{fan_group=\"inside_inlet\"} or atlas_climate_fan_activity_level{position=\"inside_inlet\"})",
"legendFormat": "In Inlet"
"legendFormat": "Inside Inlet"
},
{
"refId": "C",
"expr": "(atlas_climate_fan_activity_level{fan_group=\"outside_inlet\"} or atlas_climate_fan_activity_level{position=\"outside_inlet\"})",
"legendFormat": "Out Inlet"
"legendFormat": "Outside Inlet"
},
{
"refId": "D",
"expr": "(atlas_climate_fan_activity_level{fan_group=\"interior\"} or atlas_climate_fan_activity_level{position=\"interior\"})",
"legendFormat": "Interior"
"legendFormat": "Interior Fans"
}
],
"fieldConfig": {

View File

@ -23,6 +23,7 @@ resources:
- platform-quality-gateway-pvc.yaml
- platform-quality-gateway-service.yaml
- platform-quality-gateway-deployment.yaml
- platform-quality-suite-probe-cronjob.yaml
- vault-sync-deployment.yaml
- grafana-alerting-config.yaml
- grafana-folders.yaml
@ -49,3 +50,9 @@ configMapGenerator:
- scripts/vault-entrypoint.sh
options:
disableNameSuffixHash: true
- name: platform-quality-suite-probe-script
namespace: monitoring
files:
- platform_quality_suite_probe.sh=scripts/platform_quality_suite_probe.sh
options:
disableNameSuffixHash: true

View File

@ -0,0 +1,39 @@
# services/monitoring/platform-quality-suite-probe-cronjob.yaml
#
# Runs the suite probe script every 15 minutes. The script is mounted from the
# platform-quality-suite-probe-script ConfigMap and pushes its results to the
# Pushgateway named in PUSHGATEWAY_URL.
apiVersion: batch/v1
kind: CronJob
metadata:
  name: platform-quality-suite-probe
  namespace: monitoring
spec:
  schedule: "*/15 * * * *"
  # Forbid means a still-running Job suppresses the next scheduled run, so the
  # Job below must be bounded in time (see activeDeadlineSeconds).
  concurrencyPolicy: Forbid
  successfulJobsHistoryLimit: 2
  failedJobsHistoryLimit: 2
  jobTemplate:
    spec:
      # No retries: the script reports suite failures through metrics.
      backoffLimit: 0
      # Hard upper bound on one probe run. Without it a pod that hangs on a
      # network call would keep the Job active and, together with
      # concurrencyPolicy: Forbid, block every later run.
      activeDeadlineSeconds: 300
      template:
        metadata:
          labels:
            app: platform-quality-suite-probe
        spec:
          restartPolicy: Never
          containers:
            - name: probe
              image: curlimages/curl:8.12.1
              imagePullPolicy: IfNotPresent
              command: ["/bin/sh", "/scripts/platform_quality_suite_probe.sh"]
              env:
                - name: PUSHGATEWAY_URL
                  value: http://platform-quality-gateway.monitoring.svc.cluster.local:9091
                - name: HTTP_TIMEOUT_SECONDS
                  value: "12"
              volumeMounts:
                - name: probe-script
                  mountPath: /scripts
                  readOnly: true
          volumes:
            - name: probe-script
              configMap:
                name: platform-quality-suite-probe-script
                defaultMode: 0555

View File

@ -0,0 +1,86 @@
#!/usr/bin/env sh
# Suite probe prologue: strict shell options plus environment defaults.
# errexit/nounset abort the script on an unhandled command failure or on use
# of an unset variable.
set -o errexit -o nounset
# Both settings may be supplied by the environment; an unset or empty value
# falls back to the default given here.
: "${PUSHGATEWAY_URL:=http://platform-quality-gateway.monitoring.svc.cluster.local:9091}"
: "${HTTP_TIMEOUT_SECONDS:=12}"
# fetch_counter SUITE STATUS
#
# Print the current integer value of this probe's own
# platform_quality_gate_runs_total{suite=SUITE,status=STATUS} series as exposed
# on the Pushgateway /metrics page. Prints 0 when the gateway cannot be
# reached or no matching series exists yet.
#
# Arguments:
#   $1  suite label value to look up
#   $2  status label value to look up ("ok" or "failed" in this script)
# Output: a single integer on stdout.
# Note: POSIX sh has no function-local variables; suite/status/line/probe_job
# are script globals.
fetch_counter() {
  suite="$1"
  status="$2"
  # Must match the job segment of the push URL used by push_suite_counters.
  # Restricting the lookup to this job keeps the probe from seeding its
  # counter with a same-named series pushed by a different job.
  probe_job="platform-quality-suite-probe"
  # -m bounds the request so an unresponsive gateway cannot hang the probe.
  # curl errors are silenced on purpose: a failed fetch falls through to 0.
  line="$(curl -fsS -m "${HTTP_TIMEOUT_SECONDS}" "${PUSHGATEWAY_URL}/metrics" 2>/dev/null | awk -v job="$probe_job" -v suite="$suite" -v status="$status" '
    /^platform_quality_gate_runs_total\{/ {
      if (index($0, "job=\"" job "\"") && index($0, "suite=\"" suite "\"") && index($0, "status=\"" status "\"")) {
        print $0
        exit
      }
    }
  ' || true)"
  if [ -z "${line}" ]; then
    printf '0\n'
    return 0
  fi
  # Force an integer: the caller feeds this value into $(( )) arithmetic,
  # which rejects anything that is not a plain integer.
  printf '%s\n' "${line}" | awk '{printf "%d\n", $2}'
}
# push_suite_counters SUITE OUTCOME
#
# Read the current ok/failed counters for SUITE via fetch_counter, increment
# the one selected by OUTCOME, and push both values back to the Pushgateway
# under job "platform-quality-suite-probe" and grouping label suite=SUITE.
#
# Arguments:
#   $1  suite name (also used verbatim as a URL path segment)
#   $2  outcome; "ok" increments the ok counter, anything else the failed one
# Returns: the exit status of the push request.
push_suite_counters() {
  suite="$1"
  outcome="$2"
  ok_count="$(fetch_counter "${suite}" "ok")"
  failed_count="$(fetch_counter "${suite}" "failed")"
  if [ "${outcome}" = "ok" ]; then
    ok_count=$((ok_count + 1))
  else
    failed_count=$((failed_count + 1))
  fi
  # -m bounds the push so an unresponsive gateway cannot hang the probe.
  # The here-document is the request body, so its lines must stay unindented.
  curl -fsS -m "${HTTP_TIMEOUT_SECONDS}" --data-binary @- "${PUSHGATEWAY_URL}/metrics/job/platform-quality-suite-probe/suite/${suite}" >/dev/null <<METRICS
# TYPE platform_quality_gate_runs_total counter
platform_quality_gate_runs_total{suite="${suite}",status="ok"} ${ok_count}
platform_quality_gate_runs_total{suite="${suite}",status="failed"} ${failed_count}
METRICS
}
# check_http_suite SUITE URL EXPECTED_CODE [BODY_MATCH]
#
# Probe URL once and record the result for SUITE. The probe is "ok" when the
# HTTP status equals EXPECTED_CODE and, if BODY_MATCH is given, the response
# body contains BODY_MATCH as a literal substring. The outcome is pushed via
# push_suite_counters and logged (ok to stdout, failed to stderr).
#
# Arguments:
#   $1  suite name used for metrics and log lines
#   $2  URL to request
#   $3  expected HTTP status code, compared as a string
#   $4  optional literal text that must appear in the response body
# Returns: 0 when the outcome is ok, 1 otherwise.
check_http_suite() {
  suite="$1"
  url="$2"
  expected_code="$3"
  body_match="${4:-}"
  body_file="$(mktemp)"
  # "|| true" keeps a transport error from aborting the script; the status
  # code written by -w then simply fails the comparison below.
  code="$(curl -ksS -m "${HTTP_TIMEOUT_SECONDS}" -o "${body_file}" -w '%{http_code}' "${url}" || true)"
  outcome="failed"
  if [ "${code}" = "${expected_code}" ]; then
    # -F: treat body_match as fixed text, so characters such as "." or "["
    # in the expected snippet are not interpreted as regex syntax.
    if [ -z "${body_match}" ] || grep -qF -- "${body_match}" "${body_file}"; then
      outcome="ok"
    fi
  fi
  rm -f "${body_file}"
  push_suite_counters "${suite}" "${outcome}"
  if [ "${outcome}" = "ok" ]; then
    printf '[probe] suite=%s outcome=ok url=%s\n' "${suite}" "${url}"
    return 0
  fi
  printf '[probe] suite=%s outcome=failed url=%s code=%s\n' "${suite}" "${url}" "${code}" >&2
  return 1
}
# Probe each suite in turn and tally how many did not pass. Testing each call
# in an "if" keeps a failing probe from tripping errexit.
failures=0
if ! check_http_suite "atlasbot" "http://atlasbot.comms.svc.cluster.local:8090/health" "200" '"status": "ok"'; then
  failures=$((failures + 1))
fi
if ! check_http_suite "pegasus" "http://pegasus.jellyfin.svc.cluster.local/healthz" "200"; then
  failures=$((failures + 1))
fi
if ! check_http_suite "bstein-home" "http://bstein-dev-home-backend.bstein-dev-home.svc.cluster.local/api/healthz" "200"; then
  failures=$((failures + 1))
fi
if [ "${failures}" -eq 0 ]; then
  printf '[probe] completed with all suites passing\n'
else
  printf '[probe] completed with %s suite failure(s)\n' "${failures}" >&2
fi
# Suite failures are surfaced through the pushed metrics, so the script always
# exits successfully rather than failing the Job.
exit 0