monitoring/recovery: harden ananke checks and OIDC-gated service validation

This commit is contained in:
Brad Stein 2026-04-09 01:41:02 -03:00
parent e0b124ca4e
commit 764bfe189e
9 changed files with 137 additions and 19 deletions

View File

@ -78,6 +78,7 @@ spec:
- --upstream=http://longhorn-frontend.longhorn-system.svc.cluster.local - --upstream=http://longhorn-frontend.longhorn-system.svc.cluster.local
- --http-address=0.0.0.0:4180 - --http-address=0.0.0.0:4180
- --skip-provider-button=true - --skip-provider-button=true
- --approval-prompt=auto
- --skip-jwt-bearer-tokens=true - --skip-jwt-bearer-tokens=true
- --oidc-groups-claim=groups - --oidc-groups-claim=groups
- --cookie-domain=longhorn.bstein.dev - --cookie-domain=longhorn.bstein.dev

View File

@ -127,6 +127,13 @@ STARTUP_INGRESS_ALLOWED_STATUSES="${STARTUP_INGRESS_ALLOWED_STATUSES:-200,301,30
STARTUP_IGNORE_INGRESS_HOSTS_REGEX="${STARTUP_IGNORE_INGRESS_HOSTS_REGEX:-}" STARTUP_IGNORE_INGRESS_HOSTS_REGEX="${STARTUP_IGNORE_INGRESS_HOSTS_REGEX:-}"
STARTUP_INGRESS_CHECK_TIMEOUT_SECONDS="${STARTUP_INGRESS_CHECK_TIMEOUT_SECONDS:-10}" STARTUP_INGRESS_CHECK_TIMEOUT_SECONDS="${STARTUP_INGRESS_CHECK_TIMEOUT_SECONDS:-10}"
SHUTDOWN_NAMESPACE_EXCLUDES_REGEX="${SHUTDOWN_NAMESPACE_EXCLUDES_REGEX:-^(kube-system|kube-public|kube-node-lease|flux-system|traefik|metallb-system|cert-manager|longhorn-system|vault|postgres|maintenance)$}" SHUTDOWN_NAMESPACE_EXCLUDES_REGEX="${SHUTDOWN_NAMESPACE_EXCLUDES_REGEX:-^(kube-system|kube-public|kube-node-lease|flux-system|traefik|metallb-system|cert-manager|longhorn-system|vault|postgres|maintenance)$}"
# When "1" or "true", planned shutdown refuses to continue if the replica
# snapshot it just saved contains zero rows.
REQUIRE_NONEMPTY_REPLICA_SNAPSHOT="${REQUIRE_NONEMPTY_REPLICA_SNAPSHOT:-1}"
# When "1" or "true", check_mail_safeguards_once runs its probes; any other
# value turns that check into a no-op that reports success.
STARTUP_REQUIRE_MAIL_SAFEGUARDS="${STARTUP_REQUIRE_MAIL_SAFEGUARDS:-1}"
# Namespace whose Endpoints objects are queried for the mail services below.
MAIL_STARTUP_NAMESPACE="${MAIL_STARTUP_NAMESPACE:-mailu-mailserver}"
# Comma-separated service names; each must expose at least one endpoint address.
MAIL_STARTUP_ENDPOINT_SERVICES="${MAIL_STARTUP_ENDPOINT_SERVICES:-mailu-front,mailu-postfix,mailu-dovecot}"
# Host probed over TCP; an empty value skips the port probes entirely.
MAIL_STARTUP_HOST="${MAIL_STARTUP_HOST:-mail.bstein.dev}"
# Comma-separated TCP ports probed on MAIL_STARTUP_HOST; non-numeric entries
# are skipped.
MAIL_STARTUP_TCP_PORTS="${MAIL_STARTUP_TCP_PORTS:-25,465,587,993,995}"
# Per-port connect deadline, handed to timeout(1) as its duration argument.
MAIL_STARTUP_TCP_TIMEOUT_SECONDS="${MAIL_STARTUP_TCP_TIMEOUT_SECONDS:-3}"
BUNDLE_HTTP_PORT="${BUNDLE_HTTP_PORT:-8877}" BUNDLE_HTTP_PORT="${BUNDLE_HTTP_PORT:-8877}"
STATE_ROOT="${HOME}/${STATE_SUBDIR:-.local/share/ananke}" STATE_ROOT="${HOME}/${STATE_SUBDIR:-.local/share/ananke}"
RECOVERY_STATE_FILE="${STATE_ROOT}/cluster_power_recovery.state" RECOVERY_STATE_FILE="${STATE_ROOT}/cluster_power_recovery.state"
@ -656,6 +663,46 @@ service_status_allowed() {
return 1 return 1
} }
# Run the mail startup safeguards a single time.
#
# The check is active only when STARTUP_REQUIRE_MAIL_SAFEGUARDS is "1" or
# "true". When active it performs two probes:
#   1. every service listed in MAIL_STARTUP_ENDPOINT_SERVICES (CSV) must report
#      at least one endpoint address in MAIL_STARTUP_NAMESPACE;
#   2. if MAIL_STARTUP_HOST is non-empty, every numeric port listed in
#      MAIL_STARTUP_TCP_PORTS (CSV) must accept a TCP connection within
#      MAIL_STARTUP_TCP_TIMEOUT_SECONDS.
#
# Globals:   STARTUP_REQUIRE_MAIL_SAFEGUARDS, MAIL_STARTUP_NAMESPACE,
#            MAIL_STARTUP_ENDPOINT_SERVICES, MAIL_STARTUP_HOST,
#            MAIL_STARTUP_TCP_PORTS, MAIL_STARTUP_TCP_TIMEOUT_SECONDS (all read)
# Arguments: $1 - "1" to suppress warnings (default "0")
# Returns:   0 when the check is disabled or every probe passed, 1 otherwise
check_mail_safeguards_once() {
  local quiet="${1:-0}"
  local failures=0 namespace service host port ips
  local -a services=() ports=()

  if [[ "${STARTUP_REQUIRE_MAIL_SAFEGUARDS}" != "1" && "${STARTUP_REQUIRE_MAIL_SAFEGUARDS}" != "true" ]]; then
    return 0
  fi

  namespace="${MAIL_STARTUP_NAMESPACE}"
  as_array_from_csv "${MAIL_STARTUP_ENDPOINT_SERVICES}" services
  for service in "${services[@]}"; do
    service="${service//[[:space:]]/}"
    [[ -n "${service}" ]] || continue
    # A kubectl failure is treated the same as "no addresses": both count as
    # a failed probe rather than aborting the caller.
    ips="$(kubectl -n "${namespace}" get endpoints "${service}" -o jsonpath='{.subsets[*].addresses[*].ip}' 2>/dev/null || true)"
    if [[ -z "${ips//[[:space:]]/}" ]]; then
      if [[ "${quiet}" != "1" ]]; then
        warn "startup-check mail-endpoints ${namespace}/${service}: no ready endpoints."
      fi
      failures=1
    fi
  done

  host="${MAIL_STARTUP_HOST}"
  if [[ -n "${host}" ]]; then
    as_array_from_csv "${MAIL_STARTUP_TCP_PORTS}" ports
    for port in "${ports[@]}"; do
      port="${port//[[:space:]]/}"
      [[ -n "${port}" ]] || continue
      if [[ ! "${port}" =~ ^[0-9]+$ ]]; then
        # Still skipped (not a failure), but no longer silently: a typo in the
        # port list would otherwise leave that port unchecked without a trace.
        if [[ "${quiet}" != "1" ]]; then
          warn "startup-check mail-tcp ${host}: ignoring non-numeric port '${port}'."
        fi
        continue
      fi
      # Host and port are passed as positional parameters instead of being
      # spliced into the script text, so their contents are never parsed as
      # shell code. A plain (non-login) shell is used so profile files cannot
      # consume the short timeout budget.
      if ! timeout "${MAIL_STARTUP_TCP_TIMEOUT_SECONDS}" bash -c '<"/dev/tcp/$1/$2"' _ "${host}" "${port}" >/dev/null 2>&1; then
        if [[ "${quiet}" != "1" ]]; then
          warn "startup-check mail-tcp ${host}:${port}: connect failed."
        fi
        failures=1
      fi
    done
  fi

  (( failures == 0 ))
}
check_startup_service_checklist_once() { check_startup_service_checklist_once() {
local rows row name url expected body_must body_must_not insecure timeout code rc local rows row name url expected body_must body_must_not insecure timeout code rc
local body_file failures local body_file failures
@ -700,6 +747,9 @@ check_startup_service_checklist_once() {
fi fi
rm -f "${body_file}" rm -f "${body_file}"
done <<< "${rows}" done <<< "${rows}"
if ! check_mail_safeguards_once; then
failures=1
fi
(( failures == 0 )) (( failures == 0 ))
} }
@ -887,7 +937,15 @@ save_workload_replica_snapshot() {
printf '%s\t%s\t%s\t%s\n' "${ns}" "${kind}" "${name}" "${replicas}" >> "${REPLICA_SNAPSHOT_FILE}" printf '%s\t%s\t%s\t%s\n' "${ns}" "${kind}" "${name}" "${replicas}" >> "${REPLICA_SNAPSHOT_FILE}"
done <<< "${rows}" done <<< "${rows}"
log "replica-snapshot-file=${REPLICA_SNAPSHOT_FILE}" log "replica-snapshot-file=${REPLICA_SNAPSHOT_FILE}"
log "replica-snapshot-count=$(wc -l < "${REPLICA_SNAPSHOT_FILE}" | tr -d ' ')" log "replica-snapshot-count=$(replica_snapshot_count)"
}
# Print the number of rows in the replica snapshot file.
#
# Globals: REPLICA_SNAPSHOT_FILE (read)
# Outputs: a non-negative integer on stdout; 0 when the file is missing, is
#          not a regular file, or cannot be read
# Returns: 0
replica_snapshot_count() {
  local rows=0
  if [[ -f "${REPLICA_SNAPSHOT_FILE}" ]]; then
    # wc implementations that pad the count with spaces are normalised below.
    rows="$(wc -l < "${REPLICA_SNAPSHOT_FILE}")" || rows=0
    rows="${rows//[[:space:]]/}"
  fi
  # Never hand callers an empty or non-numeric value.
  [[ "${rows}" =~ ^[0-9]+$ ]] || rows=0
  printf '%s\n' "${rows}"
}
restore_workload_replica_snapshot() { restore_workload_replica_snapshot() {
@ -915,6 +973,35 @@ restore_workload_replica_snapshot() {
mark_checkpoint startup_replicas_restored mark_checkpoint startup_replicas_restored
} }
# Scale Helm-managed Deployments and StatefulSets that sit at 0 replicas back
# up to 1.
#
# A workload is treated as Helm-managed when its
# meta.helm.sh/release-name annotation is present. Workloads in namespaces
# rejected by startup_workload_namespace_excluded, or whose "ns/name" matches
# STARTUP_IGNORE_WORKLOADS_REGEX, are left alone.
#
# Globals: STARTUP_IGNORE_WORKLOADS_REGEX (read)
# Outputs: progress via log/warn; marks the
#          startup_zero_scaled_helm_restored checkpoint when at least one
#          workload was actually scaled
# Returns: 0
restore_zero_scaled_helm_workloads() {
  local rows ns kind name
  local restored=0
  local columns='custom-columns=NS:.metadata.namespace,NAME:.metadata.name,REPLICAS:.spec.replicas,HELM:.metadata.annotations.meta\.helm\.sh/release-name'

  # Listing is best-effort: a failed kubectl call yields no rows instead of
  # aborting the caller when errexit/pipefail are active.
  rows="$(
    for kind in deployment statefulset; do
      kubectl get "${kind}" -A -o "${columns}" --no-headers 2>/dev/null \
        | awk -v kind="${kind}" '$3 ~ /^[0-9]+$/ && $3 == 0 && $4 != "<none>" {printf "%s\t%s\t%s\n", $1, kind, $2}' \
        || true
    done | sed '/^[[:space:]]*$/d'
  )"

  while IFS=$'\t' read -r ns kind name; do
    [[ -n "${ns}" && -n "${kind}" && -n "${name}" ]] || continue
    startup_workload_namespace_excluded "${ns}" && continue
    if [[ -n "${STARTUP_IGNORE_WORKLOADS_REGEX}" ]] && [[ "${ns}/${name}" =~ ${STARTUP_IGNORE_WORKLOADS_REGEX} ]]; then
      continue
    fi
    warn "Auto-heal: restoring zero-scaled Helm workload ${ns}/${kind}/${name} to replicas=1."
    # Count only scale operations that succeeded, so the summary line and the
    # checkpoint reflect what really happened.
    if run kubectl -n "${ns}" scale "${kind}" "${name}" --replicas=1; then
      restored=$((restored + 1))
    else
      warn "Auto-heal: failed to scale ${ns}/${kind}/${name}; continuing."
    fi
  done <<< "${rows}"

  if (( restored > 0 )); then
    log "Auto-heal: restored ${restored} zero-scaled Helm workloads."
    mark_checkpoint startup_zero_scaled_helm_restored
  else
    log "Auto-heal: no zero-scaled Helm workloads detected."
  fi
}
list_unhealthy_workloads() { list_unhealthy_workloads() {
local rows line ns name desired ready available local rows line ns name desired ready available
rows="$(kubectl get deployment -A -o custom-columns=NS:.metadata.namespace,NAME:.metadata.name,DESIRED:.spec.replicas,READY:.status.readyReplicas,AVAILABLE:.status.availableReplicas --no-headers 2>/dev/null || true)" rows="$(kubectl get deployment -A -o custom-columns=NS:.metadata.namespace,NAME:.metadata.name,DESIRED:.spec.replicas,READY:.status.readyReplicas,AVAILABLE:.status.availableReplicas --no-headers 2>/dev/null || true)"
@ -1548,6 +1635,7 @@ resume_flux_and_reconcile() {
status_report() { status_report() {
local battery flux_ready flux_url flux_branch flux_url_drift flux_branch_drift harbor_code workers ingress_hosts_count local battery flux_ready flux_url flux_branch flux_url_drift flux_branch_drift harbor_code workers ingress_hosts_count
local mail_safeguards_ok
local effective_target effective_canary local effective_target effective_canary
local labeled_nodes local labeled_nodes
battery="$(read_ups_battery || true)" battery="$(read_ups_battery || true)"
@ -1600,6 +1688,15 @@ status_report() {
echo "flux_source_branch_drift=${flux_branch_drift}" echo "flux_source_branch_drift=${flux_branch_drift}"
echo "flux_source_ready=${flux_ready:-unknown}" echo "flux_source_ready=${flux_ready:-unknown}"
echo "ingress_hosts_count=${ingress_hosts_count}" echo "ingress_hosts_count=${ingress_hosts_count}"
if check_mail_safeguards_once 1; then
mail_safeguards_ok=true
else
mail_safeguards_ok=false
fi
echo "mail_startup_safeguards_required=${STARTUP_REQUIRE_MAIL_SAFEGUARDS}"
echo "mail_startup_safeguards_ok=${mail_safeguards_ok}"
echo "mail_startup_host=${MAIL_STARTUP_HOST}"
echo "mail_startup_ports=${MAIL_STARTUP_TCP_PORTS}"
echo "harbor_http=${harbor_code:-unknown}" echo "harbor_http=${harbor_code:-unknown}"
kubectl get ingressclass traefik >/dev/null 2>&1 && echo "traefik_ingressclass=true" || echo "traefik_ingressclass=false" kubectl get ingressclass traefik >/dev/null 2>&1 && echo "traefik_ingressclass=true" || echo "traefik_ingressclass=false"
kubectl -n traefik get deploy traefik >/dev/null 2>&1 && echo "traefik_deploy=true" || echo "traefik_deploy=false" kubectl -n traefik get deploy traefik >/dev/null 2>&1 && echo "traefik_deploy=true" || echo "traefik_deploy=false"
@ -1636,6 +1733,16 @@ planned_shutdown() {
fi fi
save_workload_replica_snapshot save_workload_replica_snapshot
if [[ "${REQUIRE_NONEMPTY_REPLICA_SNAPSHOT}" == "1" || "${REQUIRE_NONEMPTY_REPLICA_SNAPSHOT}" == "true" ]]; then
local replica_count
replica_count="$(replica_snapshot_count)"
if [[ ! "${replica_count}" =~ ^[0-9]+$ ]]; then
replica_count=0
fi
if (( replica_count == 0 )); then
die "Replica snapshot is empty at ${REPLICA_SNAPSHOT_FILE}; refusing shutdown to avoid startup restore deadlock."
fi
fi
mark_checkpoint shutdown_replicas_snapshot mark_checkpoint shutdown_replicas_snapshot
patch_flux_suspend_all true patch_flux_suspend_all true
@ -1772,6 +1879,7 @@ startup_flow() {
resume_flux_and_reconcile resume_flux_and_reconcile
wait_for_flux_kustomizations_ready wait_for_flux_kustomizations_ready
restore_workload_replica_snapshot restore_workload_replica_snapshot
restore_zero_scaled_helm_workloads
wait_for_startup_workloads_ready wait_for_startup_workloads_ready
wait_for_startup_service_checklist wait_for_startup_service_checklist
wait_for_startup_stability_window wait_for_startup_stability_window

View File

@ -422,12 +422,14 @@ ARIADNE_ACCESS_REQUESTS = "ariadne_access_requests_total"
PLATFORM_TEST_SUCCESS_EVENTS_30D = ( PLATFORM_TEST_SUCCESS_EVENTS_30D = (
'(sum(increase(ariadne_task_runs_total{status="ok"}[30d])) or on() vector(0)) + ' '(sum(increase(ariadne_task_runs_total{status="ok"}[30d])) or on() vector(0)) + '
'(sum(increase(metis_builds_total{status="ok"}[30d])) or on() vector(0)) + ' '(sum(increase(metis_builds_total{status="ok"}[30d])) or on() vector(0)) + '
'(sum(increase(metis_flashes_total{status="ok"}[30d])) or on() vector(0))' '(sum(increase(metis_flashes_total{status="ok"}[30d])) or on() vector(0)) + '
'(sum(increase(ananke_quality_gate_runs_total{suite="ananke",status="ok"}[30d])) or on() vector(0))'
) )
PLATFORM_TEST_TOTAL_EVENTS_30D = ( PLATFORM_TEST_TOTAL_EVENTS_30D = (
"(sum(increase(ariadne_task_runs_total[30d])) or on() vector(0)) + " "(sum(increase(ariadne_task_runs_total[30d])) or on() vector(0)) + "
"(sum(increase(metis_builds_total[30d])) or on() vector(0)) + " "(sum(increase(metis_builds_total[30d])) or on() vector(0)) + "
"(sum(increase(metis_flashes_total[30d])) or on() vector(0))" "(sum(increase(metis_flashes_total[30d])) or on() vector(0)) + "
"(sum(increase(ananke_quality_gate_runs_total{suite=\"ananke\"}[30d])) or on() vector(0))"
) )
TEST_SUCCESS_RATE = ( TEST_SUCCESS_RATE = (
f"100 * ({PLATFORM_TEST_SUCCESS_EVENTS_30D}) / clamp_min(({PLATFORM_TEST_TOTAL_EVENTS_30D}), 1)" f"100 * ({PLATFORM_TEST_SUCCESS_EVENTS_30D}) / clamp_min(({PLATFORM_TEST_TOTAL_EVENTS_30D}), 1)"
@ -435,12 +437,14 @@ TEST_SUCCESS_RATE = (
TEST_FAILURES_24H_TOTAL = ( TEST_FAILURES_24H_TOTAL = (
'(sum(increase(ariadne_task_runs_total{status!="ok"}[24h])) or on() vector(0)) + ' '(sum(increase(ariadne_task_runs_total{status!="ok"}[24h])) or on() vector(0)) + '
'(sum(increase(metis_builds_total{status="error"}[24h])) or on() vector(0)) + ' '(sum(increase(metis_builds_total{status="error"}[24h])) or on() vector(0)) + '
'(sum(increase(metis_flashes_total{status="error"}[24h])) or on() vector(0))' '(sum(increase(metis_flashes_total{status="error"}[24h])) or on() vector(0)) + '
'(sum(increase(ananke_quality_gate_runs_total{suite="ananke",status="failed"}[24h])) or on() vector(0))'
) )
PLATFORM_TEST_ACTIVITY_30D = ( PLATFORM_TEST_ACTIVITY_30D = (
'label_replace(sum by (status) (increase(ariadne_task_runs_total[30d])), "source", "ariadne", "__name__", ".*") ' 'label_replace(sum by (status) (increase(ariadne_task_runs_total[30d])), "source", "ariadne", "__name__", ".*") '
'or label_replace(sum by (status) (increase(metis_builds_total[30d])), "source", "metis-build", "__name__", ".*") ' 'or label_replace(sum by (status) (increase(metis_builds_total[30d])), "source", "metis-build", "__name__", ".*") '
'or label_replace(sum by (status) (increase(metis_flashes_total[30d])), "source", "metis-flash", "__name__", ".*")' 'or label_replace(sum by (status) (increase(metis_flashes_total[30d])), "source", "metis-flash", "__name__", ".*") '
'or label_replace(sum by (status) (increase(ananke_quality_gate_runs_total{suite="ananke"}[30d])), "source", "ananke-quality", "__name__", ".*")'
) )
PLATFORM_TEST_SUCCESS_RATE_BY_SUITE_SERIES = ( PLATFORM_TEST_SUCCESS_RATE_BY_SUITE_SERIES = (
'label_replace(100 * (sum(increase(ariadne_task_runs_total{status="ok"}[$__interval])) or on() vector(0)) ' 'label_replace(100 * (sum(increase(ariadne_task_runs_total{status="ok"}[$__interval])) or on() vector(0)) '
@ -451,7 +455,10 @@ PLATFORM_TEST_SUCCESS_RATE_BY_SUITE_SERIES = (
'"suite", "metis-build", "__name__", ".*") ' '"suite", "metis-build", "__name__", ".*") '
'or label_replace(100 * (sum(increase(metis_flashes_total{status="ok"}[$__interval])) or on() vector(0)) ' 'or label_replace(100 * (sum(increase(metis_flashes_total{status="ok"}[$__interval])) or on() vector(0)) '
'/ clamp_min((sum(increase(metis_flashes_total[$__interval])) or on() vector(0)), 1), ' '/ clamp_min((sum(increase(metis_flashes_total[$__interval])) or on() vector(0)), 1), '
'"suite", "metis-flash", "__name__", ".*")' '"suite", "metis-flash", "__name__", ".*") '
'or label_replace(100 * (sum(increase(ananke_quality_gate_runs_total{suite="ananke",status="ok"}[$__interval])) or on() vector(0)) '
'/ clamp_min((sum(increase(ananke_quality_gate_runs_total{suite="ananke"}[$__interval])) or on() vector(0)), 1), '
'"suite", "ananke-quality", "__name__", ".*")'
) )
ANANKE_SELECTOR = 'job="ananke-power"' ANANKE_SELECTOR = 'job="ananke-power"'
ANANKE_UPS_DB_NAME = "Pyrphoros" ANANKE_UPS_DB_NAME = "Pyrphoros"
@ -1501,7 +1508,7 @@ def build_overview():
links=link_to("atlas-jobs"), links=link_to("atlas-jobs"),
) )
test_success["description"] = ( test_success["description"] = (
"Rolling pass rate per platform suite (Ariadne task automation + Metis build/flash pipelines). " "Rolling pass rate per platform suite (Ariadne task automation + Metis build/flash pipelines + Ananke quality gate). "
"Each line tracks pass percentage over time for its suite." "Each line tracks pass percentage over time for its suite."
) )
panels.append(test_success) panels.append(test_success)

View File

@ -82,6 +82,7 @@ spec:
- --upstream=http://opensearch-dashboards.logging.svc.cluster.local:5601 - --upstream=http://opensearch-dashboards.logging.svc.cluster.local:5601
- --http-address=0.0.0.0:4180 - --http-address=0.0.0.0:4180
- --skip-provider-button=true - --skip-provider-button=true
- --approval-prompt=auto
- --skip-jwt-bearer-tokens=true - --skip-jwt-bearer-tokens=true
- --cookie-domain=logs.bstein.dev - --cookie-domain=logs.bstein.dev
env: env:

View File

@ -93,6 +93,7 @@ spec:
- --upstream=http://metis.maintenance.svc.cluster.local - --upstream=http://metis.maintenance.svc.cluster.local
- --http-address=0.0.0.0:4180 - --http-address=0.0.0.0:4180
- --skip-provider-button=true - --skip-provider-button=true
- --approval-prompt=auto
- --skip-jwt-bearer-tokens=true - --skip-jwt-bearer-tokens=true
- --oidc-groups-claim=groups - --oidc-groups-claim=groups
- --cookie-domain=sentinel.bstein.dev - --cookie-domain=sentinel.bstein.dev

View File

@ -1138,7 +1138,7 @@
}, },
"targets": [ "targets": [
{ {
"expr": "100 * ((sum(increase(ariadne_task_runs_total{status=\"ok\"}[30d])) or on() vector(0)) + (sum(increase(metis_builds_total{status=\"ok\"}[30d])) or on() vector(0)) + (sum(increase(metis_flashes_total{status=\"ok\"}[30d])) or on() vector(0))) / clamp_min(((sum(increase(ariadne_task_runs_total[30d])) or on() vector(0)) + (sum(increase(metis_builds_total[30d])) or on() vector(0)) + (sum(increase(metis_flashes_total[30d])) or on() vector(0))), 1)", "expr": "100 * ((sum(increase(ariadne_task_runs_total{status=\"ok\"}[30d])) or on() vector(0)) + (sum(increase(metis_builds_total{status=\"ok\"}[30d])) or on() vector(0)) + (sum(increase(metis_flashes_total{status=\"ok\"}[30d])) or on() vector(0)) + (sum(increase(ananke_quality_gate_runs_total{suite=\"ananke\",status=\"ok\"}[30d])) or on() vector(0))) / clamp_min(((sum(increase(ariadne_task_runs_total[30d])) or on() vector(0)) + (sum(increase(metis_builds_total[30d])) or on() vector(0)) + (sum(increase(metis_flashes_total[30d])) or on() vector(0)) + (sum(increase(ananke_quality_gate_runs_total{suite=\"ananke\"}[30d])) or on() vector(0))), 1)",
"refId": "A", "refId": "A",
"instant": true "instant": true
} }
@ -1201,7 +1201,7 @@
}, },
"targets": [ "targets": [
{ {
"expr": "label_replace(sum by (status) (increase(ariadne_task_runs_total[30d])), \"source\", \"ariadne\", \"__name__\", \".*\") or label_replace(sum by (status) (increase(metis_builds_total[30d])), \"source\", \"metis-build\", \"__name__\", \".*\") or label_replace(sum by (status) (increase(metis_flashes_total[30d])), \"source\", \"metis-flash\", \"__name__\", \".*\")", "expr": "label_replace(sum by (status) (increase(ariadne_task_runs_total[30d])), \"source\", \"ariadne\", \"__name__\", \".*\") or label_replace(sum by (status) (increase(metis_builds_total[30d])), \"source\", \"metis-build\", \"__name__\", \".*\") or label_replace(sum by (status) (increase(metis_flashes_total[30d])), \"source\", \"metis-flash\", \"__name__\", \".*\") or label_replace(sum by (status) (increase(ananke_quality_gate_runs_total{suite=\"ananke\"}[30d])), \"source\", \"ananke-quality\", \"__name__\", \".*\")",
"refId": "A", "refId": "A",
"instant": true "instant": true
} }
@ -1253,7 +1253,7 @@
"targets": [ "targets": [
{ {
"refId": "A", "refId": "A",
"expr": "label_replace(100 * (sum(increase(ariadne_task_runs_total{status=\"ok\"}[$__interval])) or on() vector(0)) / clamp_min((sum(increase(ariadne_task_runs_total[$__interval])) or on() vector(0)), 1), \"suite\", \"ariadne\", \"__name__\", \".*\") or label_replace(100 * (sum(increase(metis_builds_total{status=\"ok\"}[$__interval])) or on() vector(0)) / clamp_min((sum(increase(metis_builds_total[$__interval])) or on() vector(0)), 1), \"suite\", \"metis-build\", \"__name__\", \".*\") or label_replace(100 * (sum(increase(metis_flashes_total{status=\"ok\"}[$__interval])) or on() vector(0)) / clamp_min((sum(increase(metis_flashes_total[$__interval])) or on() vector(0)), 1), \"suite\", \"metis-flash\", \"__name__\", \".*\")", "expr": "label_replace(100 * (sum(increase(ariadne_task_runs_total{status=\"ok\"}[$__interval])) or on() vector(0)) / clamp_min((sum(increase(ariadne_task_runs_total[$__interval])) or on() vector(0)), 1), \"suite\", \"ariadne\", \"__name__\", \".*\") or label_replace(100 * (sum(increase(metis_builds_total{status=\"ok\"}[$__interval])) or on() vector(0)) / clamp_min((sum(increase(metis_builds_total[$__interval])) or on() vector(0)), 1), \"suite\", \"metis-build\", \"__name__\", \".*\") or label_replace(100 * (sum(increase(metis_flashes_total{status=\"ok\"}[$__interval])) or on() vector(0)) / clamp_min((sum(increase(metis_flashes_total[$__interval])) or on() vector(0)), 1), \"suite\", \"metis-flash\", \"__name__\", \".*\") or label_replace(100 * (sum(increase(ananke_quality_gate_runs_total{suite=\"ananke\",status=\"ok\"}[$__interval])) or on() vector(0)) / clamp_min((sum(increase(ananke_quality_gate_runs_total{suite=\"ananke\"}[$__interval])) or on() vector(0)), 1), \"suite\", \"ananke-quality\", \"__name__\", \".*\")",
"legendFormat": "{{suite}}" "legendFormat": "{{suite}}"
} }
], ],

View File

@ -1820,7 +1820,7 @@
"targets": [ "targets": [
{ {
"refId": "A", "refId": "A",
"expr": "label_replace(100 * (sum(increase(ariadne_task_runs_total{status=\"ok\"}[$__interval])) or on() vector(0)) / clamp_min((sum(increase(ariadne_task_runs_total[$__interval])) or on() vector(0)), 1), \"suite\", \"ariadne\", \"__name__\", \".*\") or label_replace(100 * (sum(increase(metis_builds_total{status=\"ok\"}[$__interval])) or on() vector(0)) / clamp_min((sum(increase(metis_builds_total[$__interval])) or on() vector(0)), 1), \"suite\", \"metis-build\", \"__name__\", \".*\") or label_replace(100 * (sum(increase(metis_flashes_total{status=\"ok\"}[$__interval])) or on() vector(0)) / clamp_min((sum(increase(metis_flashes_total[$__interval])) or on() vector(0)), 1), \"suite\", \"metis-flash\", \"__name__\", \".*\")", "expr": "label_replace(100 * (sum(increase(ariadne_task_runs_total{status=\"ok\"}[$__interval])) or on() vector(0)) / clamp_min((sum(increase(ariadne_task_runs_total[$__interval])) or on() vector(0)), 1), \"suite\", \"ariadne\", \"__name__\", \".*\") or label_replace(100 * (sum(increase(metis_builds_total{status=\"ok\"}[$__interval])) or on() vector(0)) / clamp_min((sum(increase(metis_builds_total[$__interval])) or on() vector(0)), 1), \"suite\", \"metis-build\", \"__name__\", \".*\") or label_replace(100 * (sum(increase(metis_flashes_total{status=\"ok\"}[$__interval])) or on() vector(0)) / clamp_min((sum(increase(metis_flashes_total[$__interval])) or on() vector(0)), 1), \"suite\", \"metis-flash\", \"__name__\", \".*\") or label_replace(100 * (sum(increase(ananke_quality_gate_runs_total{suite=\"ananke\",status=\"ok\"}[$__interval])) or on() vector(0)) / clamp_min((sum(increase(ananke_quality_gate_runs_total{suite=\"ananke\"}[$__interval])) or on() vector(0)), 1), \"suite\", \"ananke-quality\", \"__name__\", \".*\")",
"legendFormat": "{{suite}}" "legendFormat": "{{suite}}"
} }
], ],
@ -1846,7 +1846,7 @@
"targetBlank": true "targetBlank": true
} }
], ],
"description": "Rolling pass rate per platform suite (Ariadne task automation + Metis build/flash pipelines). Each line tracks pass percentage over time for its suite." "description": "Rolling pass rate per platform suite (Ariadne task automation + Metis build/flash pipelines + Ananke quality gate). Each line tracks pass percentage over time for its suite."
}, },
{ {
"id": 47, "id": 47,
@ -1864,7 +1864,7 @@
}, },
"targets": [ "targets": [
{ {
"expr": "(sum(increase(ariadne_task_runs_total{status!=\"ok\"}[24h])) or on() vector(0)) + (sum(increase(metis_builds_total{status=\"error\"}[24h])) or on() vector(0)) + (sum(increase(metis_flashes_total{status=\"error\"}[24h])) or on() vector(0))", "expr": "(sum(increase(ariadne_task_runs_total{status!=\"ok\"}[24h])) or on() vector(0)) + (sum(increase(metis_builds_total{status=\"error\"}[24h])) or on() vector(0)) + (sum(increase(metis_flashes_total{status=\"error\"}[24h])) or on() vector(0)) + (sum(increase(ananke_quality_gate_runs_total{suite=\"ananke\",status=\"failed\"}[24h])) or on() vector(0))",
"refId": "A", "refId": "A",
"instant": true "instant": true
} }

View File

@ -1147,7 +1147,7 @@ data:
}, },
"targets": [ "targets": [
{ {
"expr": "100 * ((sum(increase(ariadne_task_runs_total{status=\"ok\"}[30d])) or on() vector(0)) + (sum(increase(metis_builds_total{status=\"ok\"}[30d])) or on() vector(0)) + (sum(increase(metis_flashes_total{status=\"ok\"}[30d])) or on() vector(0))) / clamp_min(((sum(increase(ariadne_task_runs_total[30d])) or on() vector(0)) + (sum(increase(metis_builds_total[30d])) or on() vector(0)) + (sum(increase(metis_flashes_total[30d])) or on() vector(0))), 1)", "expr": "100 * ((sum(increase(ariadne_task_runs_total{status=\"ok\"}[30d])) or on() vector(0)) + (sum(increase(metis_builds_total{status=\"ok\"}[30d])) or on() vector(0)) + (sum(increase(metis_flashes_total{status=\"ok\"}[30d])) or on() vector(0)) + (sum(increase(ananke_quality_gate_runs_total{suite=\"ananke\",status=\"ok\"}[30d])) or on() vector(0))) / clamp_min(((sum(increase(ariadne_task_runs_total[30d])) or on() vector(0)) + (sum(increase(metis_builds_total[30d])) or on() vector(0)) + (sum(increase(metis_flashes_total[30d])) or on() vector(0)) + (sum(increase(ananke_quality_gate_runs_total{suite=\"ananke\"}[30d])) or on() vector(0))), 1)",
"refId": "A", "refId": "A",
"instant": true "instant": true
} }
@ -1210,7 +1210,7 @@ data:
}, },
"targets": [ "targets": [
{ {
"expr": "label_replace(sum by (status) (increase(ariadne_task_runs_total[30d])), \"source\", \"ariadne\", \"__name__\", \".*\") or label_replace(sum by (status) (increase(metis_builds_total[30d])), \"source\", \"metis-build\", \"__name__\", \".*\") or label_replace(sum by (status) (increase(metis_flashes_total[30d])), \"source\", \"metis-flash\", \"__name__\", \".*\")", "expr": "label_replace(sum by (status) (increase(ariadne_task_runs_total[30d])), \"source\", \"ariadne\", \"__name__\", \".*\") or label_replace(sum by (status) (increase(metis_builds_total[30d])), \"source\", \"metis-build\", \"__name__\", \".*\") or label_replace(sum by (status) (increase(metis_flashes_total[30d])), \"source\", \"metis-flash\", \"__name__\", \".*\") or label_replace(sum by (status) (increase(ananke_quality_gate_runs_total{suite=\"ananke\"}[30d])), \"source\", \"ananke-quality\", \"__name__\", \".*\")",
"refId": "A", "refId": "A",
"instant": true "instant": true
} }
@ -1262,7 +1262,7 @@ data:
"targets": [ "targets": [
{ {
"refId": "A", "refId": "A",
"expr": "label_replace(100 * (sum(increase(ariadne_task_runs_total{status=\"ok\"}[$__interval])) or on() vector(0)) / clamp_min((sum(increase(ariadne_task_runs_total[$__interval])) or on() vector(0)), 1), \"suite\", \"ariadne\", \"__name__\", \".*\") or label_replace(100 * (sum(increase(metis_builds_total{status=\"ok\"}[$__interval])) or on() vector(0)) / clamp_min((sum(increase(metis_builds_total[$__interval])) or on() vector(0)), 1), \"suite\", \"metis-build\", \"__name__\", \".*\") or label_replace(100 * (sum(increase(metis_flashes_total{status=\"ok\"}[$__interval])) or on() vector(0)) / clamp_min((sum(increase(metis_flashes_total[$__interval])) or on() vector(0)), 1), \"suite\", \"metis-flash\", \"__name__\", \".*\")", "expr": "label_replace(100 * (sum(increase(ariadne_task_runs_total{status=\"ok\"}[$__interval])) or on() vector(0)) / clamp_min((sum(increase(ariadne_task_runs_total[$__interval])) or on() vector(0)), 1), \"suite\", \"ariadne\", \"__name__\", \".*\") or label_replace(100 * (sum(increase(metis_builds_total{status=\"ok\"}[$__interval])) or on() vector(0)) / clamp_min((sum(increase(metis_builds_total[$__interval])) or on() vector(0)), 1), \"suite\", \"metis-build\", \"__name__\", \".*\") or label_replace(100 * (sum(increase(metis_flashes_total{status=\"ok\"}[$__interval])) or on() vector(0)) / clamp_min((sum(increase(metis_flashes_total[$__interval])) or on() vector(0)), 1), \"suite\", \"metis-flash\", \"__name__\", \".*\") or label_replace(100 * (sum(increase(ananke_quality_gate_runs_total{suite=\"ananke\",status=\"ok\"}[$__interval])) or on() vector(0)) / clamp_min((sum(increase(ananke_quality_gate_runs_total{suite=\"ananke\"}[$__interval])) or on() vector(0)), 1), \"suite\", \"ananke-quality\", \"__name__\", \".*\")",
"legendFormat": "{{suite}}" "legendFormat": "{{suite}}"
} }
], ],

View File

@ -1829,7 +1829,7 @@ data:
"targets": [ "targets": [
{ {
"refId": "A", "refId": "A",
"expr": "label_replace(100 * (sum(increase(ariadne_task_runs_total{status=\"ok\"}[$__interval])) or on() vector(0)) / clamp_min((sum(increase(ariadne_task_runs_total[$__interval])) or on() vector(0)), 1), \"suite\", \"ariadne\", \"__name__\", \".*\") or label_replace(100 * (sum(increase(metis_builds_total{status=\"ok\"}[$__interval])) or on() vector(0)) / clamp_min((sum(increase(metis_builds_total[$__interval])) or on() vector(0)), 1), \"suite\", \"metis-build\", \"__name__\", \".*\") or label_replace(100 * (sum(increase(metis_flashes_total{status=\"ok\"}[$__interval])) or on() vector(0)) / clamp_min((sum(increase(metis_flashes_total[$__interval])) or on() vector(0)), 1), \"suite\", \"metis-flash\", \"__name__\", \".*\")", "expr": "label_replace(100 * (sum(increase(ariadne_task_runs_total{status=\"ok\"}[$__interval])) or on() vector(0)) / clamp_min((sum(increase(ariadne_task_runs_total[$__interval])) or on() vector(0)), 1), \"suite\", \"ariadne\", \"__name__\", \".*\") or label_replace(100 * (sum(increase(metis_builds_total{status=\"ok\"}[$__interval])) or on() vector(0)) / clamp_min((sum(increase(metis_builds_total[$__interval])) or on() vector(0)), 1), \"suite\", \"metis-build\", \"__name__\", \".*\") or label_replace(100 * (sum(increase(metis_flashes_total{status=\"ok\"}[$__interval])) or on() vector(0)) / clamp_min((sum(increase(metis_flashes_total[$__interval])) or on() vector(0)), 1), \"suite\", \"metis-flash\", \"__name__\", \".*\") or label_replace(100 * (sum(increase(ananke_quality_gate_runs_total{suite=\"ananke\",status=\"ok\"}[$__interval])) or on() vector(0)) / clamp_min((sum(increase(ananke_quality_gate_runs_total{suite=\"ananke\"}[$__interval])) or on() vector(0)), 1), \"suite\", \"ananke-quality\", \"__name__\", \".*\")",
"legendFormat": "{{suite}}" "legendFormat": "{{suite}}"
} }
], ],
@ -1855,7 +1855,7 @@ data:
"targetBlank": true "targetBlank": true
} }
], ],
"description": "Rolling pass rate per platform suite (Ariadne task automation + Metis build/flash pipelines). Each line tracks pass percentage over time for its suite." "description": "Rolling pass rate per platform suite (Ariadne task automation + Metis build/flash pipelines + Ananke quality gate). Each line tracks pass percentage over time for its suite."
}, },
{ {
"id": 47, "id": 47,
@ -1873,7 +1873,7 @@ data:
}, },
"targets": [ "targets": [
{ {
"expr": "(sum(increase(ariadne_task_runs_total{status!=\"ok\"}[24h])) or on() vector(0)) + (sum(increase(metis_builds_total{status=\"error\"}[24h])) or on() vector(0)) + (sum(increase(metis_flashes_total{status=\"error\"}[24h])) or on() vector(0))", "expr": "(sum(increase(ariadne_task_runs_total{status!=\"ok\"}[24h])) or on() vector(0)) + (sum(increase(metis_builds_total{status=\"error\"}[24h])) or on() vector(0)) + (sum(increase(metis_flashes_total{status=\"error\"}[24h])) or on() vector(0)) + (sum(increase(ananke_quality_gate_runs_total{suite=\"ananke\",status=\"failed\"}[24h])) or on() vector(0))",
"refId": "A", "refId": "A",
"instant": true "instant": true
} }