From 764bfe189ea564464fe4a92e5abc02d82b756e0a Mon Sep 17 00:00:00 2001 From: Brad Stein Date: Thu, 9 Apr 2026 01:41:02 -0300 Subject: [PATCH] monitoring/recovery: harden ananke checks and OIDC-gated service validation --- .../ui-ingress/oauth2-proxy-longhorn.yaml | 1 + scripts/cluster_power_recovery.sh | 110 +++++++++++++++++- scripts/dashboards_render_atlas.py | 19 ++- services/logging/oauth2-proxy.yaml | 1 + services/maintenance/oauth2-proxy-metis.yaml | 1 + .../monitoring/dashboards/atlas-jobs.json | 6 +- .../monitoring/dashboards/atlas-overview.json | 6 +- .../monitoring/grafana-dashboard-jobs.yaml | 6 +- .../grafana-dashboard-overview.yaml | 6 +- 9 files changed, 137 insertions(+), 19 deletions(-) diff --git a/infrastructure/longhorn/ui-ingress/oauth2-proxy-longhorn.yaml b/infrastructure/longhorn/ui-ingress/oauth2-proxy-longhorn.yaml index a730e314..310f0d17 100644 --- a/infrastructure/longhorn/ui-ingress/oauth2-proxy-longhorn.yaml +++ b/infrastructure/longhorn/ui-ingress/oauth2-proxy-longhorn.yaml @@ -78,6 +78,7 @@ spec: - --upstream=http://longhorn-frontend.longhorn-system.svc.cluster.local - --http-address=0.0.0.0:4180 - --skip-provider-button=true + - --approval-prompt=auto - --skip-jwt-bearer-tokens=true - --oidc-groups-claim=groups - --cookie-domain=longhorn.bstein.dev diff --git a/scripts/cluster_power_recovery.sh b/scripts/cluster_power_recovery.sh index 0081a0e7..6a7a9002 100755 --- a/scripts/cluster_power_recovery.sh +++ b/scripts/cluster_power_recovery.sh @@ -127,6 +127,13 @@ STARTUP_INGRESS_ALLOWED_STATUSES="${STARTUP_INGRESS_ALLOWED_STATUSES:-200,301,30 STARTUP_IGNORE_INGRESS_HOSTS_REGEX="${STARTUP_IGNORE_INGRESS_HOSTS_REGEX:-}" STARTUP_INGRESS_CHECK_TIMEOUT_SECONDS="${STARTUP_INGRESS_CHECK_TIMEOUT_SECONDS:-10}" SHUTDOWN_NAMESPACE_EXCLUDES_REGEX="${SHUTDOWN_NAMESPACE_EXCLUDES_REGEX:-^(kube-system|kube-public|kube-node-lease|flux-system|traefik|metallb-system|cert-manager|longhorn-system|vault|postgres|maintenance)$}" 
+REQUIRE_NONEMPTY_REPLICA_SNAPSHOT="${REQUIRE_NONEMPTY_REPLICA_SNAPSHOT:-1}" +STARTUP_REQUIRE_MAIL_SAFEGUARDS="${STARTUP_REQUIRE_MAIL_SAFEGUARDS:-1}" +MAIL_STARTUP_NAMESPACE="${MAIL_STARTUP_NAMESPACE:-mailu-mailserver}" +MAIL_STARTUP_ENDPOINT_SERVICES="${MAIL_STARTUP_ENDPOINT_SERVICES:-mailu-front,mailu-postfix,mailu-dovecot}" +MAIL_STARTUP_HOST="${MAIL_STARTUP_HOST:-mail.bstein.dev}" +MAIL_STARTUP_TCP_PORTS="${MAIL_STARTUP_TCP_PORTS:-25,465,587,993,995}" +MAIL_STARTUP_TCP_TIMEOUT_SECONDS="${MAIL_STARTUP_TCP_TIMEOUT_SECONDS:-3}" BUNDLE_HTTP_PORT="${BUNDLE_HTTP_PORT:-8877}" STATE_ROOT="${HOME}/${STATE_SUBDIR:-.local/share/ananke}" RECOVERY_STATE_FILE="${STATE_ROOT}/cluster_power_recovery.state" @@ -656,6 +663,46 @@ service_status_allowed() { return 1 } +check_mail_safeguards_once() { + local quiet="${1:-0}" + local failures=0 namespace service host port ips + local -a services=() ports=() + if [[ "${STARTUP_REQUIRE_MAIL_SAFEGUARDS}" != "1" && "${STARTUP_REQUIRE_MAIL_SAFEGUARDS}" != "true" ]]; then + return 0 + fi + + namespace="${MAIL_STARTUP_NAMESPACE}" + as_array_from_csv "${MAIL_STARTUP_ENDPOINT_SERVICES}" services + for service in "${services[@]}"; do + service="${service//[[:space:]]/}" + [[ -n "${service}" ]] || continue + ips="$(kubectl -n "${namespace}" get endpoints "${service}" -o jsonpath='{.subsets[*].addresses[*].ip}' 2>/dev/null || true)" + if [[ -z "${ips//[[:space:]]/}" ]]; then + if [[ "${quiet}" != "1" ]]; then + warn "startup-check mail-endpoints ${namespace}/${service}: no ready endpoints." + fi + failures=1 + fi + done + + host="${MAIL_STARTUP_HOST}" + if [[ -n "${host}" ]]; then + as_array_from_csv "${MAIL_STARTUP_TCP_PORTS}" ports + for port in "${ports[@]}"; do + port="${port//[[:space:]]/}" + [[ "${port}" =~ ^[0-9]+$ ]] || continue + if ! timeout "${MAIL_STARTUP_TCP_TIMEOUT_SECONDS}" bash -lc "</dev/tcp/${host}/${port}" >/dev/null 2>&1; then + if [[ "${quiet}" != "1" ]]; then + warn "startup-check mail-tcp ${host}:${port}: connect failed." 
+ fi + failures=1 + fi + done + fi + + (( failures == 0 )) +} + check_startup_service_checklist_once() { local rows row name url expected body_must body_must_not insecure timeout code rc local body_file failures @@ -700,6 +747,9 @@ check_startup_service_checklist_once() { fi rm -f "${body_file}" done <<< "${rows}" + if ! check_mail_safeguards_once; then + failures=1 + fi (( failures == 0 )) } @@ -887,7 +937,15 @@ save_workload_replica_snapshot() { printf '%s\t%s\t%s\t%s\n' "${ns}" "${kind}" "${name}" "${replicas}" >> "${REPLICA_SNAPSHOT_FILE}" done <<< "${rows}" log "replica-snapshot-file=${REPLICA_SNAPSHOT_FILE}" - log "replica-snapshot-count=$(wc -l < "${REPLICA_SNAPSHOT_FILE}" | tr -d ' ')" + log "replica-snapshot-count=$(replica_snapshot_count)" +} + +replica_snapshot_count() { + if [[ -f "${REPLICA_SNAPSHOT_FILE}" ]]; then + wc -l < "${REPLICA_SNAPSHOT_FILE}" | tr -d ' ' + else + printf '0' + fi } restore_workload_replica_snapshot() { @@ -915,6 +973,35 @@ restore_workload_replica_snapshot() { mark_checkpoint startup_replicas_restored } +restore_zero_scaled_helm_workloads() { + local rows ns kind name + local restored=0 + rows="$( + { + kubectl get deployment -A -o custom-columns=NS:.metadata.namespace,NAME:.metadata.name,REPLICAS:.spec.replicas,HELM:.metadata.annotations.meta\\.helm\\.sh/release-name --no-headers 2>/dev/null \ + | awk '$3 ~ /^[0-9]+$/ && $3 == 0 && $4 != "" {printf "%s\tdeployment\t%s\n", $1, $2}' + kubectl get statefulset -A -o custom-columns=NS:.metadata.namespace,NAME:.metadata.name,REPLICAS:.spec.replicas,HELM:.metadata.annotations.meta\\.helm\\.sh/release-name --no-headers 2>/dev/null \ + | awk '$3 ~ /^[0-9]+$/ && $3 == 0 && $4 != "" {printf "%s\tstatefulset\t%s\n", $1, $2}' + } | sed '/^[[:space:]]*$/d' + )" + while IFS=$'\t' read -r ns kind name; do + [[ -n "${ns}" && -n "${kind}" && -n "${name}" ]] || continue + startup_workload_namespace_excluded "${ns}" && continue + if [[ -n "${STARTUP_IGNORE_WORKLOADS_REGEX}" ]] && [[ 
"${ns}/${name}" =~ ${STARTUP_IGNORE_WORKLOADS_REGEX} ]]; then + continue + fi + warn "Auto-heal: restoring zero-scaled Helm workload ${ns}/${kind}/${name} to replicas=1." + run kubectl -n "${ns}" scale "${kind}" "${name}" --replicas=1 + restored=$((restored + 1)) + done <<< "${rows}" + if (( restored > 0 )); then + log "Auto-heal: restored ${restored} zero-scaled Helm workloads." + mark_checkpoint startup_zero_scaled_helm_restored + else + log "Auto-heal: no zero-scaled Helm workloads detected." + fi +} + list_unhealthy_workloads() { local rows line ns name desired ready available rows="$(kubectl get deployment -A -o custom-columns=NS:.metadata.namespace,NAME:.metadata.name,DESIRED:.spec.replicas,READY:.status.readyReplicas,AVAILABLE:.status.availableReplicas --no-headers 2>/dev/null || true)" @@ -1548,6 +1635,7 @@ resume_flux_and_reconcile() { status_report() { local battery flux_ready flux_url flux_branch flux_url_drift flux_branch_drift harbor_code workers ingress_hosts_count + local mail_safeguards_ok local effective_target effective_canary local labeled_nodes battery="$(read_ups_battery || true)" @@ -1600,6 +1688,15 @@ status_report() { echo "flux_source_branch_drift=${flux_branch_drift}" echo "flux_source_ready=${flux_ready:-unknown}" echo "ingress_hosts_count=${ingress_hosts_count}" + if check_mail_safeguards_once 1; then + mail_safeguards_ok=true + else + mail_safeguards_ok=false + fi + echo "mail_startup_safeguards_required=${STARTUP_REQUIRE_MAIL_SAFEGUARDS}" + echo "mail_startup_safeguards_ok=${mail_safeguards_ok}" + echo "mail_startup_host=${MAIL_STARTUP_HOST}" + echo "mail_startup_ports=${MAIL_STARTUP_TCP_PORTS}" echo "harbor_http=${harbor_code:-unknown}" kubectl get ingressclass traefik >/dev/null 2>&1 && echo "traefik_ingressclass=true" || echo "traefik_ingressclass=false" kubectl -n traefik get deploy traefik >/dev/null 2>&1 && echo "traefik_deploy=true" || echo "traefik_deploy=false" @@ -1636,6 +1733,16 @@ planned_shutdown() { fi 
save_workload_replica_snapshot + if [[ "${REQUIRE_NONEMPTY_REPLICA_SNAPSHOT}" == "1" || "${REQUIRE_NONEMPTY_REPLICA_SNAPSHOT}" == "true" ]]; then + local replica_count + replica_count="$(replica_snapshot_count)" + if [[ ! "${replica_count}" =~ ^[0-9]+$ ]]; then + replica_count=0 + fi + if (( replica_count == 0 )); then + die "Replica snapshot is empty at ${REPLICA_SNAPSHOT_FILE}; refusing shutdown to avoid startup restore deadlock." + fi + fi mark_checkpoint shutdown_replicas_snapshot patch_flux_suspend_all true @@ -1772,6 +1879,7 @@ startup_flow() { resume_flux_and_reconcile wait_for_flux_kustomizations_ready restore_workload_replica_snapshot + restore_zero_scaled_helm_workloads wait_for_startup_workloads_ready wait_for_startup_service_checklist wait_for_startup_stability_window diff --git a/scripts/dashboards_render_atlas.py b/scripts/dashboards_render_atlas.py index dd02d950..938825b4 100644 --- a/scripts/dashboards_render_atlas.py +++ b/scripts/dashboards_render_atlas.py @@ -422,12 +422,14 @@ ARIADNE_ACCESS_REQUESTS = "ariadne_access_requests_total" PLATFORM_TEST_SUCCESS_EVENTS_30D = ( '(sum(increase(ariadne_task_runs_total{status="ok"}[30d])) or on() vector(0)) + ' '(sum(increase(metis_builds_total{status="ok"}[30d])) or on() vector(0)) + ' - '(sum(increase(metis_flashes_total{status="ok"}[30d])) or on() vector(0))' + '(sum(increase(metis_flashes_total{status="ok"}[30d])) or on() vector(0)) + ' + '(sum(increase(ananke_quality_gate_runs_total{suite="ananke",status="ok"}[30d])) or on() vector(0))' ) PLATFORM_TEST_TOTAL_EVENTS_30D = ( "(sum(increase(ariadne_task_runs_total[30d])) or on() vector(0)) + " "(sum(increase(metis_builds_total[30d])) or on() vector(0)) + " - "(sum(increase(metis_flashes_total[30d])) or on() vector(0))" + "(sum(increase(metis_flashes_total[30d])) or on() vector(0)) + " + "(sum(increase(ananke_quality_gate_runs_total{suite=\"ananke\"}[30d])) or on() vector(0))" ) TEST_SUCCESS_RATE = ( f"100 * ({PLATFORM_TEST_SUCCESS_EVENTS_30D}) / 
clamp_min(({PLATFORM_TEST_TOTAL_EVENTS_30D}), 1)" @@ -435,12 +437,14 @@ TEST_SUCCESS_RATE = ( TEST_FAILURES_24H_TOTAL = ( '(sum(increase(ariadne_task_runs_total{status!="ok"}[24h])) or on() vector(0)) + ' '(sum(increase(metis_builds_total{status="error"}[24h])) or on() vector(0)) + ' - '(sum(increase(metis_flashes_total{status="error"}[24h])) or on() vector(0))' + '(sum(increase(metis_flashes_total{status="error"}[24h])) or on() vector(0)) + ' + '(sum(increase(ananke_quality_gate_runs_total{suite="ananke",status="failed"}[24h])) or on() vector(0))' ) PLATFORM_TEST_ACTIVITY_30D = ( 'label_replace(sum by (status) (increase(ariadne_task_runs_total[30d])), "source", "ariadne", "__name__", ".*") ' 'or label_replace(sum by (status) (increase(metis_builds_total[30d])), "source", "metis-build", "__name__", ".*") ' - 'or label_replace(sum by (status) (increase(metis_flashes_total[30d])), "source", "metis-flash", "__name__", ".*")' + 'or label_replace(sum by (status) (increase(metis_flashes_total[30d])), "source", "metis-flash", "__name__", ".*") ' + 'or label_replace(sum by (status) (increase(ananke_quality_gate_runs_total{suite="ananke"}[30d])), "source", "ananke-quality", "__name__", ".*")' ) PLATFORM_TEST_SUCCESS_RATE_BY_SUITE_SERIES = ( 'label_replace(100 * (sum(increase(ariadne_task_runs_total{status="ok"}[$__interval])) or on() vector(0)) ' @@ -451,7 +455,10 @@ PLATFORM_TEST_SUCCESS_RATE_BY_SUITE_SERIES = ( '"suite", "metis-build", "__name__", ".*") ' 'or label_replace(100 * (sum(increase(metis_flashes_total{status="ok"}[$__interval])) or on() vector(0)) ' '/ clamp_min((sum(increase(metis_flashes_total[$__interval])) or on() vector(0)), 1), ' - '"suite", "metis-flash", "__name__", ".*")' + '"suite", "metis-flash", "__name__", ".*") ' + 'or label_replace(100 * (sum(increase(ananke_quality_gate_runs_total{suite="ananke",status="ok"}[$__interval])) or on() vector(0)) ' + '/ clamp_min((sum(increase(ananke_quality_gate_runs_total{suite="ananke"}[$__interval])) or on() 
vector(0)), 1), ' + '"suite", "ananke-quality", "__name__", ".*")' ) ANANKE_SELECTOR = 'job="ananke-power"' ANANKE_UPS_DB_NAME = "Pyrphoros" @@ -1501,7 +1508,7 @@ def build_overview(): links=link_to("atlas-jobs"), ) test_success["description"] = ( - "Rolling pass rate per platform suite (Ariadne task automation + Metis build/flash pipelines). " + "Rolling pass rate per platform suite (Ariadne task automation + Metis build/flash pipelines + Ananke quality gate). " "Each line tracks pass percentage over time for its suite." ) panels.append(test_success) diff --git a/services/logging/oauth2-proxy.yaml b/services/logging/oauth2-proxy.yaml index 104351a2..d40c3040 100644 --- a/services/logging/oauth2-proxy.yaml +++ b/services/logging/oauth2-proxy.yaml @@ -82,6 +82,7 @@ spec: - --upstream=http://opensearch-dashboards.logging.svc.cluster.local:5601 - --http-address=0.0.0.0:4180 - --skip-provider-button=true + - --approval-prompt=auto - --skip-jwt-bearer-tokens=true - --cookie-domain=logs.bstein.dev env: diff --git a/services/maintenance/oauth2-proxy-metis.yaml b/services/maintenance/oauth2-proxy-metis.yaml index 34207b05..ce0c841b 100644 --- a/services/maintenance/oauth2-proxy-metis.yaml +++ b/services/maintenance/oauth2-proxy-metis.yaml @@ -93,6 +93,7 @@ spec: - --upstream=http://metis.maintenance.svc.cluster.local - --http-address=0.0.0.0:4180 - --skip-provider-button=true + - --approval-prompt=auto - --skip-jwt-bearer-tokens=true - --oidc-groups-claim=groups - --cookie-domain=sentinel.bstein.dev diff --git a/services/monitoring/dashboards/atlas-jobs.json b/services/monitoring/dashboards/atlas-jobs.json index 37bf2f4f..45ef7a68 100644 --- a/services/monitoring/dashboards/atlas-jobs.json +++ b/services/monitoring/dashboards/atlas-jobs.json @@ -1138,7 +1138,7 @@ }, "targets": [ { - "expr": "100 * ((sum(increase(ariadne_task_runs_total{status=\"ok\"}[30d])) or on() vector(0)) + (sum(increase(metis_builds_total{status=\"ok\"}[30d])) or on() vector(0)) + 
(sum(increase(metis_flashes_total{status=\"ok\"}[30d])) or on() vector(0))) / clamp_min(((sum(increase(ariadne_task_runs_total[30d])) or on() vector(0)) + (sum(increase(metis_builds_total[30d])) or on() vector(0)) + (sum(increase(metis_flashes_total[30d])) or on() vector(0))), 1)", + "expr": "100 * ((sum(increase(ariadne_task_runs_total{status=\"ok\"}[30d])) or on() vector(0)) + (sum(increase(metis_builds_total{status=\"ok\"}[30d])) or on() vector(0)) + (sum(increase(metis_flashes_total{status=\"ok\"}[30d])) or on() vector(0)) + (sum(increase(ananke_quality_gate_runs_total{suite=\"ananke\",status=\"ok\"}[30d])) or on() vector(0))) / clamp_min(((sum(increase(ariadne_task_runs_total[30d])) or on() vector(0)) + (sum(increase(metis_builds_total[30d])) or on() vector(0)) + (sum(increase(metis_flashes_total[30d])) or on() vector(0)) + (sum(increase(ananke_quality_gate_runs_total{suite=\"ananke\"}[30d])) or on() vector(0))), 1)", "refId": "A", "instant": true } @@ -1201,7 +1201,7 @@ }, "targets": [ { - "expr": "label_replace(sum by (status) (increase(ariadne_task_runs_total[30d])), \"source\", \"ariadne\", \"__name__\", \".*\") or label_replace(sum by (status) (increase(metis_builds_total[30d])), \"source\", \"metis-build\", \"__name__\", \".*\") or label_replace(sum by (status) (increase(metis_flashes_total[30d])), \"source\", \"metis-flash\", \"__name__\", \".*\")", + "expr": "label_replace(sum by (status) (increase(ariadne_task_runs_total[30d])), \"source\", \"ariadne\", \"__name__\", \".*\") or label_replace(sum by (status) (increase(metis_builds_total[30d])), \"source\", \"metis-build\", \"__name__\", \".*\") or label_replace(sum by (status) (increase(metis_flashes_total[30d])), \"source\", \"metis-flash\", \"__name__\", \".*\") or label_replace(sum by (status) (increase(ananke_quality_gate_runs_total{suite=\"ananke\"}[30d])), \"source\", \"ananke-quality\", \"__name__\", \".*\")", "refId": "A", "instant": true } @@ -1253,7 +1253,7 @@ "targets": [ { "refId": "A", - 
"expr": "label_replace(100 * (sum(increase(ariadne_task_runs_total{status=\"ok\"}[$__interval])) or on() vector(0)) / clamp_min((sum(increase(ariadne_task_runs_total[$__interval])) or on() vector(0)), 1), \"suite\", \"ariadne\", \"__name__\", \".*\") or label_replace(100 * (sum(increase(metis_builds_total{status=\"ok\"}[$__interval])) or on() vector(0)) / clamp_min((sum(increase(metis_builds_total[$__interval])) or on() vector(0)), 1), \"suite\", \"metis-build\", \"__name__\", \".*\") or label_replace(100 * (sum(increase(metis_flashes_total{status=\"ok\"}[$__interval])) or on() vector(0)) / clamp_min((sum(increase(metis_flashes_total[$__interval])) or on() vector(0)), 1), \"suite\", \"metis-flash\", \"__name__\", \".*\")", + "expr": "label_replace(100 * (sum(increase(ariadne_task_runs_total{status=\"ok\"}[$__interval])) or on() vector(0)) / clamp_min((sum(increase(ariadne_task_runs_total[$__interval])) or on() vector(0)), 1), \"suite\", \"ariadne\", \"__name__\", \".*\") or label_replace(100 * (sum(increase(metis_builds_total{status=\"ok\"}[$__interval])) or on() vector(0)) / clamp_min((sum(increase(metis_builds_total[$__interval])) or on() vector(0)), 1), \"suite\", \"metis-build\", \"__name__\", \".*\") or label_replace(100 * (sum(increase(metis_flashes_total{status=\"ok\"}[$__interval])) or on() vector(0)) / clamp_min((sum(increase(metis_flashes_total[$__interval])) or on() vector(0)), 1), \"suite\", \"metis-flash\", \"__name__\", \".*\") or label_replace(100 * (sum(increase(ananke_quality_gate_runs_total{suite=\"ananke\",status=\"ok\"}[$__interval])) or on() vector(0)) / clamp_min((sum(increase(ananke_quality_gate_runs_total{suite=\"ananke\"}[$__interval])) or on() vector(0)), 1), \"suite\", \"ananke-quality\", \"__name__\", \".*\")", "legendFormat": "{{suite}}" } ], diff --git a/services/monitoring/dashboards/atlas-overview.json b/services/monitoring/dashboards/atlas-overview.json index de85c5d6..ec4be052 100644 --- 
a/services/monitoring/dashboards/atlas-overview.json +++ b/services/monitoring/dashboards/atlas-overview.json @@ -1820,7 +1820,7 @@ "targets": [ { "refId": "A", - "expr": "label_replace(100 * (sum(increase(ariadne_task_runs_total{status=\"ok\"}[$__interval])) or on() vector(0)) / clamp_min((sum(increase(ariadne_task_runs_total[$__interval])) or on() vector(0)), 1), \"suite\", \"ariadne\", \"__name__\", \".*\") or label_replace(100 * (sum(increase(metis_builds_total{status=\"ok\"}[$__interval])) or on() vector(0)) / clamp_min((sum(increase(metis_builds_total[$__interval])) or on() vector(0)), 1), \"suite\", \"metis-build\", \"__name__\", \".*\") or label_replace(100 * (sum(increase(metis_flashes_total{status=\"ok\"}[$__interval])) or on() vector(0)) / clamp_min((sum(increase(metis_flashes_total[$__interval])) or on() vector(0)), 1), \"suite\", \"metis-flash\", \"__name__\", \".*\")", + "expr": "label_replace(100 * (sum(increase(ariadne_task_runs_total{status=\"ok\"}[$__interval])) or on() vector(0)) / clamp_min((sum(increase(ariadne_task_runs_total[$__interval])) or on() vector(0)), 1), \"suite\", \"ariadne\", \"__name__\", \".*\") or label_replace(100 * (sum(increase(metis_builds_total{status=\"ok\"}[$__interval])) or on() vector(0)) / clamp_min((sum(increase(metis_builds_total[$__interval])) or on() vector(0)), 1), \"suite\", \"metis-build\", \"__name__\", \".*\") or label_replace(100 * (sum(increase(metis_flashes_total{status=\"ok\"}[$__interval])) or on() vector(0)) / clamp_min((sum(increase(metis_flashes_total[$__interval])) or on() vector(0)), 1), \"suite\", \"metis-flash\", \"__name__\", \".*\") or label_replace(100 * (sum(increase(ananke_quality_gate_runs_total{suite=\"ananke\",status=\"ok\"}[$__interval])) or on() vector(0)) / clamp_min((sum(increase(ananke_quality_gate_runs_total{suite=\"ananke\"}[$__interval])) or on() vector(0)), 1), \"suite\", \"ananke-quality\", \"__name__\", \".*\")", "legendFormat": "{{suite}}" } ], @@ -1846,7 +1846,7 @@ 
"targetBlank": true } ], - "description": "Rolling pass rate per platform suite (Ariadne task automation + Metis build/flash pipelines). Each line tracks pass percentage over time for its suite." + "description": "Rolling pass rate per platform suite (Ariadne task automation + Metis build/flash pipelines + Ananke quality gate). Each line tracks pass percentage over time for its suite." }, { "id": 47, @@ -1864,7 +1864,7 @@ }, "targets": [ { - "expr": "(sum(increase(ariadne_task_runs_total{status!=\"ok\"}[24h])) or on() vector(0)) + (sum(increase(metis_builds_total{status=\"error\"}[24h])) or on() vector(0)) + (sum(increase(metis_flashes_total{status=\"error\"}[24h])) or on() vector(0))", + "expr": "(sum(increase(ariadne_task_runs_total{status!=\"ok\"}[24h])) or on() vector(0)) + (sum(increase(metis_builds_total{status=\"error\"}[24h])) or on() vector(0)) + (sum(increase(metis_flashes_total{status=\"error\"}[24h])) or on() vector(0)) + (sum(increase(ananke_quality_gate_runs_total{suite=\"ananke\",status=\"failed\"}[24h])) or on() vector(0))", "refId": "A", "instant": true } diff --git a/services/monitoring/grafana-dashboard-jobs.yaml b/services/monitoring/grafana-dashboard-jobs.yaml index 60f261f2..4ebf068f 100644 --- a/services/monitoring/grafana-dashboard-jobs.yaml +++ b/services/monitoring/grafana-dashboard-jobs.yaml @@ -1147,7 +1147,7 @@ data: }, "targets": [ { - "expr": "100 * ((sum(increase(ariadne_task_runs_total{status=\"ok\"}[30d])) or on() vector(0)) + (sum(increase(metis_builds_total{status=\"ok\"}[30d])) or on() vector(0)) + (sum(increase(metis_flashes_total{status=\"ok\"}[30d])) or on() vector(0))) / clamp_min(((sum(increase(ariadne_task_runs_total[30d])) or on() vector(0)) + (sum(increase(metis_builds_total[30d])) or on() vector(0)) + (sum(increase(metis_flashes_total[30d])) or on() vector(0))), 1)", + "expr": "100 * ((sum(increase(ariadne_task_runs_total{status=\"ok\"}[30d])) or on() vector(0)) + (sum(increase(metis_builds_total{status=\"ok\"}[30d])) 
or on() vector(0)) + (sum(increase(metis_flashes_total{status=\"ok\"}[30d])) or on() vector(0)) + (sum(increase(ananke_quality_gate_runs_total{suite=\"ananke\",status=\"ok\"}[30d])) or on() vector(0))) / clamp_min(((sum(increase(ariadne_task_runs_total[30d])) or on() vector(0)) + (sum(increase(metis_builds_total[30d])) or on() vector(0)) + (sum(increase(metis_flashes_total[30d])) or on() vector(0)) + (sum(increase(ananke_quality_gate_runs_total{suite=\"ananke\"}[30d])) or on() vector(0))), 1)", "refId": "A", "instant": true } @@ -1210,7 +1210,7 @@ data: }, "targets": [ { - "expr": "label_replace(sum by (status) (increase(ariadne_task_runs_total[30d])), \"source\", \"ariadne\", \"__name__\", \".*\") or label_replace(sum by (status) (increase(metis_builds_total[30d])), \"source\", \"metis-build\", \"__name__\", \".*\") or label_replace(sum by (status) (increase(metis_flashes_total[30d])), \"source\", \"metis-flash\", \"__name__\", \".*\")", + "expr": "label_replace(sum by (status) (increase(ariadne_task_runs_total[30d])), \"source\", \"ariadne\", \"__name__\", \".*\") or label_replace(sum by (status) (increase(metis_builds_total[30d])), \"source\", \"metis-build\", \"__name__\", \".*\") or label_replace(sum by (status) (increase(metis_flashes_total[30d])), \"source\", \"metis-flash\", \"__name__\", \".*\") or label_replace(sum by (status) (increase(ananke_quality_gate_runs_total{suite=\"ananke\"}[30d])), \"source\", \"ananke-quality\", \"__name__\", \".*\")", "refId": "A", "instant": true } @@ -1262,7 +1262,7 @@ data: "targets": [ { "refId": "A", - "expr": "label_replace(100 * (sum(increase(ariadne_task_runs_total{status=\"ok\"}[$__interval])) or on() vector(0)) / clamp_min((sum(increase(ariadne_task_runs_total[$__interval])) or on() vector(0)), 1), \"suite\", \"ariadne\", \"__name__\", \".*\") or label_replace(100 * (sum(increase(metis_builds_total{status=\"ok\"}[$__interval])) or on() vector(0)) / clamp_min((sum(increase(metis_builds_total[$__interval])) or on() 
vector(0)), 1), \"suite\", \"metis-build\", \"__name__\", \".*\") or label_replace(100 * (sum(increase(metis_flashes_total{status=\"ok\"}[$__interval])) or on() vector(0)) / clamp_min((sum(increase(metis_flashes_total[$__interval])) or on() vector(0)), 1), \"suite\", \"metis-flash\", \"__name__\", \".*\")", + "expr": "label_replace(100 * (sum(increase(ariadne_task_runs_total{status=\"ok\"}[$__interval])) or on() vector(0)) / clamp_min((sum(increase(ariadne_task_runs_total[$__interval])) or on() vector(0)), 1), \"suite\", \"ariadne\", \"__name__\", \".*\") or label_replace(100 * (sum(increase(metis_builds_total{status=\"ok\"}[$__interval])) or on() vector(0)) / clamp_min((sum(increase(metis_builds_total[$__interval])) or on() vector(0)), 1), \"suite\", \"metis-build\", \"__name__\", \".*\") or label_replace(100 * (sum(increase(metis_flashes_total{status=\"ok\"}[$__interval])) or on() vector(0)) / clamp_min((sum(increase(metis_flashes_total[$__interval])) or on() vector(0)), 1), \"suite\", \"metis-flash\", \"__name__\", \".*\") or label_replace(100 * (sum(increase(ananke_quality_gate_runs_total{suite=\"ananke\",status=\"ok\"}[$__interval])) or on() vector(0)) / clamp_min((sum(increase(ananke_quality_gate_runs_total{suite=\"ananke\"}[$__interval])) or on() vector(0)), 1), \"suite\", \"ananke-quality\", \"__name__\", \".*\")", "legendFormat": "{{suite}}" } ], diff --git a/services/monitoring/grafana-dashboard-overview.yaml b/services/monitoring/grafana-dashboard-overview.yaml index 945c684c..ececba18 100644 --- a/services/monitoring/grafana-dashboard-overview.yaml +++ b/services/monitoring/grafana-dashboard-overview.yaml @@ -1829,7 +1829,7 @@ data: "targets": [ { "refId": "A", - "expr": "label_replace(100 * (sum(increase(ariadne_task_runs_total{status=\"ok\"}[$__interval])) or on() vector(0)) / clamp_min((sum(increase(ariadne_task_runs_total[$__interval])) or on() vector(0)), 1), \"suite\", \"ariadne\", \"__name__\", \".*\") or label_replace(100 * 
(sum(increase(metis_builds_total{status=\"ok\"}[$__interval])) or on() vector(0)) / clamp_min((sum(increase(metis_builds_total[$__interval])) or on() vector(0)), 1), \"suite\", \"metis-build\", \"__name__\", \".*\") or label_replace(100 * (sum(increase(metis_flashes_total{status=\"ok\"}[$__interval])) or on() vector(0)) / clamp_min((sum(increase(metis_flashes_total[$__interval])) or on() vector(0)), 1), \"suite\", \"metis-flash\", \"__name__\", \".*\")", + "expr": "label_replace(100 * (sum(increase(ariadne_task_runs_total{status=\"ok\"}[$__interval])) or on() vector(0)) / clamp_min((sum(increase(ariadne_task_runs_total[$__interval])) or on() vector(0)), 1), \"suite\", \"ariadne\", \"__name__\", \".*\") or label_replace(100 * (sum(increase(metis_builds_total{status=\"ok\"}[$__interval])) or on() vector(0)) / clamp_min((sum(increase(metis_builds_total[$__interval])) or on() vector(0)), 1), \"suite\", \"metis-build\", \"__name__\", \".*\") or label_replace(100 * (sum(increase(metis_flashes_total{status=\"ok\"}[$__interval])) or on() vector(0)) / clamp_min((sum(increase(metis_flashes_total[$__interval])) or on() vector(0)), 1), \"suite\", \"metis-flash\", \"__name__\", \".*\") or label_replace(100 * (sum(increase(ananke_quality_gate_runs_total{suite=\"ananke\",status=\"ok\"}[$__interval])) or on() vector(0)) / clamp_min((sum(increase(ananke_quality_gate_runs_total{suite=\"ananke\"}[$__interval])) or on() vector(0)), 1), \"suite\", \"ananke-quality\", \"__name__\", \".*\")", "legendFormat": "{{suite}}" } ], @@ -1855,7 +1855,7 @@ data: "targetBlank": true } ], - "description": "Rolling pass rate per platform suite (Ariadne task automation + Metis build/flash pipelines). Each line tracks pass percentage over time for its suite." + "description": "Rolling pass rate per platform suite (Ariadne task automation + Metis build/flash pipelines + Ananke quality gate). Each line tracks pass percentage over time for its suite." 
}, { "id": 47, @@ -1873,7 +1873,7 @@ data: }, "targets": [ { - "expr": "(sum(increase(ariadne_task_runs_total{status!=\"ok\"}[24h])) or on() vector(0)) + (sum(increase(metis_builds_total{status=\"error\"}[24h])) or on() vector(0)) + (sum(increase(metis_flashes_total{status=\"error\"}[24h])) or on() vector(0))", + "expr": "(sum(increase(ariadne_task_runs_total{status!=\"ok\"}[24h])) or on() vector(0)) + (sum(increase(metis_builds_total{status=\"error\"}[24h])) or on() vector(0)) + (sum(increase(metis_flashes_total{status=\"error\"}[24h])) or on() vector(0)) + (sum(increase(ananke_quality_gate_runs_total{suite=\"ananke\",status=\"failed\"}[24h])) or on() vector(0))", "refId": "A", "instant": true }