monitoring/recovery: harden ananke checks and OIDC-gated service validation
This commit is contained in:
parent
e0b124ca4e
commit
764bfe189e
@ -78,6 +78,7 @@ spec:
|
||||
- --upstream=http://longhorn-frontend.longhorn-system.svc.cluster.local
|
||||
- --http-address=0.0.0.0:4180
|
||||
- --skip-provider-button=true
|
||||
- --approval-prompt=auto
|
||||
- --skip-jwt-bearer-tokens=true
|
||||
- --oidc-groups-claim=groups
|
||||
- --cookie-domain=longhorn.bstein.dev
|
||||
|
||||
@ -127,6 +127,13 @@ STARTUP_INGRESS_ALLOWED_STATUSES="${STARTUP_INGRESS_ALLOWED_STATUSES:-200,301,30
|
||||
STARTUP_IGNORE_INGRESS_HOSTS_REGEX="${STARTUP_IGNORE_INGRESS_HOSTS_REGEX:-}"
|
||||
STARTUP_INGRESS_CHECK_TIMEOUT_SECONDS="${STARTUP_INGRESS_CHECK_TIMEOUT_SECONDS:-10}"
|
||||
SHUTDOWN_NAMESPACE_EXCLUDES_REGEX="${SHUTDOWN_NAMESPACE_EXCLUDES_REGEX:-^(kube-system|kube-public|kube-node-lease|flux-system|traefik|metallb-system|cert-manager|longhorn-system|vault|postgres|maintenance)$}"
|
||||
REQUIRE_NONEMPTY_REPLICA_SNAPSHOT="${REQUIRE_NONEMPTY_REPLICA_SNAPSHOT:-1}"
|
||||
STARTUP_REQUIRE_MAIL_SAFEGUARDS="${STARTUP_REQUIRE_MAIL_SAFEGUARDS:-1}"
|
||||
MAIL_STARTUP_NAMESPACE="${MAIL_STARTUP_NAMESPACE:-mailu-mailserver}"
|
||||
MAIL_STARTUP_ENDPOINT_SERVICES="${MAIL_STARTUP_ENDPOINT_SERVICES:-mailu-front,mailu-postfix,mailu-dovecot}"
|
||||
MAIL_STARTUP_HOST="${MAIL_STARTUP_HOST:-mail.bstein.dev}"
|
||||
MAIL_STARTUP_TCP_PORTS="${MAIL_STARTUP_TCP_PORTS:-25,465,587,993,995}"
|
||||
MAIL_STARTUP_TCP_TIMEOUT_SECONDS="${MAIL_STARTUP_TCP_TIMEOUT_SECONDS:-3}"
|
||||
BUNDLE_HTTP_PORT="${BUNDLE_HTTP_PORT:-8877}"
|
||||
STATE_ROOT="${HOME}/${STATE_SUBDIR:-.local/share/ananke}"
|
||||
RECOVERY_STATE_FILE="${STATE_ROOT}/cluster_power_recovery.state"
|
||||
@ -656,6 +663,46 @@ service_status_allowed() {
|
||||
return 1
|
||||
}
|
||||
|
||||
# Run one pass of the mail startup safeguards.
#
# Two checks are performed when the safeguards are enabled:
#   1. every service listed in MAIL_STARTUP_ENDPOINT_SERVICES must have at
#      least one address in its Endpoints object in MAIL_STARTUP_NAMESPACE;
#   2. every numeric port listed in MAIL_STARTUP_TCP_PORTS must accept a TCP
#      connection on MAIL_STARTUP_HOST (skipped when the host is empty).
#
# Globals (read):
#   STARTUP_REQUIRE_MAIL_SAFEGUARDS   "1" or "true" enables the checks.
#   MAIL_STARTUP_NAMESPACE            namespace holding the mail services.
#   MAIL_STARTUP_ENDPOINT_SERVICES    comma-separated service names.
#   MAIL_STARTUP_HOST                 host to probe over TCP.
#   MAIL_STARTUP_TCP_PORTS            comma-separated port numbers.
#   MAIL_STARTUP_TCP_TIMEOUT_SECONDS  per-port connect timeout.
# Arguments:
#   $1 - quiet flag; "1" suppresses the per-failure warnings (default 0).
# Returns:
#   0 when the safeguards are disabled or every check passed, 1 otherwise.
#
# NOTE(review): relies on as_array_from_csv and warn, which are defined
# elsewhere in the script; as_array_from_csv is assumed to fill the named
# array from a comma-separated string.
check_mail_safeguards_once() {
  local quiet="${1:-0}"
  local failures=0 namespace service host port ips
  local -a services=() ports=()

  if [[ "${STARTUP_REQUIRE_MAIL_SAFEGUARDS}" != "1" && "${STARTUP_REQUIRE_MAIL_SAFEGUARDS}" != "true" ]]; then
    return 0
  fi

  namespace="${MAIL_STARTUP_NAMESPACE}"
  as_array_from_csv "${MAIL_STARTUP_ENDPOINT_SERVICES}" services
  for service in "${services[@]}"; do
    service="${service//[[:space:]]/}"
    [[ -n "${service}" ]] || continue
    # Best effort: a kubectl error is treated the same as "no endpoints".
    ips="$(kubectl -n "${namespace}" get endpoints "${service}" -o jsonpath='{.subsets[*].addresses[*].ip}' 2>/dev/null || true)"
    if [[ -z "${ips//[[:space:]]/}" ]]; then
      if [[ "${quiet}" != "1" ]]; then
        warn "startup-check mail-endpoints ${namespace}/${service}: no ready endpoints."
      fi
      failures=1
    fi
  done

  host="${MAIL_STARTUP_HOST}"
  if [[ -n "${host}" ]]; then
    as_array_from_csv "${MAIL_STARTUP_TCP_PORTS}" ports
    for port in "${ports[@]}"; do
      port="${port//[[:space:]]/}"
      # Silently skip anything that is not a plain port number.
      [[ "${port}" =~ ^[0-9]+$ ]] || continue
      # The probe runs in a child bash so `timeout` can kill a hanging
      # connect. Host and port are passed as positional parameters instead of
      # being spliced into the command string, so the child never re-parses
      # them as shell code. A non-login shell is used so profile files are not
      # sourced on every probe.
      if ! timeout "${MAIL_STARTUP_TCP_TIMEOUT_SECONDS}" \
        bash -c ': < "/dev/tcp/$1/$2"' _ "${host}" "${port}" >/dev/null 2>&1; then
        if [[ "${quiet}" != "1" ]]; then
          warn "startup-check mail-tcp ${host}:${port}: connect failed."
        fi
        failures=1
      fi
    done
  fi

  (( failures == 0 ))
}
|
||||
|
||||
check_startup_service_checklist_once() {
|
||||
local rows row name url expected body_must body_must_not insecure timeout code rc
|
||||
local body_file failures
|
||||
@ -700,6 +747,9 @@ check_startup_service_checklist_once() {
|
||||
fi
|
||||
rm -f "${body_file}"
|
||||
done <<< "${rows}"
|
||||
if ! check_mail_safeguards_once; then
|
||||
failures=1
|
||||
fi
|
||||
(( failures == 0 ))
|
||||
}
|
||||
|
||||
@ -887,7 +937,15 @@ save_workload_replica_snapshot() {
|
||||
printf '%s\t%s\t%s\t%s\n' "${ns}" "${kind}" "${name}" "${replicas}" >> "${REPLICA_SNAPSHOT_FILE}"
|
||||
done <<< "${rows}"
|
||||
log "replica-snapshot-file=${REPLICA_SNAPSHOT_FILE}"
|
||||
log "replica-snapshot-count=$(wc -l < "${REPLICA_SNAPSHOT_FILE}" | tr -d ' ')"
|
||||
log "replica-snapshot-count=$(replica_snapshot_count)"
|
||||
}
|
||||
|
||||
# Print how many lines the replica snapshot file contains.
#
# Globals (read):
#   REPLICA_SNAPSHOT_FILE - path of the snapshot file.
# Outputs:
#   A bare non-negative integer on stdout with no padding or trailing
#   newline. "0" is printed when the path is unset, or the file is missing or
#   unreadable, so callers always receive a number.
# Returns:
#   0 always.
replica_snapshot_count() {
  local snapshot="${REPLICA_SNAPSHOT_FILE:-}"
  local count=""

  if [[ -n "${snapshot}" && -f "${snapshot}" && -r "${snapshot}" ]]; then
    # Some wc implementations pad the number; keep only the digits.
    count="$(wc -l < "${snapshot}" 2>/dev/null | tr -cd '0-9')"
  fi

  printf '%s' "${count:-0}"
}
|
||||
|
||||
restore_workload_replica_snapshot() {
|
||||
@ -915,6 +973,35 @@ restore_workload_replica_snapshot() {
|
||||
mark_checkpoint startup_replicas_restored
|
||||
}
|
||||
|
||||
# Scale Helm-managed Deployments and StatefulSets that sit at zero replicas
# back up to one replica.
#
# A workload is a candidate when its .spec.replicas is 0 and it carries the
# meta.helm.sh/release-name annotation. Candidates are skipped when
# startup_workload_namespace_excluded succeeds for their namespace, or when
# "<namespace>/<name>" matches STARTUP_IGNORE_WORKLOADS_REGEX.
#
# Globals (read):
#   STARTUP_IGNORE_WORKLOADS_REGEX - optional regex of "ns/name" to leave alone.
# Side effects:
#   Issues `kubectl scale --replicas=1` through the run wrapper for each
#   candidate, logs a summary, and records the
#   startup_zero_scaled_helm_restored checkpoint when at least one workload
#   was scaled.
#
# NOTE(review): run, warn, log, mark_checkpoint and
# startup_workload_namespace_excluded are defined elsewhere in the script.
# The target is always replicas=1, not the chart's intended count.
restore_zero_scaled_helm_workloads() {
  local rows ns kind name
  local restored=0
  local ignore_regex="${STARTUP_IGNORE_WORKLOADS_REGEX:-}"
  # Dots inside the annotation key are escaped so kubectl treats the key as a
  # single path segment.
  local columns='NS:.metadata.namespace,NAME:.metadata.name,REPLICAS:.spec.replicas,HELM:.metadata.annotations.meta\.helm\.sh/release-name'

  # Emit one "ns<TAB>kind<TAB>name" row per zero-replica Helm workload.
  rows="$(
    for kind in deployment statefulset; do
      kubectl get "${kind}" -A -o "custom-columns=${columns}" --no-headers 2>/dev/null \
        | awk -v kind="${kind}" '$3 ~ /^[0-9]+$/ && $3 == 0 && $4 != "<none>" {printf "%s\t%s\t%s\n", $1, kind, $2}'
    done | sed '/^[[:space:]]*$/d'
  )"

  while IFS=$'\t' read -r ns kind name; do
    # An empty result still yields one blank line from the here-string.
    [[ -n "${ns}" && -n "${kind}" && -n "${name}" ]] || continue
    startup_workload_namespace_excluded "${ns}" && continue
    if [[ -n "${ignore_regex}" ]] && [[ "${ns}/${name}" =~ ${ignore_regex} ]]; then
      continue
    fi
    warn "Auto-heal: restoring zero-scaled Helm workload ${ns}/${kind}/${name} to replicas=1."
    run kubectl -n "${ns}" scale "${kind}" "${name}" --replicas=1
    restored=$((restored + 1))
  done <<< "${rows}"

  if (( restored > 0 )); then
    log "Auto-heal: restored ${restored} zero-scaled Helm workloads."
    mark_checkpoint startup_zero_scaled_helm_restored
  else
    log "Auto-heal: no zero-scaled Helm workloads detected."
  fi
}
|
||||
|
||||
list_unhealthy_workloads() {
|
||||
local rows line ns name desired ready available
|
||||
rows="$(kubectl get deployment -A -o custom-columns=NS:.metadata.namespace,NAME:.metadata.name,DESIRED:.spec.replicas,READY:.status.readyReplicas,AVAILABLE:.status.availableReplicas --no-headers 2>/dev/null || true)"
|
||||
@ -1548,6 +1635,7 @@ resume_flux_and_reconcile() {
|
||||
|
||||
status_report() {
|
||||
local battery flux_ready flux_url flux_branch flux_url_drift flux_branch_drift harbor_code workers ingress_hosts_count
|
||||
local mail_safeguards_ok
|
||||
local effective_target effective_canary
|
||||
local labeled_nodes
|
||||
battery="$(read_ups_battery || true)"
|
||||
@ -1600,6 +1688,15 @@ status_report() {
|
||||
echo "flux_source_branch_drift=${flux_branch_drift}"
|
||||
echo "flux_source_ready=${flux_ready:-unknown}"
|
||||
echo "ingress_hosts_count=${ingress_hosts_count}"
|
||||
if check_mail_safeguards_once 1; then
|
||||
mail_safeguards_ok=true
|
||||
else
|
||||
mail_safeguards_ok=false
|
||||
fi
|
||||
echo "mail_startup_safeguards_required=${STARTUP_REQUIRE_MAIL_SAFEGUARDS}"
|
||||
echo "mail_startup_safeguards_ok=${mail_safeguards_ok}"
|
||||
echo "mail_startup_host=${MAIL_STARTUP_HOST}"
|
||||
echo "mail_startup_ports=${MAIL_STARTUP_TCP_PORTS}"
|
||||
echo "harbor_http=${harbor_code:-unknown}"
|
||||
kubectl get ingressclass traefik >/dev/null 2>&1 && echo "traefik_ingressclass=true" || echo "traefik_ingressclass=false"
|
||||
kubectl -n traefik get deploy traefik >/dev/null 2>&1 && echo "traefik_deploy=true" || echo "traefik_deploy=false"
|
||||
@ -1636,6 +1733,16 @@ planned_shutdown() {
|
||||
fi
|
||||
|
||||
save_workload_replica_snapshot
|
||||
if [[ "${REQUIRE_NONEMPTY_REPLICA_SNAPSHOT}" == "1" || "${REQUIRE_NONEMPTY_REPLICA_SNAPSHOT}" == "true" ]]; then
|
||||
local replica_count
|
||||
replica_count="$(replica_snapshot_count)"
|
||||
if [[ ! "${replica_count}" =~ ^[0-9]+$ ]]; then
|
||||
replica_count=0
|
||||
fi
|
||||
if (( replica_count == 0 )); then
|
||||
die "Replica snapshot is empty at ${REPLICA_SNAPSHOT_FILE}; refusing shutdown to avoid startup restore deadlock."
|
||||
fi
|
||||
fi
|
||||
mark_checkpoint shutdown_replicas_snapshot
|
||||
|
||||
patch_flux_suspend_all true
|
||||
@ -1772,6 +1879,7 @@ startup_flow() {
|
||||
resume_flux_and_reconcile
|
||||
wait_for_flux_kustomizations_ready
|
||||
restore_workload_replica_snapshot
|
||||
restore_zero_scaled_helm_workloads
|
||||
wait_for_startup_workloads_ready
|
||||
wait_for_startup_service_checklist
|
||||
wait_for_startup_stability_window
|
||||
|
||||
@ -422,12 +422,14 @@ ARIADNE_ACCESS_REQUESTS = "ariadne_access_requests_total"
|
||||
PLATFORM_TEST_SUCCESS_EVENTS_30D = (
|
||||
'(sum(increase(ariadne_task_runs_total{status="ok"}[30d])) or on() vector(0)) + '
|
||||
'(sum(increase(metis_builds_total{status="ok"}[30d])) or on() vector(0)) + '
|
||||
'(sum(increase(metis_flashes_total{status="ok"}[30d])) or on() vector(0))'
|
||||
'(sum(increase(metis_flashes_total{status="ok"}[30d])) or on() vector(0)) + '
|
||||
'(sum(increase(ananke_quality_gate_runs_total{suite="ananke",status="ok"}[30d])) or on() vector(0))'
|
||||
)
|
||||
PLATFORM_TEST_TOTAL_EVENTS_30D = (
|
||||
"(sum(increase(ariadne_task_runs_total[30d])) or on() vector(0)) + "
|
||||
"(sum(increase(metis_builds_total[30d])) or on() vector(0)) + "
|
||||
"(sum(increase(metis_flashes_total[30d])) or on() vector(0))"
|
||||
"(sum(increase(metis_flashes_total[30d])) or on() vector(0)) + "
|
||||
"(sum(increase(ananke_quality_gate_runs_total{suite=\"ananke\"}[30d])) or on() vector(0))"
|
||||
)
|
||||
TEST_SUCCESS_RATE = (
|
||||
f"100 * ({PLATFORM_TEST_SUCCESS_EVENTS_30D}) / clamp_min(({PLATFORM_TEST_TOTAL_EVENTS_30D}), 1)"
|
||||
@ -435,12 +437,14 @@ TEST_SUCCESS_RATE = (
|
||||
TEST_FAILURES_24H_TOTAL = (
|
||||
'(sum(increase(ariadne_task_runs_total{status!="ok"}[24h])) or on() vector(0)) + '
|
||||
'(sum(increase(metis_builds_total{status="error"}[24h])) or on() vector(0)) + '
|
||||
'(sum(increase(metis_flashes_total{status="error"}[24h])) or on() vector(0))'
|
||||
'(sum(increase(metis_flashes_total{status="error"}[24h])) or on() vector(0)) + '
|
||||
'(sum(increase(ananke_quality_gate_runs_total{suite="ananke",status="failed"}[24h])) or on() vector(0))'
|
||||
)
|
||||
PLATFORM_TEST_ACTIVITY_30D = (
|
||||
'label_replace(sum by (status) (increase(ariadne_task_runs_total[30d])), "source", "ariadne", "__name__", ".*") '
|
||||
'or label_replace(sum by (status) (increase(metis_builds_total[30d])), "source", "metis-build", "__name__", ".*") '
|
||||
'or label_replace(sum by (status) (increase(metis_flashes_total[30d])), "source", "metis-flash", "__name__", ".*")'
|
||||
'or label_replace(sum by (status) (increase(metis_flashes_total[30d])), "source", "metis-flash", "__name__", ".*") '
|
||||
'or label_replace(sum by (status) (increase(ananke_quality_gate_runs_total{suite="ananke"}[30d])), "source", "ananke-quality", "__name__", ".*")'
|
||||
)
|
||||
PLATFORM_TEST_SUCCESS_RATE_BY_SUITE_SERIES = (
|
||||
'label_replace(100 * (sum(increase(ariadne_task_runs_total{status="ok"}[$__interval])) or on() vector(0)) '
|
||||
@ -451,7 +455,10 @@ PLATFORM_TEST_SUCCESS_RATE_BY_SUITE_SERIES = (
|
||||
'"suite", "metis-build", "__name__", ".*") '
|
||||
'or label_replace(100 * (sum(increase(metis_flashes_total{status="ok"}[$__interval])) or on() vector(0)) '
|
||||
'/ clamp_min((sum(increase(metis_flashes_total[$__interval])) or on() vector(0)), 1), '
|
||||
'"suite", "metis-flash", "__name__", ".*")'
|
||||
'"suite", "metis-flash", "__name__", ".*") '
|
||||
'or label_replace(100 * (sum(increase(ananke_quality_gate_runs_total{suite="ananke",status="ok"}[$__interval])) or on() vector(0)) '
|
||||
'/ clamp_min((sum(increase(ananke_quality_gate_runs_total{suite="ananke"}[$__interval])) or on() vector(0)), 1), '
|
||||
'"suite", "ananke-quality", "__name__", ".*")'
|
||||
)
|
||||
ANANKE_SELECTOR = 'job="ananke-power"'
|
||||
ANANKE_UPS_DB_NAME = "Pyrphoros"
|
||||
@ -1501,7 +1508,7 @@ def build_overview():
|
||||
links=link_to("atlas-jobs"),
|
||||
)
|
||||
test_success["description"] = (
|
||||
"Rolling pass rate per platform suite (Ariadne task automation + Metis build/flash pipelines). "
|
||||
"Rolling pass rate per platform suite (Ariadne task automation + Metis build/flash pipelines + Ananke quality gate). "
|
||||
"Each line tracks pass percentage over time for its suite."
|
||||
)
|
||||
panels.append(test_success)
|
||||
|
||||
@ -82,6 +82,7 @@ spec:
|
||||
- --upstream=http://opensearch-dashboards.logging.svc.cluster.local:5601
|
||||
- --http-address=0.0.0.0:4180
|
||||
- --skip-provider-button=true
|
||||
- --approval-prompt=auto
|
||||
- --skip-jwt-bearer-tokens=true
|
||||
- --cookie-domain=logs.bstein.dev
|
||||
env:
|
||||
|
||||
@ -93,6 +93,7 @@ spec:
|
||||
- --upstream=http://metis.maintenance.svc.cluster.local
|
||||
- --http-address=0.0.0.0:4180
|
||||
- --skip-provider-button=true
|
||||
- --approval-prompt=auto
|
||||
- --skip-jwt-bearer-tokens=true
|
||||
- --oidc-groups-claim=groups
|
||||
- --cookie-domain=sentinel.bstein.dev
|
||||
|
||||
@ -1138,7 +1138,7 @@
|
||||
},
|
||||
"targets": [
|
||||
{
|
||||
"expr": "100 * ((sum(increase(ariadne_task_runs_total{status=\"ok\"}[30d])) or on() vector(0)) + (sum(increase(metis_builds_total{status=\"ok\"}[30d])) or on() vector(0)) + (sum(increase(metis_flashes_total{status=\"ok\"}[30d])) or on() vector(0))) / clamp_min(((sum(increase(ariadne_task_runs_total[30d])) or on() vector(0)) + (sum(increase(metis_builds_total[30d])) or on() vector(0)) + (sum(increase(metis_flashes_total[30d])) or on() vector(0))), 1)",
|
||||
"expr": "100 * ((sum(increase(ariadne_task_runs_total{status=\"ok\"}[30d])) or on() vector(0)) + (sum(increase(metis_builds_total{status=\"ok\"}[30d])) or on() vector(0)) + (sum(increase(metis_flashes_total{status=\"ok\"}[30d])) or on() vector(0)) + (sum(increase(ananke_quality_gate_runs_total{suite=\"ananke\",status=\"ok\"}[30d])) or on() vector(0))) / clamp_min(((sum(increase(ariadne_task_runs_total[30d])) or on() vector(0)) + (sum(increase(metis_builds_total[30d])) or on() vector(0)) + (sum(increase(metis_flashes_total[30d])) or on() vector(0)) + (sum(increase(ananke_quality_gate_runs_total{suite=\"ananke\"}[30d])) or on() vector(0))), 1)",
|
||||
"refId": "A",
|
||||
"instant": true
|
||||
}
|
||||
@ -1201,7 +1201,7 @@
|
||||
},
|
||||
"targets": [
|
||||
{
|
||||
"expr": "label_replace(sum by (status) (increase(ariadne_task_runs_total[30d])), \"source\", \"ariadne\", \"__name__\", \".*\") or label_replace(sum by (status) (increase(metis_builds_total[30d])), \"source\", \"metis-build\", \"__name__\", \".*\") or label_replace(sum by (status) (increase(metis_flashes_total[30d])), \"source\", \"metis-flash\", \"__name__\", \".*\")",
|
||||
"expr": "label_replace(sum by (status) (increase(ariadne_task_runs_total[30d])), \"source\", \"ariadne\", \"__name__\", \".*\") or label_replace(sum by (status) (increase(metis_builds_total[30d])), \"source\", \"metis-build\", \"__name__\", \".*\") or label_replace(sum by (status) (increase(metis_flashes_total[30d])), \"source\", \"metis-flash\", \"__name__\", \".*\") or label_replace(sum by (status) (increase(ananke_quality_gate_runs_total{suite=\"ananke\"}[30d])), \"source\", \"ananke-quality\", \"__name__\", \".*\")",
|
||||
"refId": "A",
|
||||
"instant": true
|
||||
}
|
||||
@ -1253,7 +1253,7 @@
|
||||
"targets": [
|
||||
{
|
||||
"refId": "A",
|
||||
"expr": "label_replace(100 * (sum(increase(ariadne_task_runs_total{status=\"ok\"}[$__interval])) or on() vector(0)) / clamp_min((sum(increase(ariadne_task_runs_total[$__interval])) or on() vector(0)), 1), \"suite\", \"ariadne\", \"__name__\", \".*\") or label_replace(100 * (sum(increase(metis_builds_total{status=\"ok\"}[$__interval])) or on() vector(0)) / clamp_min((sum(increase(metis_builds_total[$__interval])) or on() vector(0)), 1), \"suite\", \"metis-build\", \"__name__\", \".*\") or label_replace(100 * (sum(increase(metis_flashes_total{status=\"ok\"}[$__interval])) or on() vector(0)) / clamp_min((sum(increase(metis_flashes_total[$__interval])) or on() vector(0)), 1), \"suite\", \"metis-flash\", \"__name__\", \".*\")",
|
||||
"expr": "label_replace(100 * (sum(increase(ariadne_task_runs_total{status=\"ok\"}[$__interval])) or on() vector(0)) / clamp_min((sum(increase(ariadne_task_runs_total[$__interval])) or on() vector(0)), 1), \"suite\", \"ariadne\", \"__name__\", \".*\") or label_replace(100 * (sum(increase(metis_builds_total{status=\"ok\"}[$__interval])) or on() vector(0)) / clamp_min((sum(increase(metis_builds_total[$__interval])) or on() vector(0)), 1), \"suite\", \"metis-build\", \"__name__\", \".*\") or label_replace(100 * (sum(increase(metis_flashes_total{status=\"ok\"}[$__interval])) or on() vector(0)) / clamp_min((sum(increase(metis_flashes_total[$__interval])) or on() vector(0)), 1), \"suite\", \"metis-flash\", \"__name__\", \".*\") or label_replace(100 * (sum(increase(ananke_quality_gate_runs_total{suite=\"ananke\",status=\"ok\"}[$__interval])) or on() vector(0)) / clamp_min((sum(increase(ananke_quality_gate_runs_total{suite=\"ananke\"}[$__interval])) or on() vector(0)), 1), \"suite\", \"ananke-quality\", \"__name__\", \".*\")",
|
||||
"legendFormat": "{{suite}}"
|
||||
}
|
||||
],
|
||||
|
||||
@ -1820,7 +1820,7 @@
|
||||
"targets": [
|
||||
{
|
||||
"refId": "A",
|
||||
"expr": "label_replace(100 * (sum(increase(ariadne_task_runs_total{status=\"ok\"}[$__interval])) or on() vector(0)) / clamp_min((sum(increase(ariadne_task_runs_total[$__interval])) or on() vector(0)), 1), \"suite\", \"ariadne\", \"__name__\", \".*\") or label_replace(100 * (sum(increase(metis_builds_total{status=\"ok\"}[$__interval])) or on() vector(0)) / clamp_min((sum(increase(metis_builds_total[$__interval])) or on() vector(0)), 1), \"suite\", \"metis-build\", \"__name__\", \".*\") or label_replace(100 * (sum(increase(metis_flashes_total{status=\"ok\"}[$__interval])) or on() vector(0)) / clamp_min((sum(increase(metis_flashes_total[$__interval])) or on() vector(0)), 1), \"suite\", \"metis-flash\", \"__name__\", \".*\")",
|
||||
"expr": "label_replace(100 * (sum(increase(ariadne_task_runs_total{status=\"ok\"}[$__interval])) or on() vector(0)) / clamp_min((sum(increase(ariadne_task_runs_total[$__interval])) or on() vector(0)), 1), \"suite\", \"ariadne\", \"__name__\", \".*\") or label_replace(100 * (sum(increase(metis_builds_total{status=\"ok\"}[$__interval])) or on() vector(0)) / clamp_min((sum(increase(metis_builds_total[$__interval])) or on() vector(0)), 1), \"suite\", \"metis-build\", \"__name__\", \".*\") or label_replace(100 * (sum(increase(metis_flashes_total{status=\"ok\"}[$__interval])) or on() vector(0)) / clamp_min((sum(increase(metis_flashes_total[$__interval])) or on() vector(0)), 1), \"suite\", \"metis-flash\", \"__name__\", \".*\") or label_replace(100 * (sum(increase(ananke_quality_gate_runs_total{suite=\"ananke\",status=\"ok\"}[$__interval])) or on() vector(0)) / clamp_min((sum(increase(ananke_quality_gate_runs_total{suite=\"ananke\"}[$__interval])) or on() vector(0)), 1), \"suite\", \"ananke-quality\", \"__name__\", \".*\")",
|
||||
"legendFormat": "{{suite}}"
|
||||
}
|
||||
],
|
||||
@ -1846,7 +1846,7 @@
|
||||
"targetBlank": true
|
||||
}
|
||||
],
|
||||
"description": "Rolling pass rate per platform suite (Ariadne task automation + Metis build/flash pipelines). Each line tracks pass percentage over time for its suite."
|
||||
"description": "Rolling pass rate per platform suite (Ariadne task automation + Metis build/flash pipelines + Ananke quality gate). Each line tracks pass percentage over time for its suite."
|
||||
},
|
||||
{
|
||||
"id": 47,
|
||||
@ -1864,7 +1864,7 @@
|
||||
},
|
||||
"targets": [
|
||||
{
|
||||
"expr": "(sum(increase(ariadne_task_runs_total{status!=\"ok\"}[24h])) or on() vector(0)) + (sum(increase(metis_builds_total{status=\"error\"}[24h])) or on() vector(0)) + (sum(increase(metis_flashes_total{status=\"error\"}[24h])) or on() vector(0))",
|
||||
"expr": "(sum(increase(ariadne_task_runs_total{status!=\"ok\"}[24h])) or on() vector(0)) + (sum(increase(metis_builds_total{status=\"error\"}[24h])) or on() vector(0)) + (sum(increase(metis_flashes_total{status=\"error\"}[24h])) or on() vector(0)) + (sum(increase(ananke_quality_gate_runs_total{suite=\"ananke\",status=\"failed\"}[24h])) or on() vector(0))",
|
||||
"refId": "A",
|
||||
"instant": true
|
||||
}
|
||||
|
||||
@ -1147,7 +1147,7 @@ data:
|
||||
},
|
||||
"targets": [
|
||||
{
|
||||
"expr": "100 * ((sum(increase(ariadne_task_runs_total{status=\"ok\"}[30d])) or on() vector(0)) + (sum(increase(metis_builds_total{status=\"ok\"}[30d])) or on() vector(0)) + (sum(increase(metis_flashes_total{status=\"ok\"}[30d])) or on() vector(0))) / clamp_min(((sum(increase(ariadne_task_runs_total[30d])) or on() vector(0)) + (sum(increase(metis_builds_total[30d])) or on() vector(0)) + (sum(increase(metis_flashes_total[30d])) or on() vector(0))), 1)",
|
||||
"expr": "100 * ((sum(increase(ariadne_task_runs_total{status=\"ok\"}[30d])) or on() vector(0)) + (sum(increase(metis_builds_total{status=\"ok\"}[30d])) or on() vector(0)) + (sum(increase(metis_flashes_total{status=\"ok\"}[30d])) or on() vector(0)) + (sum(increase(ananke_quality_gate_runs_total{suite=\"ananke\",status=\"ok\"}[30d])) or on() vector(0))) / clamp_min(((sum(increase(ariadne_task_runs_total[30d])) or on() vector(0)) + (sum(increase(metis_builds_total[30d])) or on() vector(0)) + (sum(increase(metis_flashes_total[30d])) or on() vector(0)) + (sum(increase(ananke_quality_gate_runs_total{suite=\"ananke\"}[30d])) or on() vector(0))), 1)",
|
||||
"refId": "A",
|
||||
"instant": true
|
||||
}
|
||||
@ -1210,7 +1210,7 @@ data:
|
||||
},
|
||||
"targets": [
|
||||
{
|
||||
"expr": "label_replace(sum by (status) (increase(ariadne_task_runs_total[30d])), \"source\", \"ariadne\", \"__name__\", \".*\") or label_replace(sum by (status) (increase(metis_builds_total[30d])), \"source\", \"metis-build\", \"__name__\", \".*\") or label_replace(sum by (status) (increase(metis_flashes_total[30d])), \"source\", \"metis-flash\", \"__name__\", \".*\")",
|
||||
"expr": "label_replace(sum by (status) (increase(ariadne_task_runs_total[30d])), \"source\", \"ariadne\", \"__name__\", \".*\") or label_replace(sum by (status) (increase(metis_builds_total[30d])), \"source\", \"metis-build\", \"__name__\", \".*\") or label_replace(sum by (status) (increase(metis_flashes_total[30d])), \"source\", \"metis-flash\", \"__name__\", \".*\") or label_replace(sum by (status) (increase(ananke_quality_gate_runs_total{suite=\"ananke\"}[30d])), \"source\", \"ananke-quality\", \"__name__\", \".*\")",
|
||||
"refId": "A",
|
||||
"instant": true
|
||||
}
|
||||
@ -1262,7 +1262,7 @@ data:
|
||||
"targets": [
|
||||
{
|
||||
"refId": "A",
|
||||
"expr": "label_replace(100 * (sum(increase(ariadne_task_runs_total{status=\"ok\"}[$__interval])) or on() vector(0)) / clamp_min((sum(increase(ariadne_task_runs_total[$__interval])) or on() vector(0)), 1), \"suite\", \"ariadne\", \"__name__\", \".*\") or label_replace(100 * (sum(increase(metis_builds_total{status=\"ok\"}[$__interval])) or on() vector(0)) / clamp_min((sum(increase(metis_builds_total[$__interval])) or on() vector(0)), 1), \"suite\", \"metis-build\", \"__name__\", \".*\") or label_replace(100 * (sum(increase(metis_flashes_total{status=\"ok\"}[$__interval])) or on() vector(0)) / clamp_min((sum(increase(metis_flashes_total[$__interval])) or on() vector(0)), 1), \"suite\", \"metis-flash\", \"__name__\", \".*\")",
|
||||
"expr": "label_replace(100 * (sum(increase(ariadne_task_runs_total{status=\"ok\"}[$__interval])) or on() vector(0)) / clamp_min((sum(increase(ariadne_task_runs_total[$__interval])) or on() vector(0)), 1), \"suite\", \"ariadne\", \"__name__\", \".*\") or label_replace(100 * (sum(increase(metis_builds_total{status=\"ok\"}[$__interval])) or on() vector(0)) / clamp_min((sum(increase(metis_builds_total[$__interval])) or on() vector(0)), 1), \"suite\", \"metis-build\", \"__name__\", \".*\") or label_replace(100 * (sum(increase(metis_flashes_total{status=\"ok\"}[$__interval])) or on() vector(0)) / clamp_min((sum(increase(metis_flashes_total[$__interval])) or on() vector(0)), 1), \"suite\", \"metis-flash\", \"__name__\", \".*\") or label_replace(100 * (sum(increase(ananke_quality_gate_runs_total{suite=\"ananke\",status=\"ok\"}[$__interval])) or on() vector(0)) / clamp_min((sum(increase(ananke_quality_gate_runs_total{suite=\"ananke\"}[$__interval])) or on() vector(0)), 1), \"suite\", \"ananke-quality\", \"__name__\", \".*\")",
|
||||
"legendFormat": "{{suite}}"
|
||||
}
|
||||
],
|
||||
|
||||
@ -1829,7 +1829,7 @@ data:
|
||||
"targets": [
|
||||
{
|
||||
"refId": "A",
|
||||
"expr": "label_replace(100 * (sum(increase(ariadne_task_runs_total{status=\"ok\"}[$__interval])) or on() vector(0)) / clamp_min((sum(increase(ariadne_task_runs_total[$__interval])) or on() vector(0)), 1), \"suite\", \"ariadne\", \"__name__\", \".*\") or label_replace(100 * (sum(increase(metis_builds_total{status=\"ok\"}[$__interval])) or on() vector(0)) / clamp_min((sum(increase(metis_builds_total[$__interval])) or on() vector(0)), 1), \"suite\", \"metis-build\", \"__name__\", \".*\") or label_replace(100 * (sum(increase(metis_flashes_total{status=\"ok\"}[$__interval])) or on() vector(0)) / clamp_min((sum(increase(metis_flashes_total[$__interval])) or on() vector(0)), 1), \"suite\", \"metis-flash\", \"__name__\", \".*\")",
|
||||
"expr": "label_replace(100 * (sum(increase(ariadne_task_runs_total{status=\"ok\"}[$__interval])) or on() vector(0)) / clamp_min((sum(increase(ariadne_task_runs_total[$__interval])) or on() vector(0)), 1), \"suite\", \"ariadne\", \"__name__\", \".*\") or label_replace(100 * (sum(increase(metis_builds_total{status=\"ok\"}[$__interval])) or on() vector(0)) / clamp_min((sum(increase(metis_builds_total[$__interval])) or on() vector(0)), 1), \"suite\", \"metis-build\", \"__name__\", \".*\") or label_replace(100 * (sum(increase(metis_flashes_total{status=\"ok\"}[$__interval])) or on() vector(0)) / clamp_min((sum(increase(metis_flashes_total[$__interval])) or on() vector(0)), 1), \"suite\", \"metis-flash\", \"__name__\", \".*\") or label_replace(100 * (sum(increase(ananke_quality_gate_runs_total{suite=\"ananke\",status=\"ok\"}[$__interval])) or on() vector(0)) / clamp_min((sum(increase(ananke_quality_gate_runs_total{suite=\"ananke\"}[$__interval])) or on() vector(0)), 1), \"suite\", \"ananke-quality\", \"__name__\", \".*\")",
|
||||
"legendFormat": "{{suite}}"
|
||||
}
|
||||
],
|
||||
@ -1855,7 +1855,7 @@ data:
|
||||
"targetBlank": true
|
||||
}
|
||||
],
|
||||
"description": "Rolling pass rate per platform suite (Ariadne task automation + Metis build/flash pipelines). Each line tracks pass percentage over time for its suite."
|
||||
"description": "Rolling pass rate per platform suite (Ariadne task automation + Metis build/flash pipelines + Ananke quality gate). Each line tracks pass percentage over time for its suite."
|
||||
},
|
||||
{
|
||||
"id": 47,
|
||||
@ -1873,7 +1873,7 @@ data:
|
||||
},
|
||||
"targets": [
|
||||
{
|
||||
"expr": "(sum(increase(ariadne_task_runs_total{status!=\"ok\"}[24h])) or on() vector(0)) + (sum(increase(metis_builds_total{status=\"error\"}[24h])) or on() vector(0)) + (sum(increase(metis_flashes_total{status=\"error\"}[24h])) or on() vector(0))",
|
||||
"expr": "(sum(increase(ariadne_task_runs_total{status!=\"ok\"}[24h])) or on() vector(0)) + (sum(increase(metis_builds_total{status=\"error\"}[24h])) or on() vector(0)) + (sum(increase(metis_flashes_total{status=\"error\"}[24h])) or on() vector(0)) + (sum(increase(ananke_quality_gate_runs_total{suite=\"ananke\",status=\"failed\"}[24h])) or on() vector(0))",
|
||||
"refId": "A",
|
||||
"instant": true
|
||||
}
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user