monitoring/recovery: harden ananke checks and OIDC-gated service validation
This commit is contained in:
parent
e0b124ca4e
commit
764bfe189e
@ -78,6 +78,7 @@ spec:
|
|||||||
- --upstream=http://longhorn-frontend.longhorn-system.svc.cluster.local
|
- --upstream=http://longhorn-frontend.longhorn-system.svc.cluster.local
|
||||||
- --http-address=0.0.0.0:4180
|
- --http-address=0.0.0.0:4180
|
||||||
- --skip-provider-button=true
|
- --skip-provider-button=true
|
||||||
|
- --approval-prompt=auto
|
||||||
- --skip-jwt-bearer-tokens=true
|
- --skip-jwt-bearer-tokens=true
|
||||||
- --oidc-groups-claim=groups
|
- --oidc-groups-claim=groups
|
||||||
- --cookie-domain=longhorn.bstein.dev
|
- --cookie-domain=longhorn.bstein.dev
|
||||||
|
|||||||
@ -127,6 +127,13 @@ STARTUP_INGRESS_ALLOWED_STATUSES="${STARTUP_INGRESS_ALLOWED_STATUSES:-200,301,30
|
|||||||
STARTUP_IGNORE_INGRESS_HOSTS_REGEX="${STARTUP_IGNORE_INGRESS_HOSTS_REGEX:-}"
|
STARTUP_IGNORE_INGRESS_HOSTS_REGEX="${STARTUP_IGNORE_INGRESS_HOSTS_REGEX:-}"
|
||||||
STARTUP_INGRESS_CHECK_TIMEOUT_SECONDS="${STARTUP_INGRESS_CHECK_TIMEOUT_SECONDS:-10}"
|
STARTUP_INGRESS_CHECK_TIMEOUT_SECONDS="${STARTUP_INGRESS_CHECK_TIMEOUT_SECONDS:-10}"
|
||||||
SHUTDOWN_NAMESPACE_EXCLUDES_REGEX="${SHUTDOWN_NAMESPACE_EXCLUDES_REGEX:-^(kube-system|kube-public|kube-node-lease|flux-system|traefik|metallb-system|cert-manager|longhorn-system|vault|postgres|maintenance)$}"
|
SHUTDOWN_NAMESPACE_EXCLUDES_REGEX="${SHUTDOWN_NAMESPACE_EXCLUDES_REGEX:-^(kube-system|kube-public|kube-node-lease|flux-system|traefik|metallb-system|cert-manager|longhorn-system|vault|postgres|maintenance)$}"
|
||||||
|
REQUIRE_NONEMPTY_REPLICA_SNAPSHOT="${REQUIRE_NONEMPTY_REPLICA_SNAPSHOT:-1}"
|
||||||
|
STARTUP_REQUIRE_MAIL_SAFEGUARDS="${STARTUP_REQUIRE_MAIL_SAFEGUARDS:-1}"
|
||||||
|
MAIL_STARTUP_NAMESPACE="${MAIL_STARTUP_NAMESPACE:-mailu-mailserver}"
|
||||||
|
MAIL_STARTUP_ENDPOINT_SERVICES="${MAIL_STARTUP_ENDPOINT_SERVICES:-mailu-front,mailu-postfix,mailu-dovecot}"
|
||||||
|
MAIL_STARTUP_HOST="${MAIL_STARTUP_HOST:-mail.bstein.dev}"
|
||||||
|
MAIL_STARTUP_TCP_PORTS="${MAIL_STARTUP_TCP_PORTS:-25,465,587,993,995}"
|
||||||
|
MAIL_STARTUP_TCP_TIMEOUT_SECONDS="${MAIL_STARTUP_TCP_TIMEOUT_SECONDS:-3}"
|
||||||
BUNDLE_HTTP_PORT="${BUNDLE_HTTP_PORT:-8877}"
|
BUNDLE_HTTP_PORT="${BUNDLE_HTTP_PORT:-8877}"
|
||||||
STATE_ROOT="${HOME}/${STATE_SUBDIR:-.local/share/ananke}"
|
STATE_ROOT="${HOME}/${STATE_SUBDIR:-.local/share/ananke}"
|
||||||
RECOVERY_STATE_FILE="${STATE_ROOT}/cluster_power_recovery.state"
|
RECOVERY_STATE_FILE="${STATE_ROOT}/cluster_power_recovery.state"
|
||||||
@ -656,6 +663,46 @@ service_status_allowed() {
|
|||||||
return 1
|
return 1
|
||||||
}
|
}
|
||||||
|
|
||||||
|
#######################################
# Run one pass of the mail startup safeguards.
#
# Two checks are made:
#   1. Every service named in MAIL_STARTUP_ENDPOINT_SERVICES must report at
#      least one address under .subsets[*].addresses of its Endpoints object.
#   2. Every numeric port in MAIL_STARTUP_TCP_PORTS must accept a TCP
#      connection on MAIL_STARTUP_HOST (skipped when the host is empty).
#
# Globals (all read-only here):
#   STARTUP_REQUIRE_MAIL_SAFEGUARDS   checks run only when "1" or "true"
#   MAIL_STARTUP_NAMESPACE            namespace holding the mail services
#   MAIL_STARTUP_ENDPOINT_SERVICES    comma-separated service names
#   MAIL_STARTUP_HOST                 host probed over TCP
#   MAIL_STARTUP_TCP_PORTS            comma-separated port numbers
#   MAIL_STARTUP_TCP_TIMEOUT_SECONDS  per-port connect timeout
# Arguments:
#   $1 - "1" to suppress warnings (default "0")
# Returns:
#   0 when the safeguards are disabled or every check passed, 1 otherwise.
#######################################
check_mail_safeguards_once() {
  local quiet="${1:-0}"
  local failures=0 namespace service host port ips
  local -a services=() ports=()

  if [[ "${STARTUP_REQUIRE_MAIL_SAFEGUARDS}" != "1" && "${STARTUP_REQUIRE_MAIL_SAFEGUARDS}" != "true" ]]; then
    return 0
  fi

  namespace="${MAIL_STARTUP_NAMESPACE}"
  as_array_from_csv "${MAIL_STARTUP_ENDPOINT_SERVICES}" services
  for service in "${services[@]}"; do
    service="${service//[[:space:]]/}"
    [[ -n "${service}" ]] || continue
    # A kubectl failure is deliberately treated the same as "no addresses":
    # the empty result is reported as a failed check below.
    ips="$(kubectl -n "${namespace}" get endpoints "${service}" -o jsonpath='{.subsets[*].addresses[*].ip}' 2>/dev/null || true)"
    if [[ -z "${ips//[[:space:]]/}" ]]; then
      if [[ "${quiet}" != "1" ]]; then
        warn "startup-check mail-endpoints ${namespace}/${service}: no ready endpoints."
      fi
      failures=1
    fi
  done

  host="${MAIL_STARTUP_HOST}"
  if [[ -n "${host}" ]]; then
    as_array_from_csv "${MAIL_STARTUP_TCP_PORTS}" ports
    for port in "${ports[@]}"; do
      port="${port//[[:space:]]/}"
      # Entries that are not plain numbers are skipped, not counted as failures.
      [[ "${port}" =~ ^[0-9]+$ ]] || continue
      # Host and port travel as positional arguments instead of being spliced
      # into the command string, so nothing in MAIL_STARTUP_HOST can be parsed
      # as shell syntax. A plain (non-login) shell is sufficient for the
      # /dev/tcp redirection and avoids sourcing profile files on every probe.
      if ! timeout "${MAIL_STARTUP_TCP_TIMEOUT_SECONDS}" \
        bash -c ': <"/dev/tcp/$1/$2"' _ "${host}" "${port}" >/dev/null 2>&1; then
        if [[ "${quiet}" != "1" ]]; then
          warn "startup-check mail-tcp ${host}:${port}: connect failed."
        fi
        failures=1
      fi
    done
  fi

  (( failures == 0 ))
}
|
||||||
|
|
||||||
check_startup_service_checklist_once() {
|
check_startup_service_checklist_once() {
|
||||||
local rows row name url expected body_must body_must_not insecure timeout code rc
|
local rows row name url expected body_must body_must_not insecure timeout code rc
|
||||||
local body_file failures
|
local body_file failures
|
||||||
@ -700,6 +747,9 @@ check_startup_service_checklist_once() {
|
|||||||
fi
|
fi
|
||||||
rm -f "${body_file}"
|
rm -f "${body_file}"
|
||||||
done <<< "${rows}"
|
done <<< "${rows}"
|
||||||
|
if ! check_mail_safeguards_once; then
|
||||||
|
failures=1
|
||||||
|
fi
|
||||||
(( failures == 0 ))
|
(( failures == 0 ))
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -887,7 +937,15 @@ save_workload_replica_snapshot() {
|
|||||||
printf '%s\t%s\t%s\t%s\n' "${ns}" "${kind}" "${name}" "${replicas}" >> "${REPLICA_SNAPSHOT_FILE}"
|
printf '%s\t%s\t%s\t%s\n' "${ns}" "${kind}" "${name}" "${replicas}" >> "${REPLICA_SNAPSHOT_FILE}"
|
||||||
done <<< "${rows}"
|
done <<< "${rows}"
|
||||||
log "replica-snapshot-file=${REPLICA_SNAPSHOT_FILE}"
|
log "replica-snapshot-file=${REPLICA_SNAPSHOT_FILE}"
|
||||||
log "replica-snapshot-count=$(wc -l < "${REPLICA_SNAPSHOT_FILE}" | tr -d ' ')"
|
log "replica-snapshot-count=$(replica_snapshot_count)"
|
||||||
|
}
|
||||||
|
|
||||||
|
#######################################
# Print the number of rows recorded in the replica snapshot file.
#
# Only lines that contain at least one non-whitespace field are counted, so a
# file made of blank lines reports 0, and a final row that lacks a trailing
# newline is still counted.
#
# Globals:
#   REPLICA_SNAPSHOT_FILE (read) - path of the snapshot file
# Outputs:
#   The row count on stdout, without a trailing newline; "0" when the file
#   does not exist or cannot be read.
#######################################
replica_snapshot_count() {
  if [[ -f "${REPLICA_SNAPSHOT_FILE}" && -r "${REPLICA_SNAPSHOT_FILE}" ]]; then
    # NF is 0 for empty and whitespace-only lines; an unset counter prints as 0.
    awk 'NF { rows++ } END { printf "%d", rows }' "${REPLICA_SNAPSHOT_FILE}"
  else
    printf '0'
  fi
}
|
||||||
|
|
||||||
restore_workload_replica_snapshot() {
|
restore_workload_replica_snapshot() {
|
||||||
@ -915,6 +973,35 @@ restore_workload_replica_snapshot() {
|
|||||||
mark_checkpoint startup_replicas_restored
|
mark_checkpoint startup_replicas_restored
|
||||||
}
|
}
|
||||||
|
|
||||||
|
#######################################
# Scale Helm-annotated workloads that sit at zero replicas back up to one.
#
# Deployments and statefulsets in all namespaces are listed; a workload is a
# candidate when spec.replicas is 0 and its meta.helm.sh/release-name
# annotation column is not "<none>". Candidates in namespaces rejected by
# startup_workload_namespace_excluded, or whose "namespace/name" matches
# STARTUP_IGNORE_WORKLOADS_REGEX, are left untouched.
#
# Globals:
#   STARTUP_IGNORE_WORKLOADS_REGEX (read) - optional regex of workloads to skip
# Outputs:
#   One warning per restored workload and a summary log line.
# Side effects:
#   Marks the startup_zero_scaled_helm_restored checkpoint when at least one
#   workload was scaled.
#######################################
restore_zero_scaled_helm_workloads() {
  local candidates workload_ns workload_kind workload_name resource
  local healed=0
  local columns='NS:.metadata.namespace,NAME:.metadata.name,REPLICAS:.spec.replicas,HELM:.metadata.annotations.meta\.helm\.sh/release-name'

  # Emit one "namespace<TAB>kind<TAB>name" row per candidate, deployments
  # first, then statefulsets.
  candidates="$(
    for resource in deployment statefulset; do
      kubectl get "${resource}" -A -o "custom-columns=${columns}" --no-headers 2>/dev/null \
        | awk -v kind="${resource}" '$3 ~ /^[0-9]+$/ && $3 == 0 && $4 != "<none>" {printf "%s\t%s\t%s\n", $1, kind, $2}'
    done | sed '/^[[:space:]]*$/d'
  )"

  while IFS=$'\t' read -r workload_ns workload_kind workload_name; do
    # The here-string yields a single empty row when nothing was found.
    if [[ -z "${workload_ns}" || -z "${workload_kind}" || -z "${workload_name}" ]]; then
      continue
    fi
    if startup_workload_namespace_excluded "${workload_ns}"; then
      continue
    fi
    if [[ -n "${STARTUP_IGNORE_WORKLOADS_REGEX}" ]] \
      && [[ "${workload_ns}/${workload_name}" =~ ${STARTUP_IGNORE_WORKLOADS_REGEX} ]]; then
      continue
    fi
    warn "Auto-heal: restoring zero-scaled Helm workload ${workload_ns}/${workload_kind}/${workload_name} to replicas=1."
    run kubectl -n "${workload_ns}" scale "${workload_kind}" "${workload_name}" --replicas=1
    healed=$(( healed + 1 ))
  done <<< "${candidates}"

  if (( healed > 0 )); then
    log "Auto-heal: restored ${healed} zero-scaled Helm workloads."
    mark_checkpoint startup_zero_scaled_helm_restored
  else
    log "Auto-heal: no zero-scaled Helm workloads detected."
  fi
}
|
||||||
|
|
||||||
list_unhealthy_workloads() {
|
list_unhealthy_workloads() {
|
||||||
local rows line ns name desired ready available
|
local rows line ns name desired ready available
|
||||||
rows="$(kubectl get deployment -A -o custom-columns=NS:.metadata.namespace,NAME:.metadata.name,DESIRED:.spec.replicas,READY:.status.readyReplicas,AVAILABLE:.status.availableReplicas --no-headers 2>/dev/null || true)"
|
rows="$(kubectl get deployment -A -o custom-columns=NS:.metadata.namespace,NAME:.metadata.name,DESIRED:.spec.replicas,READY:.status.readyReplicas,AVAILABLE:.status.availableReplicas --no-headers 2>/dev/null || true)"
|
||||||
@ -1548,6 +1635,7 @@ resume_flux_and_reconcile() {
|
|||||||
|
|
||||||
status_report() {
|
status_report() {
|
||||||
local battery flux_ready flux_url flux_branch flux_url_drift flux_branch_drift harbor_code workers ingress_hosts_count
|
local battery flux_ready flux_url flux_branch flux_url_drift flux_branch_drift harbor_code workers ingress_hosts_count
|
||||||
|
local mail_safeguards_ok
|
||||||
local effective_target effective_canary
|
local effective_target effective_canary
|
||||||
local labeled_nodes
|
local labeled_nodes
|
||||||
battery="$(read_ups_battery || true)"
|
battery="$(read_ups_battery || true)"
|
||||||
@ -1600,6 +1688,15 @@ status_report() {
|
|||||||
echo "flux_source_branch_drift=${flux_branch_drift}"
|
echo "flux_source_branch_drift=${flux_branch_drift}"
|
||||||
echo "flux_source_ready=${flux_ready:-unknown}"
|
echo "flux_source_ready=${flux_ready:-unknown}"
|
||||||
echo "ingress_hosts_count=${ingress_hosts_count}"
|
echo "ingress_hosts_count=${ingress_hosts_count}"
|
||||||
|
if check_mail_safeguards_once 1; then
|
||||||
|
mail_safeguards_ok=true
|
||||||
|
else
|
||||||
|
mail_safeguards_ok=false
|
||||||
|
fi
|
||||||
|
echo "mail_startup_safeguards_required=${STARTUP_REQUIRE_MAIL_SAFEGUARDS}"
|
||||||
|
echo "mail_startup_safeguards_ok=${mail_safeguards_ok}"
|
||||||
|
echo "mail_startup_host=${MAIL_STARTUP_HOST}"
|
||||||
|
echo "mail_startup_ports=${MAIL_STARTUP_TCP_PORTS}"
|
||||||
echo "harbor_http=${harbor_code:-unknown}"
|
echo "harbor_http=${harbor_code:-unknown}"
|
||||||
kubectl get ingressclass traefik >/dev/null 2>&1 && echo "traefik_ingressclass=true" || echo "traefik_ingressclass=false"
|
kubectl get ingressclass traefik >/dev/null 2>&1 && echo "traefik_ingressclass=true" || echo "traefik_ingressclass=false"
|
||||||
kubectl -n traefik get deploy traefik >/dev/null 2>&1 && echo "traefik_deploy=true" || echo "traefik_deploy=false"
|
kubectl -n traefik get deploy traefik >/dev/null 2>&1 && echo "traefik_deploy=true" || echo "traefik_deploy=false"
|
||||||
@ -1636,6 +1733,16 @@ planned_shutdown() {
|
|||||||
fi
|
fi
|
||||||
|
|
||||||
save_workload_replica_snapshot
|
save_workload_replica_snapshot
|
||||||
|
if [[ "${REQUIRE_NONEMPTY_REPLICA_SNAPSHOT}" == "1" || "${REQUIRE_NONEMPTY_REPLICA_SNAPSHOT}" == "true" ]]; then
|
||||||
|
local replica_count
|
||||||
|
replica_count="$(replica_snapshot_count)"
|
||||||
|
if [[ ! "${replica_count}" =~ ^[0-9]+$ ]]; then
|
||||||
|
replica_count=0
|
||||||
|
fi
|
||||||
|
if (( replica_count == 0 )); then
|
||||||
|
die "Replica snapshot is empty at ${REPLICA_SNAPSHOT_FILE}; refusing shutdown to avoid startup restore deadlock."
|
||||||
|
fi
|
||||||
|
fi
|
||||||
mark_checkpoint shutdown_replicas_snapshot
|
mark_checkpoint shutdown_replicas_snapshot
|
||||||
|
|
||||||
patch_flux_suspend_all true
|
patch_flux_suspend_all true
|
||||||
@ -1772,6 +1879,7 @@ startup_flow() {
|
|||||||
resume_flux_and_reconcile
|
resume_flux_and_reconcile
|
||||||
wait_for_flux_kustomizations_ready
|
wait_for_flux_kustomizations_ready
|
||||||
restore_workload_replica_snapshot
|
restore_workload_replica_snapshot
|
||||||
|
restore_zero_scaled_helm_workloads
|
||||||
wait_for_startup_workloads_ready
|
wait_for_startup_workloads_ready
|
||||||
wait_for_startup_service_checklist
|
wait_for_startup_service_checklist
|
||||||
wait_for_startup_stability_window
|
wait_for_startup_stability_window
|
||||||
|
|||||||
@ -422,12 +422,14 @@ ARIADNE_ACCESS_REQUESTS = "ariadne_access_requests_total"
|
|||||||
PLATFORM_TEST_SUCCESS_EVENTS_30D = (
|
PLATFORM_TEST_SUCCESS_EVENTS_30D = (
|
||||||
'(sum(increase(ariadne_task_runs_total{status="ok"}[30d])) or on() vector(0)) + '
|
'(sum(increase(ariadne_task_runs_total{status="ok"}[30d])) or on() vector(0)) + '
|
||||||
'(sum(increase(metis_builds_total{status="ok"}[30d])) or on() vector(0)) + '
|
'(sum(increase(metis_builds_total{status="ok"}[30d])) or on() vector(0)) + '
|
||||||
'(sum(increase(metis_flashes_total{status="ok"}[30d])) or on() vector(0))'
|
'(sum(increase(metis_flashes_total{status="ok"}[30d])) or on() vector(0)) + '
|
||||||
|
'(sum(increase(ananke_quality_gate_runs_total{suite="ananke",status="ok"}[30d])) or on() vector(0))'
|
||||||
)
|
)
|
||||||
PLATFORM_TEST_TOTAL_EVENTS_30D = (
|
PLATFORM_TEST_TOTAL_EVENTS_30D = (
|
||||||
"(sum(increase(ariadne_task_runs_total[30d])) or on() vector(0)) + "
|
"(sum(increase(ariadne_task_runs_total[30d])) or on() vector(0)) + "
|
||||||
"(sum(increase(metis_builds_total[30d])) or on() vector(0)) + "
|
"(sum(increase(metis_builds_total[30d])) or on() vector(0)) + "
|
||||||
"(sum(increase(metis_flashes_total[30d])) or on() vector(0))"
|
"(sum(increase(metis_flashes_total[30d])) or on() vector(0)) + "
|
||||||
|
"(sum(increase(ananke_quality_gate_runs_total{suite=\"ananke\"}[30d])) or on() vector(0))"
|
||||||
)
|
)
|
||||||
TEST_SUCCESS_RATE = (
|
TEST_SUCCESS_RATE = (
|
||||||
f"100 * ({PLATFORM_TEST_SUCCESS_EVENTS_30D}) / clamp_min(({PLATFORM_TEST_TOTAL_EVENTS_30D}), 1)"
|
f"100 * ({PLATFORM_TEST_SUCCESS_EVENTS_30D}) / clamp_min(({PLATFORM_TEST_TOTAL_EVENTS_30D}), 1)"
|
||||||
@ -435,12 +437,14 @@ TEST_SUCCESS_RATE = (
|
|||||||
TEST_FAILURES_24H_TOTAL = (
|
TEST_FAILURES_24H_TOTAL = (
|
||||||
'(sum(increase(ariadne_task_runs_total{status!="ok"}[24h])) or on() vector(0)) + '
|
'(sum(increase(ariadne_task_runs_total{status!="ok"}[24h])) or on() vector(0)) + '
|
||||||
'(sum(increase(metis_builds_total{status="error"}[24h])) or on() vector(0)) + '
|
'(sum(increase(metis_builds_total{status="error"}[24h])) or on() vector(0)) + '
|
||||||
'(sum(increase(metis_flashes_total{status="error"}[24h])) or on() vector(0))'
|
'(sum(increase(metis_flashes_total{status="error"}[24h])) or on() vector(0)) + '
|
||||||
|
'(sum(increase(ananke_quality_gate_runs_total{suite="ananke",status="failed"}[24h])) or on() vector(0))'
|
||||||
)
|
)
|
||||||
PLATFORM_TEST_ACTIVITY_30D = (
|
PLATFORM_TEST_ACTIVITY_30D = (
|
||||||
'label_replace(sum by (status) (increase(ariadne_task_runs_total[30d])), "source", "ariadne", "__name__", ".*") '
|
'label_replace(sum by (status) (increase(ariadne_task_runs_total[30d])), "source", "ariadne", "__name__", ".*") '
|
||||||
'or label_replace(sum by (status) (increase(metis_builds_total[30d])), "source", "metis-build", "__name__", ".*") '
|
'or label_replace(sum by (status) (increase(metis_builds_total[30d])), "source", "metis-build", "__name__", ".*") '
|
||||||
'or label_replace(sum by (status) (increase(metis_flashes_total[30d])), "source", "metis-flash", "__name__", ".*")'
|
'or label_replace(sum by (status) (increase(metis_flashes_total[30d])), "source", "metis-flash", "__name__", ".*") '
|
||||||
|
'or label_replace(sum by (status) (increase(ananke_quality_gate_runs_total{suite="ananke"}[30d])), "source", "ananke-quality", "__name__", ".*")'
|
||||||
)
|
)
|
||||||
PLATFORM_TEST_SUCCESS_RATE_BY_SUITE_SERIES = (
|
PLATFORM_TEST_SUCCESS_RATE_BY_SUITE_SERIES = (
|
||||||
'label_replace(100 * (sum(increase(ariadne_task_runs_total{status="ok"}[$__interval])) or on() vector(0)) '
|
'label_replace(100 * (sum(increase(ariadne_task_runs_total{status="ok"}[$__interval])) or on() vector(0)) '
|
||||||
@ -451,7 +455,10 @@ PLATFORM_TEST_SUCCESS_RATE_BY_SUITE_SERIES = (
|
|||||||
'"suite", "metis-build", "__name__", ".*") '
|
'"suite", "metis-build", "__name__", ".*") '
|
||||||
'or label_replace(100 * (sum(increase(metis_flashes_total{status="ok"}[$__interval])) or on() vector(0)) '
|
'or label_replace(100 * (sum(increase(metis_flashes_total{status="ok"}[$__interval])) or on() vector(0)) '
|
||||||
'/ clamp_min((sum(increase(metis_flashes_total[$__interval])) or on() vector(0)), 1), '
|
'/ clamp_min((sum(increase(metis_flashes_total[$__interval])) or on() vector(0)), 1), '
|
||||||
'"suite", "metis-flash", "__name__", ".*")'
|
'"suite", "metis-flash", "__name__", ".*") '
|
||||||
|
'or label_replace(100 * (sum(increase(ananke_quality_gate_runs_total{suite="ananke",status="ok"}[$__interval])) or on() vector(0)) '
|
||||||
|
'/ clamp_min((sum(increase(ananke_quality_gate_runs_total{suite="ananke"}[$__interval])) or on() vector(0)), 1), '
|
||||||
|
'"suite", "ananke-quality", "__name__", ".*")'
|
||||||
)
|
)
|
||||||
ANANKE_SELECTOR = 'job="ananke-power"'
|
ANANKE_SELECTOR = 'job="ananke-power"'
|
||||||
ANANKE_UPS_DB_NAME = "Pyrphoros"
|
ANANKE_UPS_DB_NAME = "Pyrphoros"
|
||||||
@ -1501,7 +1508,7 @@ def build_overview():
|
|||||||
links=link_to("atlas-jobs"),
|
links=link_to("atlas-jobs"),
|
||||||
)
|
)
|
||||||
test_success["description"] = (
|
test_success["description"] = (
|
||||||
"Rolling pass rate per platform suite (Ariadne task automation + Metis build/flash pipelines). "
|
"Rolling pass rate per platform suite (Ariadne task automation + Metis build/flash pipelines + Ananke quality gate). "
|
||||||
"Each line tracks pass percentage over time for its suite."
|
"Each line tracks pass percentage over time for its suite."
|
||||||
)
|
)
|
||||||
panels.append(test_success)
|
panels.append(test_success)
|
||||||
|
|||||||
@ -82,6 +82,7 @@ spec:
|
|||||||
- --upstream=http://opensearch-dashboards.logging.svc.cluster.local:5601
|
- --upstream=http://opensearch-dashboards.logging.svc.cluster.local:5601
|
||||||
- --http-address=0.0.0.0:4180
|
- --http-address=0.0.0.0:4180
|
||||||
- --skip-provider-button=true
|
- --skip-provider-button=true
|
||||||
|
- --approval-prompt=auto
|
||||||
- --skip-jwt-bearer-tokens=true
|
- --skip-jwt-bearer-tokens=true
|
||||||
- --cookie-domain=logs.bstein.dev
|
- --cookie-domain=logs.bstein.dev
|
||||||
env:
|
env:
|
||||||
|
|||||||
@ -93,6 +93,7 @@ spec:
|
|||||||
- --upstream=http://metis.maintenance.svc.cluster.local
|
- --upstream=http://metis.maintenance.svc.cluster.local
|
||||||
- --http-address=0.0.0.0:4180
|
- --http-address=0.0.0.0:4180
|
||||||
- --skip-provider-button=true
|
- --skip-provider-button=true
|
||||||
|
- --approval-prompt=auto
|
||||||
- --skip-jwt-bearer-tokens=true
|
- --skip-jwt-bearer-tokens=true
|
||||||
- --oidc-groups-claim=groups
|
- --oidc-groups-claim=groups
|
||||||
- --cookie-domain=sentinel.bstein.dev
|
- --cookie-domain=sentinel.bstein.dev
|
||||||
|
|||||||
@ -1138,7 +1138,7 @@
|
|||||||
},
|
},
|
||||||
"targets": [
|
"targets": [
|
||||||
{
|
{
|
||||||
"expr": "100 * ((sum(increase(ariadne_task_runs_total{status=\"ok\"}[30d])) or on() vector(0)) + (sum(increase(metis_builds_total{status=\"ok\"}[30d])) or on() vector(0)) + (sum(increase(metis_flashes_total{status=\"ok\"}[30d])) or on() vector(0))) / clamp_min(((sum(increase(ariadne_task_runs_total[30d])) or on() vector(0)) + (sum(increase(metis_builds_total[30d])) or on() vector(0)) + (sum(increase(metis_flashes_total[30d])) or on() vector(0))), 1)",
|
"expr": "100 * ((sum(increase(ariadne_task_runs_total{status=\"ok\"}[30d])) or on() vector(0)) + (sum(increase(metis_builds_total{status=\"ok\"}[30d])) or on() vector(0)) + (sum(increase(metis_flashes_total{status=\"ok\"}[30d])) or on() vector(0)) + (sum(increase(ananke_quality_gate_runs_total{suite=\"ananke\",status=\"ok\"}[30d])) or on() vector(0))) / clamp_min(((sum(increase(ariadne_task_runs_total[30d])) or on() vector(0)) + (sum(increase(metis_builds_total[30d])) or on() vector(0)) + (sum(increase(metis_flashes_total[30d])) or on() vector(0)) + (sum(increase(ananke_quality_gate_runs_total{suite=\"ananke\"}[30d])) or on() vector(0))), 1)",
|
||||||
"refId": "A",
|
"refId": "A",
|
||||||
"instant": true
|
"instant": true
|
||||||
}
|
}
|
||||||
@ -1201,7 +1201,7 @@
|
|||||||
},
|
},
|
||||||
"targets": [
|
"targets": [
|
||||||
{
|
{
|
||||||
"expr": "label_replace(sum by (status) (increase(ariadne_task_runs_total[30d])), \"source\", \"ariadne\", \"__name__\", \".*\") or label_replace(sum by (status) (increase(metis_builds_total[30d])), \"source\", \"metis-build\", \"__name__\", \".*\") or label_replace(sum by (status) (increase(metis_flashes_total[30d])), \"source\", \"metis-flash\", \"__name__\", \".*\")",
|
"expr": "label_replace(sum by (status) (increase(ariadne_task_runs_total[30d])), \"source\", \"ariadne\", \"__name__\", \".*\") or label_replace(sum by (status) (increase(metis_builds_total[30d])), \"source\", \"metis-build\", \"__name__\", \".*\") or label_replace(sum by (status) (increase(metis_flashes_total[30d])), \"source\", \"metis-flash\", \"__name__\", \".*\") or label_replace(sum by (status) (increase(ananke_quality_gate_runs_total{suite=\"ananke\"}[30d])), \"source\", \"ananke-quality\", \"__name__\", \".*\")",
|
||||||
"refId": "A",
|
"refId": "A",
|
||||||
"instant": true
|
"instant": true
|
||||||
}
|
}
|
||||||
@ -1253,7 +1253,7 @@
|
|||||||
"targets": [
|
"targets": [
|
||||||
{
|
{
|
||||||
"refId": "A",
|
"refId": "A",
|
||||||
"expr": "label_replace(100 * (sum(increase(ariadne_task_runs_total{status=\"ok\"}[$__interval])) or on() vector(0)) / clamp_min((sum(increase(ariadne_task_runs_total[$__interval])) or on() vector(0)), 1), \"suite\", \"ariadne\", \"__name__\", \".*\") or label_replace(100 * (sum(increase(metis_builds_total{status=\"ok\"}[$__interval])) or on() vector(0)) / clamp_min((sum(increase(metis_builds_total[$__interval])) or on() vector(0)), 1), \"suite\", \"metis-build\", \"__name__\", \".*\") or label_replace(100 * (sum(increase(metis_flashes_total{status=\"ok\"}[$__interval])) or on() vector(0)) / clamp_min((sum(increase(metis_flashes_total[$__interval])) or on() vector(0)), 1), \"suite\", \"metis-flash\", \"__name__\", \".*\")",
|
"expr": "label_replace(100 * (sum(increase(ariadne_task_runs_total{status=\"ok\"}[$__interval])) or on() vector(0)) / clamp_min((sum(increase(ariadne_task_runs_total[$__interval])) or on() vector(0)), 1), \"suite\", \"ariadne\", \"__name__\", \".*\") or label_replace(100 * (sum(increase(metis_builds_total{status=\"ok\"}[$__interval])) or on() vector(0)) / clamp_min((sum(increase(metis_builds_total[$__interval])) or on() vector(0)), 1), \"suite\", \"metis-build\", \"__name__\", \".*\") or label_replace(100 * (sum(increase(metis_flashes_total{status=\"ok\"}[$__interval])) or on() vector(0)) / clamp_min((sum(increase(metis_flashes_total[$__interval])) or on() vector(0)), 1), \"suite\", \"metis-flash\", \"__name__\", \".*\") or label_replace(100 * (sum(increase(ananke_quality_gate_runs_total{suite=\"ananke\",status=\"ok\"}[$__interval])) or on() vector(0)) / clamp_min((sum(increase(ananke_quality_gate_runs_total{suite=\"ananke\"}[$__interval])) or on() vector(0)), 1), \"suite\", \"ananke-quality\", \"__name__\", \".*\")",
|
||||||
"legendFormat": "{{suite}}"
|
"legendFormat": "{{suite}}"
|
||||||
}
|
}
|
||||||
],
|
],
|
||||||
|
|||||||
@ -1820,7 +1820,7 @@
|
|||||||
"targets": [
|
"targets": [
|
||||||
{
|
{
|
||||||
"refId": "A",
|
"refId": "A",
|
||||||
"expr": "label_replace(100 * (sum(increase(ariadne_task_runs_total{status=\"ok\"}[$__interval])) or on() vector(0)) / clamp_min((sum(increase(ariadne_task_runs_total[$__interval])) or on() vector(0)), 1), \"suite\", \"ariadne\", \"__name__\", \".*\") or label_replace(100 * (sum(increase(metis_builds_total{status=\"ok\"}[$__interval])) or on() vector(0)) / clamp_min((sum(increase(metis_builds_total[$__interval])) or on() vector(0)), 1), \"suite\", \"metis-build\", \"__name__\", \".*\") or label_replace(100 * (sum(increase(metis_flashes_total{status=\"ok\"}[$__interval])) or on() vector(0)) / clamp_min((sum(increase(metis_flashes_total[$__interval])) or on() vector(0)), 1), \"suite\", \"metis-flash\", \"__name__\", \".*\")",
|
"expr": "label_replace(100 * (sum(increase(ariadne_task_runs_total{status=\"ok\"}[$__interval])) or on() vector(0)) / clamp_min((sum(increase(ariadne_task_runs_total[$__interval])) or on() vector(0)), 1), \"suite\", \"ariadne\", \"__name__\", \".*\") or label_replace(100 * (sum(increase(metis_builds_total{status=\"ok\"}[$__interval])) or on() vector(0)) / clamp_min((sum(increase(metis_builds_total[$__interval])) or on() vector(0)), 1), \"suite\", \"metis-build\", \"__name__\", \".*\") or label_replace(100 * (sum(increase(metis_flashes_total{status=\"ok\"}[$__interval])) or on() vector(0)) / clamp_min((sum(increase(metis_flashes_total[$__interval])) or on() vector(0)), 1), \"suite\", \"metis-flash\", \"__name__\", \".*\") or label_replace(100 * (sum(increase(ananke_quality_gate_runs_total{suite=\"ananke\",status=\"ok\"}[$__interval])) or on() vector(0)) / clamp_min((sum(increase(ananke_quality_gate_runs_total{suite=\"ananke\"}[$__interval])) or on() vector(0)), 1), \"suite\", \"ananke-quality\", \"__name__\", \".*\")",
|
||||||
"legendFormat": "{{suite}}"
|
"legendFormat": "{{suite}}"
|
||||||
}
|
}
|
||||||
],
|
],
|
||||||
@ -1846,7 +1846,7 @@
|
|||||||
"targetBlank": true
|
"targetBlank": true
|
||||||
}
|
}
|
||||||
],
|
],
|
||||||
"description": "Rolling pass rate per platform suite (Ariadne task automation + Metis build/flash pipelines). Each line tracks pass percentage over time for its suite."
|
"description": "Rolling pass rate per platform suite (Ariadne task automation + Metis build/flash pipelines + Ananke quality gate). Each line tracks pass percentage over time for its suite."
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"id": 47,
|
"id": 47,
|
||||||
@ -1864,7 +1864,7 @@
|
|||||||
},
|
},
|
||||||
"targets": [
|
"targets": [
|
||||||
{
|
{
|
||||||
"expr": "(sum(increase(ariadne_task_runs_total{status!=\"ok\"}[24h])) or on() vector(0)) + (sum(increase(metis_builds_total{status=\"error\"}[24h])) or on() vector(0)) + (sum(increase(metis_flashes_total{status=\"error\"}[24h])) or on() vector(0))",
|
"expr": "(sum(increase(ariadne_task_runs_total{status!=\"ok\"}[24h])) or on() vector(0)) + (sum(increase(metis_builds_total{status=\"error\"}[24h])) or on() vector(0)) + (sum(increase(metis_flashes_total{status=\"error\"}[24h])) or on() vector(0)) + (sum(increase(ananke_quality_gate_runs_total{suite=\"ananke\",status=\"failed\"}[24h])) or on() vector(0))",
|
||||||
"refId": "A",
|
"refId": "A",
|
||||||
"instant": true
|
"instant": true
|
||||||
}
|
}
|
||||||
|
|||||||
@ -1147,7 +1147,7 @@ data:
|
|||||||
},
|
},
|
||||||
"targets": [
|
"targets": [
|
||||||
{
|
{
|
||||||
"expr": "100 * ((sum(increase(ariadne_task_runs_total{status=\"ok\"}[30d])) or on() vector(0)) + (sum(increase(metis_builds_total{status=\"ok\"}[30d])) or on() vector(0)) + (sum(increase(metis_flashes_total{status=\"ok\"}[30d])) or on() vector(0))) / clamp_min(((sum(increase(ariadne_task_runs_total[30d])) or on() vector(0)) + (sum(increase(metis_builds_total[30d])) or on() vector(0)) + (sum(increase(metis_flashes_total[30d])) or on() vector(0))), 1)",
|
"expr": "100 * ((sum(increase(ariadne_task_runs_total{status=\"ok\"}[30d])) or on() vector(0)) + (sum(increase(metis_builds_total{status=\"ok\"}[30d])) or on() vector(0)) + (sum(increase(metis_flashes_total{status=\"ok\"}[30d])) or on() vector(0)) + (sum(increase(ananke_quality_gate_runs_total{suite=\"ananke\",status=\"ok\"}[30d])) or on() vector(0))) / clamp_min(((sum(increase(ariadne_task_runs_total[30d])) or on() vector(0)) + (sum(increase(metis_builds_total[30d])) or on() vector(0)) + (sum(increase(metis_flashes_total[30d])) or on() vector(0)) + (sum(increase(ananke_quality_gate_runs_total{suite=\"ananke\"}[30d])) or on() vector(0))), 1)",
|
||||||
"refId": "A",
|
"refId": "A",
|
||||||
"instant": true
|
"instant": true
|
||||||
}
|
}
|
||||||
@ -1210,7 +1210,7 @@ data:
|
|||||||
},
|
},
|
||||||
"targets": [
|
"targets": [
|
||||||
{
|
{
|
||||||
"expr": "label_replace(sum by (status) (increase(ariadne_task_runs_total[30d])), \"source\", \"ariadne\", \"__name__\", \".*\") or label_replace(sum by (status) (increase(metis_builds_total[30d])), \"source\", \"metis-build\", \"__name__\", \".*\") or label_replace(sum by (status) (increase(metis_flashes_total[30d])), \"source\", \"metis-flash\", \"__name__\", \".*\")",
|
"expr": "label_replace(sum by (status) (increase(ariadne_task_runs_total[30d])), \"source\", \"ariadne\", \"__name__\", \".*\") or label_replace(sum by (status) (increase(metis_builds_total[30d])), \"source\", \"metis-build\", \"__name__\", \".*\") or label_replace(sum by (status) (increase(metis_flashes_total[30d])), \"source\", \"metis-flash\", \"__name__\", \".*\") or label_replace(sum by (status) (increase(ananke_quality_gate_runs_total{suite=\"ananke\"}[30d])), \"source\", \"ananke-quality\", \"__name__\", \".*\")",
|
||||||
"refId": "A",
|
"refId": "A",
|
||||||
"instant": true
|
"instant": true
|
||||||
}
|
}
|
||||||
@ -1262,7 +1262,7 @@ data:
|
|||||||
"targets": [
|
"targets": [
|
||||||
{
|
{
|
||||||
"refId": "A",
|
"refId": "A",
|
||||||
"expr": "label_replace(100 * (sum(increase(ariadne_task_runs_total{status=\"ok\"}[$__interval])) or on() vector(0)) / clamp_min((sum(increase(ariadne_task_runs_total[$__interval])) or on() vector(0)), 1), \"suite\", \"ariadne\", \"__name__\", \".*\") or label_replace(100 * (sum(increase(metis_builds_total{status=\"ok\"}[$__interval])) or on() vector(0)) / clamp_min((sum(increase(metis_builds_total[$__interval])) or on() vector(0)), 1), \"suite\", \"metis-build\", \"__name__\", \".*\") or label_replace(100 * (sum(increase(metis_flashes_total{status=\"ok\"}[$__interval])) or on() vector(0)) / clamp_min((sum(increase(metis_flashes_total[$__interval])) or on() vector(0)), 1), \"suite\", \"metis-flash\", \"__name__\", \".*\")",
|
"expr": "label_replace(100 * (sum(increase(ariadne_task_runs_total{status=\"ok\"}[$__interval])) or on() vector(0)) / clamp_min((sum(increase(ariadne_task_runs_total[$__interval])) or on() vector(0)), 1), \"suite\", \"ariadne\", \"__name__\", \".*\") or label_replace(100 * (sum(increase(metis_builds_total{status=\"ok\"}[$__interval])) or on() vector(0)) / clamp_min((sum(increase(metis_builds_total[$__interval])) or on() vector(0)), 1), \"suite\", \"metis-build\", \"__name__\", \".*\") or label_replace(100 * (sum(increase(metis_flashes_total{status=\"ok\"}[$__interval])) or on() vector(0)) / clamp_min((sum(increase(metis_flashes_total[$__interval])) or on() vector(0)), 1), \"suite\", \"metis-flash\", \"__name__\", \".*\") or label_replace(100 * (sum(increase(ananke_quality_gate_runs_total{suite=\"ananke\",status=\"ok\"}[$__interval])) or on() vector(0)) / clamp_min((sum(increase(ananke_quality_gate_runs_total{suite=\"ananke\"}[$__interval])) or on() vector(0)), 1), \"suite\", \"ananke-quality\", \"__name__\", \".*\")",
|
||||||
"legendFormat": "{{suite}}"
|
"legendFormat": "{{suite}}"
|
||||||
}
|
}
|
||||||
],
|
],
|
||||||
|
|||||||
@ -1829,7 +1829,7 @@ data:
|
|||||||
"targets": [
|
"targets": [
|
||||||
{
|
{
|
||||||
"refId": "A",
|
"refId": "A",
|
||||||
"expr": "label_replace(100 * (sum(increase(ariadne_task_runs_total{status=\"ok\"}[$__interval])) or on() vector(0)) / clamp_min((sum(increase(ariadne_task_runs_total[$__interval])) or on() vector(0)), 1), \"suite\", \"ariadne\", \"__name__\", \".*\") or label_replace(100 * (sum(increase(metis_builds_total{status=\"ok\"}[$__interval])) or on() vector(0)) / clamp_min((sum(increase(metis_builds_total[$__interval])) or on() vector(0)), 1), \"suite\", \"metis-build\", \"__name__\", \".*\") or label_replace(100 * (sum(increase(metis_flashes_total{status=\"ok\"}[$__interval])) or on() vector(0)) / clamp_min((sum(increase(metis_flashes_total[$__interval])) or on() vector(0)), 1), \"suite\", \"metis-flash\", \"__name__\", \".*\")",
|
"expr": "label_replace(100 * (sum(increase(ariadne_task_runs_total{status=\"ok\"}[$__interval])) or on() vector(0)) / clamp_min((sum(increase(ariadne_task_runs_total[$__interval])) or on() vector(0)), 1), \"suite\", \"ariadne\", \"__name__\", \".*\") or label_replace(100 * (sum(increase(metis_builds_total{status=\"ok\"}[$__interval])) or on() vector(0)) / clamp_min((sum(increase(metis_builds_total[$__interval])) or on() vector(0)), 1), \"suite\", \"metis-build\", \"__name__\", \".*\") or label_replace(100 * (sum(increase(metis_flashes_total{status=\"ok\"}[$__interval])) or on() vector(0)) / clamp_min((sum(increase(metis_flashes_total[$__interval])) or on() vector(0)), 1), \"suite\", \"metis-flash\", \"__name__\", \".*\") or label_replace(100 * (sum(increase(ananke_quality_gate_runs_total{suite=\"ananke\",status=\"ok\"}[$__interval])) or on() vector(0)) / clamp_min((sum(increase(ananke_quality_gate_runs_total{suite=\"ananke\"}[$__interval])) or on() vector(0)), 1), \"suite\", \"ananke-quality\", \"__name__\", \".*\")",
|
||||||
"legendFormat": "{{suite}}"
|
"legendFormat": "{{suite}}"
|
||||||
}
|
}
|
||||||
],
|
],
|
||||||
@ -1855,7 +1855,7 @@ data:
|
|||||||
"targetBlank": true
|
"targetBlank": true
|
||||||
}
|
}
|
||||||
],
|
],
|
||||||
"description": "Rolling pass rate per platform suite (Ariadne task automation + Metis build/flash pipelines). Each line tracks pass percentage over time for its suite."
|
"description": "Rolling pass rate per platform suite (Ariadne task automation + Metis build/flash pipelines + Ananke quality gate). Each line tracks pass percentage over time for its suite."
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"id": 47,
|
"id": 47,
|
||||||
@ -1873,7 +1873,7 @@ data:
|
|||||||
},
|
},
|
||||||
"targets": [
|
"targets": [
|
||||||
{
|
{
|
||||||
"expr": "(sum(increase(ariadne_task_runs_total{status!=\"ok\"}[24h])) or on() vector(0)) + (sum(increase(metis_builds_total{status=\"error\"}[24h])) or on() vector(0)) + (sum(increase(metis_flashes_total{status=\"error\"}[24h])) or on() vector(0))",
|
"expr": "(sum(increase(ariadne_task_runs_total{status!=\"ok\"}[24h])) or on() vector(0)) + (sum(increase(metis_builds_total{status=\"error\"}[24h])) or on() vector(0)) + (sum(increase(metis_flashes_total{status=\"error\"}[24h])) or on() vector(0)) + (sum(increase(ananke_quality_gate_runs_total{suite=\"ananke\",status=\"failed\"}[24h])) or on() vector(0))",
|
||||||
"refId": "A",
|
"refId": "A",
|
||||||
"instant": true
|
"instant": true
|
||||||
}
|
}
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user