maintenance(soteria): enable b2 usage scan config and alert

This commit is contained in:
Brad Stein 2026-04-12 19:47:58 -03:00
parent 609cfcb696
commit a01dc0813a
2 changed files with 56 additions and 0 deletions

View File

@ -12,3 +12,11 @@ data:
# Access and backup-health settings (all values are strings, as ConfigMap data requires).
SOTERIA_ALLOWED_GROUPS: "admin,maintenance"
SOTERIA_BACKUP_MAX_AGE_HOURS: "24"
SOTERIA_METRICS_REFRESH_SECONDS: "300"
# B2 usage scan: feature switch.
SOTERIA_B2_ENABLED: "true"
# Where the B2 credentials live. The *_FIELD entries presumably name the keys
# inside that Secret -- verify against the soteria-restic Secret.
SOTERIA_B2_SECRET_NAMESPACE: "maintenance"
SOTERIA_B2_SECRET_NAME: "soteria-restic"
SOTERIA_B2_ACCESS_KEY_FIELD: "AWS_ACCESS_KEY_ID"
SOTERIA_B2_SECRET_KEY_FIELD: "AWS_SECRET_ACCESS_KEY"
SOTERIA_B2_ENDPOINT_FIELD: "AWS_ENDPOINTS"
# Scan cadence and per-scan time limit, in seconds.
SOTERIA_B2_SCAN_INTERVAL_SECONDS: "900"
SOTERIA_B2_SCAN_TIMEOUT_SECONDS: "120"

View File

@ -543,6 +543,54 @@ data:
summary: "One or more PVCs are stale, missing, or failed per Soteria backup health"
labels:
severity: warning
# Alert: the Soteria B2 usage scan is failing or has stopped reporting.
# Pipeline: A (PromQL, count of unhealthy series) -> B (reduce: last) -> C (threshold: > 0).
- uid: maint-soteria-b2-scan-unhealthy
  title: "Soteria B2 usage scan failing or stale"
  condition: C
  for: "15m"
  data:
    - refId: A
      relativeTimeRange:
        from: 1800
        to: 0
      datasourceUid: atlas-vm
      model:
        # Each series contributes 1 when unhealthy and 0 otherwise:
        #   - failing: soteria_b2_scan_success < 1 AND scan timestamp older than 600s
        #   - stale:   scan timestamp older than 1800s, whatever the last result was
        # `bool` sits on the age comparisons and `soteria_b2_scan_success < 1` is a
        # plain filter, so healthy series never enter the left side of `or`.
        # Previously `soteria_b2_scan_success < bool 1` kept healthy series with
        # value 0 on the left, which made `or` discard the matching stale-series
        # element, so a hung scanner whose last run succeeded never alerted.
        # `or on() vector(0)` makes the query return 0 when no series exist.
        # NOTE(review): if soteria_b2_scan_timestamp_seconds is refreshed on failed
        # scans too, a 900s scan interval lets the 600s clause hold for only part
        # of each cycle and `for: 15m` may never be satisfied -- confirm the
        # metric's semantics.
        expr: >-
          sum(
          ((time() - soteria_b2_scan_timestamp_seconds > bool 600) and (soteria_b2_scan_success < 1))
          or
          (time() - soteria_b2_scan_timestamp_seconds > bool 1800)
          ) or on() vector(0)
        intervalMs: 60000
        maxDataPoints: 43200
        legendFormat: soteria-b2-scan-unhealthy
        datasource:
          type: prometheus
          uid: atlas-vm
    # Collapse query A to its most recent value.
    - refId: B
      datasourceUid: __expr__
      model:
        expression: A
        intervalMs: 60000
        maxDataPoints: 43200
        reducer: last
        type: reduce
    # Fire when the reduced value is greater than 0.
    - refId: C
      datasourceUid: __expr__
      model:
        expression: B
        intervalMs: 60000
        maxDataPoints: 43200
        type: threshold
        conditions:
          - evaluator:
              params: [0]
              type: gt
            operator:
              type: and
            reducer:
              type: last
            type: query
  # Query A always returns a value because of the vector(0) fallback, so
  # missing metrics evaluate to 0 rather than to "no data".
  noDataState: OK
  execErrState: Alerting
  annotations:
    summary: "Soteria B2 consumption scan is failing or stale for >15m"
  labels:
    severity: warning
- uid: maint-soteria-authz-denials
title: "Soteria authorization denials elevated"
condition: C