NOTE(review): this patch was rebuilt from a whitespace-collapsed copy, so the
YAML indentation inside the hunks is reconstructed; confirm it against the
target files before applying. The Grafana hunk differs from the original patch
(corrected alert expression plus an explanatory comment), so the post-image
blob hash on its index line no longer matches the resulting file.

diff --git a/services/maintenance/kustomization.yaml b/services/maintenance/kustomization.yaml
index 8dd826f0..f577cfc4 100644
--- a/services/maintenance/kustomization.yaml
+++ b/services/maintenance/kustomization.yaml
@@ -54,7 +54,7 @@ images:
   - name: registry.bstein.dev/bstein/metis
     newTag: 0.1.0-9-amd64
   - name: registry.bstein.dev/bstein/soteria
-    newTag: 0.1.0-22 # {"$imagepolicy": "maintenance:soteria:tag"}
+    newTag: 0.1.0-25 # {"$imagepolicy": "maintenance:soteria:tag"}
 configMapGenerator:
   - name: disable-k3s-traefik-script
     namespace: maintenance
diff --git a/services/maintenance/soteria-configmap.yaml b/services/maintenance/soteria-configmap.yaml
index fdf666e9..e2085eca 100644
--- a/services/maintenance/soteria-configmap.yaml
+++ b/services/maintenance/soteria-configmap.yaml
@@ -12,3 +12,11 @@ data:
   SOTERIA_ALLOWED_GROUPS: admin,maintenance
   SOTERIA_BACKUP_MAX_AGE_HOURS: "24"
   SOTERIA_METRICS_REFRESH_SECONDS: "300"
+  SOTERIA_B2_ENABLED: "true"
+  SOTERIA_B2_SECRET_NAMESPACE: maintenance
+  SOTERIA_B2_SECRET_NAME: soteria-restic
+  SOTERIA_B2_ACCESS_KEY_FIELD: AWS_ACCESS_KEY_ID
+  SOTERIA_B2_SECRET_KEY_FIELD: AWS_SECRET_ACCESS_KEY
+  SOTERIA_B2_ENDPOINT_FIELD: AWS_ENDPOINTS
+  SOTERIA_B2_SCAN_INTERVAL_SECONDS: "900"
+  SOTERIA_B2_SCAN_TIMEOUT_SECONDS: "120"
diff --git a/services/monitoring/grafana-alerting-config.yaml b/services/monitoring/grafana-alerting-config.yaml
index 0dd06bf5..0c9610df 100644
--- a/services/monitoring/grafana-alerting-config.yaml
+++ b/services/monitoring/grafana-alerting-config.yaml
@@ -543,6 +543,58 @@ data:
               summary: "One or more PVCs are stale, missing, or failed per Soteria backup health"
             labels:
               severity: warning
+          - uid: maint-soteria-b2-scan-unhealthy
+            title: "Soteria B2 usage scan failing or stale"
+            condition: C
+            for: "15m"
+            data:
+              - refId: A
+                relativeTimeRange:
+                  from: 1800
+                  to: 0
+                datasourceUid: atlas-vm
+                model:
+                  # Counts series where soteria_b2_scan_success is below 1 and the scan
+                  # timestamp is >600s old, or where the timestamp is >1800s old.
+                  # Plain filters (no "bool") are deliberate: "< bool 1" emits a 0/1 sample
+                  # for every series, which would shadow the stale branch of "or".
+                  expr: count(((soteria_b2_scan_success < 1) and (time() - soteria_b2_scan_timestamp_seconds > 600)) or (time() - soteria_b2_scan_timestamp_seconds > 1800)) or on() vector(0)
+                  intervalMs: 60000
+                  maxDataPoints: 43200
+                  legendFormat: soteria-b2-scan-unhealthy
+                  datasource:
+                    type: prometheus
+                    uid: atlas-vm
+              - refId: B
+                datasourceUid: __expr__
+                model:
+                  expression: A
+                  intervalMs: 60000
+                  maxDataPoints: 43200
+                  reducer: last
+                  type: reduce
+              - refId: C
+                datasourceUid: __expr__
+                model:
+                  expression: B
+                  intervalMs: 60000
+                  maxDataPoints: 43200
+                  type: threshold
+                  conditions:
+                    - evaluator:
+                        params: [0]
+                        type: gt
+                      operator:
+                        type: and
+                      reducer:
+                        type: last
+                      type: query
+            noDataState: OK
+            execErrState: Alerting
+            annotations:
+              summary: "Soteria B2 consumption scan is failing or stale for >15m"
+            labels:
+              severity: warning
           - uid: maint-soteria-authz-denials
             title: "Soteria authorization denials elevated"
             condition: C