From 7b3dfa335bf4fbe2bfabf306e1c642945c0deab9 Mon Sep 17 00:00:00 2001
From: Brad Stein
Date: Sun, 12 Apr 2026 12:12:43 -0300
Subject: [PATCH] maintenance(soteria): harden ingress path and add backup alerts

---
 services/maintenance/NOTES.md                 | 47 +++++++++
 services/maintenance/kustomization.yaml       |  1 +
 .../maintenance/oauth2-proxy-soteria.yaml     |  3 +-
 .../maintenance/soteria-networkpolicy.yaml    | 27 ++++++
 .../monitoring/grafana-alerting-config.yaml   | 96 +++++++++++++++++++
 5 files changed, 172 insertions(+), 2 deletions(-)
 create mode 100644 services/maintenance/NOTES.md
 create mode 100644 services/maintenance/soteria-networkpolicy.yaml

diff --git a/services/maintenance/NOTES.md b/services/maintenance/NOTES.md
new file mode 100644
index 00000000..965d1ab5
--- /dev/null
+++ b/services/maintenance/NOTES.md
@@ -0,0 +1,47 @@
+# Soteria PVC Restore Drill (backup.bstein.dev)
+
+Use this runbook to perform a minimal, production-safe restore drill after each meaningful Soteria change.
+
+## Preconditions
+
+- The `maintenance` kustomization is reconciled and healthy in Flux.
+- The `soteria` and `oauth2-proxy-soteria` Deployments are ready in `maintenance`.
+- The operator account is in the Keycloak group `admin` or `maintenance`.
+- The source PVC is not ephemeral/test throwaway storage excluded from the backup policy.
+
+## Operator Flow (UI)
+
+1. Open `https://backup.bstein.dev` and sign in through Keycloak.
+2. In `PVC Inventory`, pick the source namespace and PVC.
+3. Click `Backup now` and wait for a success response in `Last Action`.
+4. Click `Restore`, choose a completed backup snapshot, and set:
+   - `Target namespace`: destination namespace (defaults to the source namespace)
+   - `Target PVC name`: a unique drill PVC name (`restore-<pvc>-<date>`)
+5. Click `Create restore PVC`.
+
+## Verification
+
+1. Confirm the restore target exists:
+   - `kubectl -n <namespace> get pvc <pvc>`
+2. Confirm backup telemetry is present:
+   - `kubectl -n monitoring port-forward svc/victoria-metrics-k8s-stack 8428:8428`
+   - `curl -fsS 'http://127.0.0.1:8428/api/v1/query?query=max%20by%20(namespace%2Cpvc)(pvc_backup_age_hours)'`
+3. Confirm the alerting input stays healthy:
+   - `pvc_backup_health{namespace="<namespace>",pvc="<pvc>"} == 1`
+
+## Cleanup
+
+1. Remove the drill PVC after validation:
+   - `kubectl -n <namespace> delete pvc <pvc>`
+2. If a detached restored Longhorn volume remains, remove it via the Longhorn UI or API.
+
+## Failure Triage
+
+- `401/403` on the UI/API:
+  - Verify the oauth2-proxy group claims include `admin` or `maintenance`.
+- Restore conflict:
+  - The target PVC already exists; pick a new target PVC name.
+- Freshness alert firing (`maint-soteria-refresh-stale`):
+  - Check Soteria pod health and `/metrics` scrape reachability from the `monitoring` namespace.
+- Unhealthy PVC alert firing (`maint-soteria-backup-unhealthy`):
+  - Inspect `pvc_backup_health` and `pvc_backup_age_hours` for stale or missing backup coverage.
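The verification steps in the runbook above read Soteria's backup telemetry through the forwarded VictoriaMetrics endpoint. A minimal sketch of that check for a single drill PVC, assuming the port-forward from Verification step 2 is already running and using hypothetical placeholder values (`maintenance` / `restore-demo`) in place of the real drill target:

```bash
#!/usr/bin/env bash
# Sketch only: spot-check Soteria backup telemetry for one PVC via the
# forwarded VictoriaMetrics HTTP API (assumes the port-forward above is active).
set -euo pipefail

NAMESPACE="maintenance"   # hypothetical example; substitute the drill namespace
PVC="restore-demo"        # hypothetical example; substitute the drill PVC name
QUERY_URL="http://127.0.0.1:8428/api/v1/query"

# Backup health for the drill PVC: a value of 1 means healthy coverage.
curl -fsS "${QUERY_URL}" \
  --data-urlencode "query=pvc_backup_health{namespace=\"${NAMESPACE}\",pvc=\"${PVC}\"}"
echo

# Backup age in hours for the same PVC: should be small right after "Backup now".
curl -fsS "${QUERY_URL}" \
  --data-urlencode "query=pvc_backup_age_hours{namespace=\"${NAMESPACE}\",pvc=\"${PVC}\"}"
echo
```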
diff --git a/services/maintenance/kustomization.yaml b/services/maintenance/kustomization.yaml
index 19164302..939b9ff1 100644
--- a/services/maintenance/kustomization.yaml
+++ b/services/maintenance/kustomization.yaml
@@ -37,6 +37,7 @@ resources:
   - node-image-sweeper-serviceaccount.yaml
   - node-image-sweeper-daemonset.yaml
   - metis-service.yaml
+  - soteria-networkpolicy.yaml
   - soteria-ingress.yaml
   - soteria-certificate.yaml
   - oauth2-proxy-soteria.yaml
diff --git a/services/maintenance/oauth2-proxy-soteria.yaml b/services/maintenance/oauth2-proxy-soteria.yaml
index cd6dabe7..92776ff9 100644
--- a/services/maintenance/oauth2-proxy-soteria.yaml
+++ b/services/maintenance/oauth2-proxy-soteria.yaml
@@ -83,8 +83,7 @@ spec:
             - --allowed-group=maintenance
             - --allowed-group=/maintenance
             - --set-xauthrequest=true
-            - --pass-access-token=true
-            - --set-authorization-header=true
+            - --pass-user-headers=true
             - --cookie-secure=true
             - --cookie-samesite=lax
             - --cookie-refresh=20m
diff --git a/services/maintenance/soteria-networkpolicy.yaml b/services/maintenance/soteria-networkpolicy.yaml
new file mode 100644
index 00000000..fe6ff33c
--- /dev/null
+++ b/services/maintenance/soteria-networkpolicy.yaml
@@ -0,0 +1,27 @@
+# services/maintenance/soteria-networkpolicy.yaml
+apiVersion: networking.k8s.io/v1
+kind: NetworkPolicy
+metadata:
+  name: soteria-ingress
+  namespace: maintenance
+spec:
+  podSelector:
+    matchLabels:
+      app: soteria
+  policyTypes:
+    - Ingress
+  ingress:
+    - from:
+        - podSelector:
+            matchLabels:
+              app: oauth2-proxy-soteria
+      ports:
+        - protocol: TCP
+          port: 8080
+    - from:
+        - namespaceSelector:
+            matchLabels:
+              kubernetes.io/metadata.name: monitoring
+      ports:
+        - protocol: TCP
+          port: 8080
diff --git a/services/monitoring/grafana-alerting-config.yaml b/services/monitoring/grafana-alerting-config.yaml
index d7b39a2b..5f240bc0 100644
--- a/services/monitoring/grafana-alerting-config.yaml
+++ b/services/monitoring/grafana-alerting-config.yaml
@@ -447,6 +447,102 @@ data:
           summary: "Legacy cronjob alert disabled"
         labels:
           severity: info
+      - uid: maint-soteria-refresh-stale
+        title: "Soteria inventory refresh stale (>15m)"
+        condition: C
+        for: "15m"
+        data:
+          - refId: A
+            relativeTimeRange:
+              from: 900
+              to: 0
+            datasourceUid: atlas-vm
+            model:
+              expr: time() - soteria_inventory_refresh_timestamp_seconds
+              intervalMs: 60000
+              maxDataPoints: 43200
+              legendFormat: soteria-refresh-age-seconds
+              datasource:
+                type: prometheus
+                uid: atlas-vm
+          - refId: B
+            datasourceUid: __expr__
+            model:
+              expression: A
+              intervalMs: 60000
+              maxDataPoints: 43200
+              reducer: last
+              type: reduce
+          - refId: C
+            datasourceUid: __expr__
+            model:
+              expression: B
+              intervalMs: 60000
+              maxDataPoints: 43200
+              type: threshold
+              conditions:
+                - evaluator:
+                    params: [900]
+                    type: gt
+                  operator:
+                    type: and
+                  reducer:
+                    type: last
+                  type: query
+        noDataState: Alerting
+        execErrState: Alerting
+        annotations:
+          summary: "Soteria inventory telemetry has not refreshed in >15m"
+        labels:
+          severity: warning
+      - uid: maint-soteria-backup-unhealthy
+        title: "Soteria reports unhealthy PVC backups"
+        condition: C
+        for: "10m"
+        data:
+          - refId: A
+            relativeTimeRange:
+              from: 600
+              to: 0
+            datasourceUid: atlas-vm
+            model:
+              expr: sum((1 - pvc_backup_health{driver="longhorn"}) > bool 0) or on() vector(0)
+              intervalMs: 60000
+              maxDataPoints: 43200
+              legendFormat: unhealthy-pvcs
+              datasource:
+                type: prometheus
+                uid: atlas-vm
+          - refId: B
+            datasourceUid: __expr__
+            model:
+              expression: A
+              intervalMs: 60000
+              maxDataPoints: 43200
+              reducer: last
+              type: reduce
+          - refId: C
+            datasourceUid: __expr__
+            model:
+              expression: B
+              intervalMs: 60000
+              maxDataPoints: 43200
+              type: threshold
+              conditions:
+                - evaluator:
+                    params: [0]
+                    type: gt
+                  operator:
+                    type: and
+                  reducer:
+                    type: last
+                  type: query
+        noDataState: OK
+        execErrState: Alerting
+        annotations:
+          summary: "One or more PVCs are stale, missing, or failed per Soteria backup health"
+        labels:
+          severity: warning
   - orgId: 1
     name: ariadne
     folder: Alerts
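The new NetworkPolicy is meant to leave exactly two ingress paths to the Soteria pod: the `oauth2-proxy-soteria` pods and traffic from the `monitoring` namespace, both on TCP 8080. A minimal post-reconcile spot check along those lines, assuming Soteria is reachable in-cluster as a `soteria` Service on that port (the Service name is an assumption, not something this patch defines):

```bash
#!/usr/bin/env bash
# Sketch only: probe the soteria-ingress NetworkPolicy from two vantage points.
# Assumes a Service "soteria" in "maintenance" fronting the pod on port 8080;
# adjust the DNS name if the actual Service is named differently.
set -uo pipefail

TARGET="http://soteria.maintenance.svc.cluster.local:8080/metrics"

# From an unrelated namespace the request should be blocked (timeout/refused).
kubectl -n default run np-probe-denied --rm -i --restart=Never \
  --image=curlimages/curl -- \
  curl -m 5 -sS -o /dev/null "${TARGET}" \
  && echo "UNEXPECTED: reachable from default" \
  || echo "blocked from default (expected)"

# From the monitoring namespace the same request should return HTTP 200.
kubectl -n monitoring run np-probe-allowed --rm -i --restart=Never \
  --image=curlimages/curl -- \
  curl -m 5 -sS -o /dev/null -w '%{http_code}\n' "${TARGET}"
```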