From 7b3dfa335bf4fbe2bfabf306e1c642945c0deab9 Mon Sep 17 00:00:00 2001
From: Brad Stein
Date: Sun, 12 Apr 2026 12:12:43 -0300
Subject: [PATCH] maintenance(soteria): harden ingress path and add backup alerts

---
 services/maintenance/NOTES.md                 | 47 +++++++++
 services/maintenance/kustomization.yaml       |  1 +
 .../maintenance/oauth2-proxy-soteria.yaml     |  3 +-
 .../maintenance/soteria-networkpolicy.yaml    | 27 ++++++
 .../monitoring/grafana-alerting-config.yaml   | 96 +++++++++++++++++++
 5 files changed, 172 insertions(+), 2 deletions(-)
 create mode 100644 services/maintenance/NOTES.md
 create mode 100644 services/maintenance/soteria-networkpolicy.yaml

diff --git a/services/maintenance/NOTES.md b/services/maintenance/NOTES.md
new file mode 100644
index 00000000..965d1ab5
--- /dev/null
+++ b/services/maintenance/NOTES.md
@@ -0,0 +1,47 @@
+# Soteria PVC Restore Drill (backup.bstein.dev)
+
+Use this runbook to perform a minimal, production-safe restore drill after each meaningful Soteria change.
+
+## Preconditions
+
+- The `maintenance` kustomization is reconciled and healthy in Flux.
+- The `soteria` and `oauth2-proxy-soteria` Deployments are ready in `maintenance`.
+- The operator account is in the Keycloak group `admin` or `maintenance`.
+- The source PVC is not ephemeral/test throwaway storage excluded from the backup policy.
+
+## Operator Flow (UI)
+
+1. Open `https://backup.bstein.dev` and sign in through Keycloak.
+2. In `PVC Inventory`, pick the source namespace and PVC.
+3. Click `Backup now` and wait for a success response in `Last Action`.
+4. Click `Restore`, choose a completed backup snapshot, and set:
+   - `Target namespace`: destination namespace (defaults to the source namespace)
+   - `Target PVC name`: a unique drill PVC name (`restore-<pvc>-<date>`)
+5. Click `Create restore PVC`.
+
+## Verification
+
+1. Confirm the restore target exists:
+   - `kubectl -n <namespace> get pvc <pvc>`
+2. Confirm backup telemetry is present:
+   - `kubectl -n monitoring port-forward svc/victoria-metrics-k8s-stack 8428:8428`
+   - `curl -fsS 'http://127.0.0.1:8428/api/v1/query?query=max%20by%20(namespace%2Cpvc)(pvc_backup_age_hours)'`
+3. Confirm the alerting input stays healthy:
+   - `pvc_backup_health{namespace="<namespace>",pvc="<pvc>"} == 1`
+
+## Cleanup
+
+1. Remove the drill PVC after validation:
+   - `kubectl -n <namespace> delete pvc <pvc>`
+2. If a detached restored Longhorn volume remains, remove it via the Longhorn UI or API.
+
+## Failure Triage
+
+- `401/403` on the UI/API:
+  - Verify the oauth2-proxy group claims include `admin` or `maintenance`.
+- Restore conflict:
+  - The target PVC already exists; pick a new target PVC name.
+- Freshness alert firing (`maint-soteria-refresh-stale`):
+  - Check Soteria pod health and `/metrics` scrape reachability from the `monitoring` namespace.
+- Unhealthy PVC alert firing (`maint-soteria-backup-unhealthy`):
+  - Inspect `pvc_backup_health` and `pvc_backup_age_hours` for stale or missing backup coverage.
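The verification steps in the runbook above read Soteria's backup telemetry through the forwarded VictoriaMetrics endpoint. A minimal sketch of that check for a single drill PVC, assuming the port-forward from Verification step 2 is already running and using hypothetical placeholder values (`maintenance` / `restore-demo`) in place of the real drill target:

```bash
#!/usr/bin/env bash
# Sketch only: spot-check Soteria backup telemetry for one PVC via the
# forwarded VictoriaMetrics HTTP API (assumes the port-forward above is active).
set -euo pipefail

NAMESPACE="maintenance"   # hypothetical example; substitute the drill namespace
PVC="restore-demo"        # hypothetical example; substitute the drill PVC name
QUERY_URL="http://127.0.0.1:8428/api/v1/query"

# Backup health for the drill PVC: a value of 1 means healthy coverage.
curl -fsS "${QUERY_URL}" \
  --data-urlencode "query=pvc_backup_health{namespace=\"${NAMESPACE}\",pvc=\"${PVC}\"}"
echo

# Backup age in hours for the same PVC: should be small right after "Backup now".
curl -fsS "${QUERY_URL}" \
  --data-urlencode "query=pvc_backup_age_hours{namespace=\"${NAMESPACE}\",pvc=\"${PVC}\"}"
echo
```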
diff --git a/services/maintenance/kustomization.yaml b/services/maintenance/kustomization.yaml
index 19164302..939b9ff1 100644
--- a/services/maintenance/kustomization.yaml
+++ b/services/maintenance/kustomization.yaml
@@ -37,6 +37,7 @@ resources:
   - node-image-sweeper-serviceaccount.yaml
   - node-image-sweeper-daemonset.yaml
   - metis-service.yaml
+  - soteria-networkpolicy.yaml
   - soteria-ingress.yaml
   - soteria-certificate.yaml
   - oauth2-proxy-soteria.yaml
diff --git a/services/maintenance/oauth2-proxy-soteria.yaml b/services/maintenance/oauth2-proxy-soteria.yaml
index cd6dabe7..92776ff9 100644
--- a/services/maintenance/oauth2-proxy-soteria.yaml
+++ b/services/maintenance/oauth2-proxy-soteria.yaml
@@ -83,8 +83,7 @@ spec:
             - --allowed-group=maintenance
             - --allowed-group=/maintenance
             - --set-xauthrequest=true
-            - --pass-access-token=true
-            - --set-authorization-header=true
+            - --pass-user-headers=true
             - --cookie-secure=true
             - --cookie-samesite=lax
             - --cookie-refresh=20m
diff --git a/services/maintenance/soteria-networkpolicy.yaml b/services/maintenance/soteria-networkpolicy.yaml
new file mode 100644
index 00000000..fe6ff33c
--- /dev/null
+++ b/services/maintenance/soteria-networkpolicy.yaml
@@ -0,0 +1,27 @@
+# services/maintenance/soteria-networkpolicy.yaml
+apiVersion: networking.k8s.io/v1
+kind: NetworkPolicy
+metadata:
+  name: soteria-ingress
+  namespace: maintenance
+spec:
+  podSelector:
+    matchLabels:
+      app: soteria
+  policyTypes:
+    - Ingress
+  ingress:
+    - from:
+        - podSelector:
+            matchLabels:
+              app: oauth2-proxy-soteria
+      ports:
+        - protocol: TCP
+          port: 8080
+    - from:
+        - namespaceSelector:
+            matchLabels:
+              kubernetes.io/metadata.name: monitoring
+      ports:
+        - protocol: TCP
+          port: 8080
diff --git a/services/monitoring/grafana-alerting-config.yaml b/services/monitoring/grafana-alerting-config.yaml
index d7b39a2b..5f240bc0 100644
--- a/services/monitoring/grafana-alerting-config.yaml
+++ b/services/monitoring/grafana-alerting-config.yaml
@@ -447,6 +447,102 @@ data:
           summary: "Legacy cronjob alert disabled"
         labels:
           severity: info
+      - uid: maint-soteria-refresh-stale
+        title: "Soteria inventory refresh stale (>15m)"
+        condition: C
+        for: "15m"
+        data:
+          - refId: A
+            relativeTimeRange:
+              from: 900
+              to: 0
+            datasourceUid: atlas-vm
+            model:
+              expr: time() - soteria_inventory_refresh_timestamp_seconds
+              intervalMs: 60000
+              maxDataPoints: 43200
+              legendFormat: soteria-refresh-age-seconds
+              datasource:
+                type: prometheus
+                uid: atlas-vm
+          - refId: B
+            datasourceUid: __expr__
+            model:
+              expression: A
+              intervalMs: 60000
+              maxDataPoints: 43200
+              reducer: last
+              type: reduce
+          - refId: C
+            datasourceUid: __expr__
+            model:
+              expression: B
+              intervalMs: 60000
+              maxDataPoints: 43200
+              type: threshold
+              conditions:
+                - evaluator:
+                    params: [900]
+                    type: gt
+                  operator:
+                    type: and
+                  reducer:
+                    type: last
+                  type: query
+        noDataState: Alerting
+        execErrState: Alerting
+        annotations:
+          summary: "Soteria inventory telemetry has not refreshed in >15m"
+        labels:
+          severity: warning
+      - uid: maint-soteria-backup-unhealthy
+        title: "Soteria reports unhealthy PVC backups"
+        condition: C
+        for: "10m"
+        data:
+          - refId: A
+            relativeTimeRange:
+              from: 600
+              to: 0
+            datasourceUid: atlas-vm
+            model:
+              expr: sum((1 - pvc_backup_health{driver="longhorn"}) > bool 0) or on() vector(0)
+              intervalMs: 60000
+              maxDataPoints: 43200
+              legendFormat: unhealthy-pvcs
+              datasource:
+                type: prometheus
+                uid: atlas-vm
+          - refId: B
+            datasourceUid: __expr__
+            model:
+              expression: A
+              intervalMs: 60000
+              maxDataPoints: 43200
+              reducer: last
+              type: reduce
+          - refId: C
+            datasourceUid: __expr__
+            model:
+              expression: B
+              intervalMs: 60000
+              maxDataPoints: 43200
+              type: threshold
+              conditions:
+                - evaluator:
+                    params: [0]
+                    type: gt
+                  operator:
+                    type: and
+                  reducer:
+                    type: last
+                  type: query
+        noDataState: OK
+        execErrState: Alerting
+        annotations:
+          summary: "One or more PVCs are stale, missing, or failed per Soteria backup health"
+        labels:
+          severity: warning
   - orgId: 1
     name: ariadne
     folder: Alerts
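The new NetworkPolicy is meant to leave exactly two ingress paths to the Soteria pod: the `oauth2-proxy-soteria` pods and traffic from the `monitoring` namespace, both on TCP 8080. A minimal post-reconcile spot check along those lines, assuming Soteria is reachable in-cluster as a `soteria` Service on that port (the Service name is an assumption, not something this patch defines):

```bash
#!/usr/bin/env bash
# Sketch only: probe the soteria-ingress NetworkPolicy from two vantage points.
# Assumes a Service "soteria" in "maintenance" fronting the pod on port 8080;
# adjust the DNS name if the actual Service is named differently.
set -uo pipefail

TARGET="http://soteria.maintenance.svc.cluster.local:8080/metrics"

# From an unrelated namespace the request should be blocked (timeout/refused).
kubectl -n default run np-probe-denied --rm -i --restart=Never \
  --image=curlimages/curl -- \
  curl -m 5 -sS -o /dev/null "${TARGET}" \
  && echo "UNEXPECTED: reachable from default" \
  || echo "blocked from default (expected)"

# From the monitoring namespace the same request should return HTTP 200.
kubectl -n monitoring run np-probe-allowed --rm -i --restart=Never \
  --image=curlimages/curl -- \
  curl -m 5 -sS -o /dev/null -w '%{http_code}\n' "${TARGET}"
```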