maintenance(soteria): harden ingress path and add backup alerts
This commit is contained in:
parent
e1bba18b52
commit
7b3dfa335b
47
services/maintenance/NOTES.md
Normal file
47
services/maintenance/NOTES.md
Normal file
@ -0,0 +1,47 @@
|
|||||||
|
# Soteria PVC Restore Drill (backup.bstein.dev)
|
||||||
|
|
||||||
|
Use this runbook for a minimal production-safe restore drill after each meaningful Soteria change.
|
||||||
|
|
||||||
|
## Preconditions
|
||||||
|
|
||||||
|
- `maintenance` kustomization is reconciled and healthy in Flux.
|
||||||
|
- `soteria` and `oauth2-proxy-soteria` Deployments are ready in `maintenance`.
|
||||||
|
- Operator account is in Keycloak group `admin` or `maintenance`.
|
||||||
|
- Source PVC holds persistent production data — not ephemeral/test throwaway storage that is excluded from the backup policy.
|
||||||
|
|
||||||
|
## Operator Flow (UI)
|
||||||
|
|
||||||
|
1. Open `https://backup.bstein.dev` and sign in through Keycloak.
|
||||||
|
2. In `PVC Inventory`, pick source namespace/PVC.
|
||||||
|
3. Click `Backup now` and wait for success response in `Last Action`.
|
||||||
|
4. Click `Restore`, choose a completed backup snapshot, and set:
|
||||||
|
- `Target namespace`: destination namespace (defaults to source)
|
||||||
|
- `Target PVC name`: unique drill PVC name (`restore-<source-pvc>-<date>`)
|
||||||
|
5. Click `Create restore PVC`.
|
||||||
|
|
||||||
|
## Verification
|
||||||
|
|
||||||
|
1. Confirm restore target exists:
|
||||||
|
- `kubectl -n <target-namespace> get pvc <target-pvc>`
|
||||||
|
2. Confirm backup telemetry is present:
|
||||||
|
- `kubectl -n monitoring port-forward svc/victoria-metrics-k8s-stack 8428:8428`
|
||||||
|
- `curl -fsS 'http://127.0.0.1:8428/api/v1/query?query=max%20by%20(namespace%2Cpvc)(pvc_backup_age_hours)'`
|
||||||
|
3. Confirm alerting input stays healthy:
|
||||||
|
- `pvc_backup_health{namespace="<source-namespace>",pvc="<source-pvc>"} == 1`
|
||||||
|
|
||||||
|
## Cleanup
|
||||||
|
|
||||||
|
1. Remove drill PVC after validation:
|
||||||
|
- `kubectl -n <target-namespace> delete pvc <target-pvc>`
|
||||||
|
2. If a detached Longhorn volume from the restore remains, remove it via the Longhorn UI or API.
|
||||||
|
|
||||||
|
## Failure Triage
|
||||||
|
|
||||||
|
- `401/403` on UI/API:
|
||||||
|
- Verify oauth2-proxy group claims include `admin` or `maintenance`.
|
||||||
|
- Restore conflict:
|
||||||
|
- Target PVC already exists; pick a new target PVC name.
|
||||||
|
- Freshness alert firing (`maint-soteria-refresh-stale`):
|
||||||
|
- Check Soteria pod health and `/metrics` scrape reachability from `monitoring`.
|
||||||
|
- Unhealthy PVC alert firing (`maint-soteria-backup-unhealthy`):
|
||||||
|
- Inspect `pvc_backup_health` and `pvc_backup_age_hours` for stale/missing backup coverage.
|
||||||
@ -37,6 +37,7 @@ resources:
|
|||||||
- node-image-sweeper-serviceaccount.yaml
|
- node-image-sweeper-serviceaccount.yaml
|
||||||
- node-image-sweeper-daemonset.yaml
|
- node-image-sweeper-daemonset.yaml
|
||||||
- metis-service.yaml
|
- metis-service.yaml
|
||||||
|
- soteria-networkpolicy.yaml
|
||||||
- soteria-ingress.yaml
|
- soteria-ingress.yaml
|
||||||
- soteria-certificate.yaml
|
- soteria-certificate.yaml
|
||||||
- oauth2-proxy-soteria.yaml
|
- oauth2-proxy-soteria.yaml
|
||||||
|
|||||||
@ -83,8 +83,7 @@ spec:
|
|||||||
- --allowed-group=maintenance
|
- --allowed-group=maintenance
|
||||||
- --allowed-group=/maintenance
|
- --allowed-group=/maintenance
|
||||||
- --set-xauthrequest=true
|
- --set-xauthrequest=true
|
||||||
- --pass-access-token=true
|
- --pass-user-headers=true
|
||||||
- --set-authorization-header=true
|
|
||||||
- --cookie-secure=true
|
- --cookie-secure=true
|
||||||
- --cookie-samesite=lax
|
- --cookie-samesite=lax
|
||||||
- --cookie-refresh=20m
|
- --cookie-refresh=20m
|
||||||
|
|||||||
27
services/maintenance/soteria-networkpolicy.yaml
Normal file
27
services/maintenance/soteria-networkpolicy.yaml
Normal file
@ -0,0 +1,27 @@
|
|||||||
|
# services/maintenance/soteria-networkpolicy.yaml
#
# Locks down inbound traffic to the Soteria pod (app=soteria in the
# maintenance namespace): only the dedicated oauth2-proxy front end and
# pods in the monitoring namespace may reach TCP/8080. Any other ingress
# to the selected pods is denied by this policy.
---
apiVersion: networking.k8s.io/v1
kind: NetworkPolicy
metadata:
  name: soteria-ingress
  namespace: maintenance
spec:
  podSelector:
    matchLabels:
      app: soteria
  policyTypes:
    - Ingress
  ingress:
    # Authenticated UI/API traffic relayed by the oauth2-proxy sidecar
    # deployment in the same namespace.
    - from:
        - podSelector:
            matchLabels:
              app: oauth2-proxy-soteria
      ports:
        - protocol: TCP
          port: 8080
    # Metrics scrapes (/metrics) from the monitoring namespace,
    # matched by the well-known kubernetes.io/metadata.name label.
    - from:
        - namespaceSelector:
            matchLabels:
              kubernetes.io/metadata.name: monitoring
      ports:
        - protocol: TCP
          port: 8080
|
||||||
@ -447,6 +447,102 @@ data:
|
|||||||
summary: "Legacy cronjob alert disabled"
|
summary: "Legacy cronjob alert disabled"
|
||||||
labels:
|
labels:
|
||||||
severity: info
|
severity: info
|
||||||
|
- uid: maint-soteria-refresh-stale
|
||||||
|
title: "Soteria inventory refresh stale (>15m)"
|
||||||
|
condition: C
|
||||||
|
for: "15m"
|
||||||
|
data:
|
||||||
|
- refId: A
|
||||||
|
relativeTimeRange:
|
||||||
|
from: 900
|
||||||
|
to: 0
|
||||||
|
datasourceUid: atlas-vm
|
||||||
|
model:
|
||||||
|
expr: time() - soteria_inventory_refresh_timestamp_seconds
|
||||||
|
intervalMs: 60000
|
||||||
|
maxDataPoints: 43200
|
||||||
|
legendFormat: soteria-refresh-age-seconds
|
||||||
|
datasource:
|
||||||
|
type: prometheus
|
||||||
|
uid: atlas-vm
|
||||||
|
- refId: B
|
||||||
|
datasourceUid: __expr__
|
||||||
|
model:
|
||||||
|
expression: A
|
||||||
|
intervalMs: 60000
|
||||||
|
maxDataPoints: 43200
|
||||||
|
reducer: last
|
||||||
|
type: reduce
|
||||||
|
- refId: C
|
||||||
|
datasourceUid: __expr__
|
||||||
|
model:
|
||||||
|
expression: B
|
||||||
|
intervalMs: 60000
|
||||||
|
maxDataPoints: 43200
|
||||||
|
type: threshold
|
||||||
|
conditions:
|
||||||
|
- evaluator:
|
||||||
|
params: [900]
|
||||||
|
type: gt
|
||||||
|
operator:
|
||||||
|
type: and
|
||||||
|
reducer:
|
||||||
|
type: last
|
||||||
|
type: query
|
||||||
|
noDataState: Alerting
|
||||||
|
execErrState: Alerting
|
||||||
|
annotations:
|
||||||
|
summary: "Soteria inventory telemetry has not refreshed in >15m"
|
||||||
|
labels:
|
||||||
|
severity: warning
|
||||||
|
- uid: maint-soteria-backup-unhealthy
|
||||||
|
title: "Soteria reports unhealthy PVC backups"
|
||||||
|
condition: C
|
||||||
|
for: "10m"
|
||||||
|
data:
|
||||||
|
- refId: A
|
||||||
|
relativeTimeRange:
|
||||||
|
from: 600
|
||||||
|
to: 0
|
||||||
|
datasourceUid: atlas-vm
|
||||||
|
model:
|
||||||
|
expr: sum((1 - pvc_backup_health{driver="longhorn"}) > bool 0) or on() vector(0)
|
||||||
|
intervalMs: 60000
|
||||||
|
maxDataPoints: 43200
|
||||||
|
legendFormat: unhealthy-pvcs
|
||||||
|
datasource:
|
||||||
|
type: prometheus
|
||||||
|
uid: atlas-vm
|
||||||
|
- refId: B
|
||||||
|
datasourceUid: __expr__
|
||||||
|
model:
|
||||||
|
expression: A
|
||||||
|
intervalMs: 60000
|
||||||
|
maxDataPoints: 43200
|
||||||
|
reducer: last
|
||||||
|
type: reduce
|
||||||
|
- refId: C
|
||||||
|
datasourceUid: __expr__
|
||||||
|
model:
|
||||||
|
expression: B
|
||||||
|
intervalMs: 60000
|
||||||
|
maxDataPoints: 43200
|
||||||
|
type: threshold
|
||||||
|
conditions:
|
||||||
|
- evaluator:
|
||||||
|
params: [0]
|
||||||
|
type: gt
|
||||||
|
operator:
|
||||||
|
type: and
|
||||||
|
reducer:
|
||||||
|
type: last
|
||||||
|
type: query
|
||||||
|
noDataState: OK
|
||||||
|
execErrState: Alerting
|
||||||
|
annotations:
|
||||||
|
summary: "One or more PVCs are stale, missing, or failed per Soteria backup health"
|
||||||
|
labels:
|
||||||
|
severity: warning
|
||||||
- orgId: 1
|
- orgId: 1
|
||||||
name: ariadne
|
name: ariadne
|
||||||
folder: Alerts
|
folder: Alerts
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user