Compare commits

...

5 Commits

18 changed files with 775 additions and 6 deletions


@@ -475,7 +475,7 @@ PLATFORM_TEST_SUCCESS_RATE_24H_BY_SUITE = (
f'/ clamp_min((sum by (suite) (increase(platform_quality_gate_runs_total{{suite=~"{PLATFORM_TEST_SUITE_MATCHER}"}}[24h]))), 1)) '
f'and on(suite) ((sum by (suite) (increase(platform_quality_gate_runs_total{{suite=~"{PLATFORM_TEST_SUITE_MATCHER}"}}[24h]))) > 0))'
)
-PVC_BACKUP_AGE_HOURS_BY_PVC = "sort_desc(max by (namespace, pvc) (pvc_backup_age_hours))"
+PVC_BACKUP_AGE_HOURS_BY_PVC = "sort_desc(max by (namespace, pvc) (pvc_backup_age_hours or on(namespace, pvc) ((1 - pvc_backup_health) * 999)))"
ANANKE_SELECTOR = 'job="ananke-power"'
ANANKE_UPS_DB_NAME = "Pyrphoros"
ANANKE_UPS_DB_NODE = "titan-db"
@@ -1566,7 +1566,7 @@ def build_overview():
)
panels[-1]["links"] = link_to("atlas-storage")
panels[-1]["description"] = (
"Oldest backup age in hours by PVC. This panel is reserved for the upcoming PVC backup health feed and will show no data until those metrics are published."
"Oldest successful backup age in hours by PVC. PVCs with missing or unhealthy backup state are forced to 999h so the red bars stay visible in the overview."
)
panels.append(
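
A quick way to sanity-check the new fallback expression, sketched under the assumption that the VictoriaMetrics port-forward from the restore drill doc is running. The `or on(namespace, pvc)` arm backfills only PVCs that have no `pvc_backup_age_hours` series, yielding `999` when `pvc_backup_health` is `0` and `0` when it is `1`:

```sh
# Hypothetical local check; assumes:
#   kubectl -n monitoring port-forward svc/victoria-metrics-k8s-stack 8428:8428
curl -fsS --get 'http://127.0.0.1:8428/api/v1/query' \
  --data-urlencode 'query=sort_desc(max by (namespace, pvc) (pvc_backup_age_hours or on(namespace, pvc) ((1 - pvc_backup_health) * 999)))' \
  | jq -r '.data.result[] | "\(.metric.namespace)/\(.metric.pvc): \(.value[1])h"'
```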


@@ -23,6 +23,7 @@ resources:
- oneoffs/synapse-oidc-secret-ensure-job.yaml
- oneoffs/logs-oidc-secret-ensure-job.yaml
- oneoffs/metis-oidc-secret-ensure-job.yaml
+- oneoffs/soteria-oidc-secret-ensure-job.yaml
- oneoffs/metis-ssh-keys-secret-ensure-job.yaml
- oneoffs/harbor-oidc-secret-ensure-job.yaml
- oneoffs/vault-oidc-secret-ensure-job.yaml


@@ -0,0 +1,198 @@
# services/keycloak/oneoffs/soteria-oidc-secret-ensure-job.yaml
# One-off job for sso/soteria-oidc-secret-ensure-1.
# Purpose: ensure the Soteria oauth2-proxy OIDC client and Vault secret exist.
# Keep this completed Job around; bump the suffix if it ever needs to be rerun.
apiVersion: batch/v1
kind: Job
metadata:
name: soteria-oidc-secret-ensure-1
namespace: sso
spec:
backoffLimit: 0
template:
metadata:
annotations:
vault.hashicorp.com/agent-inject: "true"
vault.hashicorp.com/agent-pre-populate-only: "true"
vault.hashicorp.com/role: "sso-secrets"
vault.hashicorp.com/agent-inject-secret-keycloak-admin-env.sh: "kv/data/atlas/shared/keycloak-admin"
vault.hashicorp.com/agent-inject-template-keycloak-admin-env.sh: |
{{ with secret "kv/data/atlas/shared/keycloak-admin" }}
export KEYCLOAK_ADMIN="{{ .Data.data.username }}"
export KEYCLOAK_ADMIN_USER="{{ .Data.data.username }}"
export KEYCLOAK_ADMIN_PASSWORD="{{ .Data.data.password }}"
{{ end }}
spec:
serviceAccountName: mas-secrets-ensure
restartPolicy: Never
affinity:
nodeAffinity:
requiredDuringSchedulingIgnoredDuringExecution:
nodeSelectorTerms:
- matchExpressions:
- key: node-role.kubernetes.io/worker
operator: Exists
preferredDuringSchedulingIgnoredDuringExecution:
- weight: 100
preference:
matchExpressions:
- key: kubernetes.io/arch
operator: In
values: ["arm64"]
containers:
- name: apply
image: bitnami/kubectl@sha256:554ab88b1858e8424c55de37ad417b16f2a0e65d1607aa0f3fe3ce9b9f10b131
command: ["/bin/sh", "-c"]
args:
- |
set -euo pipefail
. /vault/secrets/keycloak-admin-env.sh
KC_URL="http://keycloak.sso.svc.cluster.local"
ACCESS_TOKEN=""
for attempt in 1 2 3 4 5; do
TOKEN_JSON="$(curl -sS -X POST "$KC_URL/realms/master/protocol/openid-connect/token" \
-H 'Content-Type: application/x-www-form-urlencoded' \
-d "grant_type=password" \
-d "client_id=admin-cli" \
-d "username=${KEYCLOAK_ADMIN}" \
-d "password=${KEYCLOAK_ADMIN_PASSWORD}" || true)"
ACCESS_TOKEN="$(echo "$TOKEN_JSON" | jq -r '.access_token' 2>/dev/null || true)"
if [ -n "$ACCESS_TOKEN" ] && [ "$ACCESS_TOKEN" != "null" ]; then
break
fi
echo "Keycloak token request failed (attempt ${attempt})" >&2
sleep $((attempt * 2))
done
if [ -z "$ACCESS_TOKEN" ] || [ "$ACCESS_TOKEN" = "null" ]; then
echo "Failed to fetch Keycloak admin token" >&2
exit 1
fi
CLIENT_QUERY="$(curl -sS -H "Authorization: Bearer ${ACCESS_TOKEN}" \
"$KC_URL/admin/realms/atlas/clients?clientId=soteria" || true)"
CLIENT_ID="$(echo "$CLIENT_QUERY" | jq -r '.[0].id' 2>/dev/null || true)"
if [ -z "$CLIENT_ID" ] || [ "$CLIENT_ID" = "null" ]; then
create_payload='{"clientId":"soteria","enabled":true,"protocol":"openid-connect","publicClient":false,"standardFlowEnabled":true,"implicitFlowEnabled":false,"directAccessGrantsEnabled":false,"serviceAccountsEnabled":false,"redirectUris":["https://backup.bstein.dev/oauth2/callback"],"webOrigins":["https://backup.bstein.dev"],"rootUrl":"https://backup.bstein.dev","baseUrl":"/"}'
status="$(curl -sS -o /dev/null -w "%{http_code}" -X POST \
-H "Authorization: Bearer ${ACCESS_TOKEN}" \
-H 'Content-Type: application/json' \
-d "${create_payload}" \
"$KC_URL/admin/realms/atlas/clients")"
if [ "$status" != "201" ] && [ "$status" != "204" ] && [ "$status" != "409" ]; then
echo "Keycloak client create failed (status ${status})" >&2
exit 1
fi
CLIENT_QUERY="$(curl -sS -H "Authorization: Bearer ${ACCESS_TOKEN}" \
"$KC_URL/admin/realms/atlas/clients?clientId=soteria" || true)"
CLIENT_ID="$(echo "$CLIENT_QUERY" | jq -r '.[0].id' 2>/dev/null || true)"
fi
if [ -z "$CLIENT_ID" ] || [ "$CLIENT_ID" = "null" ]; then
echo "Keycloak client soteria not found" >&2
exit 1
fi
SCOPE_ID="$(curl -sS -H "Authorization: Bearer ${ACCESS_TOKEN}" \
"$KC_URL/admin/realms/atlas/client-scopes?search=groups" | jq -r '.[] | select(.name=="groups") | .id' 2>/dev/null | head -n1 || true)"
if [ -z "$SCOPE_ID" ] || [ "$SCOPE_ID" = "null" ]; then
echo "Keycloak client scope groups not found" >&2
exit 1
fi
DEFAULT_SCOPES="$(curl -sS -H "Authorization: Bearer ${ACCESS_TOKEN}" \
"$KC_URL/admin/realms/atlas/clients/${CLIENT_ID}/default-client-scopes" || true)"
OPTIONAL_SCOPES="$(curl -sS -H "Authorization: Bearer ${ACCESS_TOKEN}" \
"$KC_URL/admin/realms/atlas/clients/${CLIENT_ID}/optional-client-scopes" || true)"
if ! echo "$DEFAULT_SCOPES" | jq -e '.[] | select(.name=="groups")' >/dev/null 2>&1 \
&& ! echo "$OPTIONAL_SCOPES" | jq -e '.[] | select(.name=="groups")' >/dev/null 2>&1; then
status="$(curl -sS -o /dev/null -w "%{http_code}" -X PUT \
-H "Authorization: Bearer ${ACCESS_TOKEN}" \
"$KC_URL/admin/realms/atlas/clients/${CLIENT_ID}/optional-client-scopes/${SCOPE_ID}")"
if [ "$status" != "200" ] && [ "$status" != "201" ] && [ "$status" != "204" ]; then
status="$(curl -sS -o /dev/null -w "%{http_code}" -X POST \
-H "Authorization: Bearer ${ACCESS_TOKEN}" \
"$KC_URL/admin/realms/atlas/clients/${CLIENT_ID}/optional-client-scopes/${SCOPE_ID}")"
if [ "$status" != "200" ] && [ "$status" != "201" ] && [ "$status" != "204" ]; then
echo "Failed to attach groups client scope to soteria (status ${status})" >&2
exit 1
fi
fi
fi
update_payload='{"enabled":true,"clientId":"soteria","protocol":"openid-connect","publicClient":false,"standardFlowEnabled":true,"implicitFlowEnabled":false,"directAccessGrantsEnabled":false,"serviceAccountsEnabled":false,"redirectUris":["https://backup.bstein.dev/oauth2/callback"],"webOrigins":["https://backup.bstein.dev"],"rootUrl":"https://backup.bstein.dev","baseUrl":"/"}'
status="$(curl -sS -o /dev/null -w "%{http_code}" -X PUT \
-H "Authorization: Bearer ${ACCESS_TOKEN}" \
-H 'Content-Type: application/json' \
-d "${update_payload}" \
"$KC_URL/admin/realms/atlas/clients/${CLIENT_ID}")"
if [ "$status" != "204" ]; then
echo "Keycloak client update failed (status ${status})" >&2
exit 1
fi
CLIENT_SECRET="$(curl -sS -H "Authorization: Bearer ${ACCESS_TOKEN}" \
"$KC_URL/admin/realms/atlas/clients/${CLIENT_ID}/client-secret" | jq -r '.value' 2>/dev/null || true)"
if [ -z "$CLIENT_SECRET" ] || [ "$CLIENT_SECRET" = "null" ]; then
echo "Keycloak client secret not found" >&2
exit 1
fi
vault_addr="${VAULT_ADDR:-http://vault.vault.svc.cluster.local:8200}"
vault_role="${VAULT_ROLE:-sso-secrets}"
jwt="$(cat /var/run/secrets/kubernetes.io/serviceaccount/token)"
login_payload="$(jq -nc --arg jwt "${jwt}" --arg role "${vault_role}" '{jwt:$jwt, role:$role}')"
vault_token="$(curl -sS --request POST --data "${login_payload}" \
"${vault_addr}/v1/auth/kubernetes/login" | jq -r '.auth.client_token')"
if [ -z "${vault_token}" ] || [ "${vault_token}" = "null" ]; then
echo "vault login failed" >&2
exit 1
fi
read_status="$(curl -sS -o /tmp/soteria-oidc-read.json -w "%{http_code}" \
-H "X-Vault-Token: ${vault_token}" \
"${vault_addr}/v1/kv/data/atlas/maintenance/soteria-oidc" || true)"
COOKIE_SECRET=""
if [ "${read_status}" = "200" ]; then
COOKIE_SECRET="$(jq -r '.data.data.cookie_secret // empty' /tmp/soteria-oidc-read.json)"
elif [ "${read_status}" != "404" ]; then
echo "Vault read failed (status ${read_status})" >&2
cat /tmp/soteria-oidc-read.json >&2 || true
exit 1
fi
if [ -n "${COOKIE_SECRET}" ]; then
length="$(printf '%s' "${COOKIE_SECRET}" | wc -c | tr -d ' ')"
if [ "${length}" != "16" ] && [ "${length}" != "24" ] && [ "${length}" != "32" ]; then
COOKIE_SECRET=""
fi
fi
if [ -z "${COOKIE_SECRET}" ]; then
COOKIE_SECRET="$(openssl rand -hex 16 | tr -d '\n')"
fi
payload="$(jq -nc \
--arg client_id "soteria" \
--arg client_secret "${CLIENT_SECRET}" \
--arg cookie_secret "${COOKIE_SECRET}" \
'{data:{client_id:$client_id,client_secret:$client_secret,cookie_secret:$cookie_secret}}')"
write_status="$(curl -sS -o /tmp/soteria-oidc-write.json -w "%{http_code}" -X POST \
-H "X-Vault-Token: ${vault_token}" \
-H 'Content-Type: application/json' \
-d "${payload}" "${vault_addr}/v1/kv/data/atlas/maintenance/soteria-oidc")"
if [ "${write_status}" != "200" ] && [ "${write_status}" != "204" ]; then
echo "Vault write failed (status ${write_status})" >&2
cat /tmp/soteria-oidc-write.json >&2 || true
exit 1
fi
verify_status="$(curl -sS -o /tmp/soteria-oidc-verify.json -w "%{http_code}" \
-H "X-Vault-Token: ${vault_token}" \
"${vault_addr}/v1/kv/data/atlas/maintenance/soteria-oidc" || true)"
if [ "${verify_status}" != "200" ]; then
echo "Vault verify failed (status ${verify_status})" >&2
cat /tmp/soteria-oidc-verify.json >&2 || true
exit 1
fi
echo "Soteria OIDC secret ready in Vault"


@@ -0,0 +1,60 @@
# Soteria PVC Restore Drill (backup.bstein.dev)
Use this checklist after meaningful Soteria backup, restore, auth, or alerting changes.
## Production Restore Drill Checklist
1. Verify baseline health before touching restores.
- `flux get kustomizations -n flux-system maintenance`
- `kubectl -n maintenance get deploy soteria oauth2-proxy-soteria`
2. Confirm operator access and source safety.
- Operator must be in Keycloak group `admin` or `maintenance`.
- Choose a real source PVC that is expected to be backed up, not a throwaway test PVC.
3. Run the UI flow at `https://backup.bstein.dev`.
- Sign in via Keycloak.
- In `PVC Inventory`, select source namespace and PVC.
- Click `Backup now` and wait for success in `Last Action`.
- Click `Restore` and pick a completed snapshot.
- Set `Target namespace` and unique `Target PVC name` (`restore-<source-pvc>-<date>`).
- Click `Create restore PVC`.
4. Validate restore output.
- `kubectl -n <target-namespace> get pvc <target-pvc>`
- If workload-level validation is required, attach a temporary pod and inspect expected files/data.
5. Clean up.
- `kubectl -n <target-namespace> delete pvc <target-pvc>`
- Remove any detached restore volume via the Longhorn UI/API if one remains.
## Alert Query Verification (`maint-soteria-*`)
Start a local query endpoint:
`kubectl -n monitoring port-forward svc/victoria-metrics-k8s-stack 8428:8428`
Validate each alert expression directly.
1. `maint-soteria-refresh-stale` (`time() - soteria_inventory_refresh_timestamp_seconds`, threshold `> 900`).
- `curl -fsS --get 'http://127.0.0.1:8428/api/v1/query' --data-urlencode 'query=time() - soteria_inventory_refresh_timestamp_seconds'`
- `curl -fsS --get 'http://127.0.0.1:8428/api/v1/query' --data-urlencode 'query=(time() - soteria_inventory_refresh_timestamp_seconds) > bool 900'`
- Healthy expectation: the age stays below `900` and the threshold query returns `0`.
2. `maint-soteria-backup-unhealthy` (`sum((1 - pvc_backup_health{driver="longhorn"}) > bool 0) or on() vector(0)`, threshold `> 0`).
- `curl -fsS --get 'http://127.0.0.1:8428/api/v1/query' --data-urlencode 'query=sum((1 - pvc_backup_health{driver="longhorn"}) > bool 0) or on() vector(0)'`
- `curl -fsS --get 'http://127.0.0.1:8428/api/v1/query' --data-urlencode 'query=(1 - pvc_backup_health{driver="longhorn"}) > bool 0'`
- `curl -fsS --get 'http://127.0.0.1:8428/api/v1/query' --data-urlencode 'query=max by (namespace,pvc) (pvc_backup_age_hours{driver="longhorn"})'`
- Healthy expectation: unhealthy count is `0`; no series should be `1` in the per-PVC unhealthy query.
3. `maint-soteria-authz-denials` (`sum(increase(soteria_authz_denials_total[15m])) or on() vector(0)`, threshold `> 9` for 10m).
- `curl -fsS --get 'http://127.0.0.1:8428/api/v1/query' --data-urlencode 'query=sum(increase(soteria_authz_denials_total[15m])) or on() vector(0)'`
- `curl -fsS --get 'http://127.0.0.1:8428/api/v1/query' --data-urlencode 'query=sum by (reason) (increase(soteria_authz_denials_total[15m]))'`
- Healthy expectation: total remains below `10` in normal operation; spikes should map to expected `reason` labels.
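
The three checks above can be run in one pass; a minimal sketch assuming the same port-forward as in this section:

```sh
#!/bin/sh
# Hypothetical helper: query each maint-soteria-* alert expression once.
set -eu
VM="http://127.0.0.1:8428"
q() {
  curl -fsS --get "$VM/api/v1/query" --data-urlencode "query=$1" \
    | jq -r '.data.result[0].value[1] // "no data"'
}
echo "refresh age seconds (alert > 900): $(q 'time() - soteria_inventory_refresh_timestamp_seconds')"
echo "unhealthy PVC count (alert > 0):   $(q 'sum((1 - pvc_backup_health{driver="longhorn"}) > bool 0) or on() vector(0)')"
echo "authz denials 15m   (alert > 9):   $(q 'sum(increase(soteria_authz_denials_total[15m])) or on() vector(0)')"
```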
## Failure Triage
- `401/403` on UI or API:
- Verify oauth2-proxy group claims include `admin` or `maintenance`.
- Restore conflict:
- Target PVC already exists; choose a new target PVC name.
- `maint-soteria-refresh-stale` firing:
- Check Soteria pod health and `/metrics` scrape reachability from `monitoring`.
- `maint-soteria-backup-unhealthy` firing:
- Inspect `pvc_backup_health` and `pvc_backup_age_hours` to identify stale or missing backups.
- `maint-soteria-authz-denials` firing:
- Confirm expected OIDC groups and inspect denial `reason` labels for policy or header regressions.


@@ -35,6 +35,11 @@ resources:
- node-image-sweeper-daemonset.yaml
- image-sweeper-cronjob.yaml
- metis-service.yaml
+- soteria-networkpolicy.yaml
+- oauth2-proxy-soteria-networkpolicy.yaml
+- soteria-ingress.yaml
+- soteria-certificate.yaml
+- oauth2-proxy-soteria.yaml
- oauth2-proxy-metis.yaml
- metis-certificate.yaml
- metis-ingress.yaml
@@ -43,6 +48,8 @@ images:
newTag: 0.1.0-22 # {"$imagepolicy": "maintenance:ariadne:tag"}
- name: registry.bstein.dev/bstein/metis
newTag: 0.1.0-9-amd64
+- name: registry.bstein.dev/bstein/soteria
+  newTag: 0.1.0-21 # {"$imagepolicy": "maintenance:soteria:tag"}
configMapGenerator:
- name: disable-k3s-traefik-script
namespace: maintenance
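
The trailing marker on the soteria tag is Flux's image-automation anchor; a sketch for confirming the automation is wired up, assuming the flux CLI and image objects named `soteria` in `maintenance` (implied by `maintenance:soteria:tag`):

```sh
flux get image repository soteria -n maintenance
flux get image policy soteria -n maintenance   # latest tag should match newTag above
```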


@@ -0,0 +1,23 @@
# services/maintenance/oauth2-proxy-soteria-networkpolicy.yaml
apiVersion: networking.k8s.io/v1
kind: NetworkPolicy
metadata:
name: oauth2-proxy-soteria-ingress
namespace: maintenance
spec:
podSelector:
matchLabels:
app: oauth2-proxy-soteria
policyTypes:
- Ingress
ingress:
- from:
- namespaceSelector:
matchLabels:
kubernetes.io/metadata.name: traefik
podSelector:
matchLabels:
app: traefik
ports:
- protocol: TCP
port: 4180


@@ -0,0 +1,120 @@
# services/maintenance/oauth2-proxy-soteria.yaml
apiVersion: v1
kind: Service
metadata:
name: oauth2-proxy-soteria
namespace: maintenance
labels:
app: oauth2-proxy-soteria
spec:
ports:
- name: http
port: 80
targetPort: 4180
selector:
app: oauth2-proxy-soteria
---
apiVersion: apps/v1
kind: Deployment
metadata:
name: oauth2-proxy-soteria
namespace: maintenance
labels:
app: oauth2-proxy-soteria
spec:
replicas: 2
selector:
matchLabels:
app: oauth2-proxy-soteria
template:
metadata:
labels:
app: oauth2-proxy-soteria
annotations:
vault.hashicorp.com/agent-inject: "true"
vault.hashicorp.com/role: "maintenance"
vault.hashicorp.com/agent-inject-secret-oidc-config: "kv/data/atlas/maintenance/soteria-oidc"
vault.hashicorp.com/agent-inject-template-oidc-config: |
{{- with secret "kv/data/atlas/maintenance/soteria-oidc" -}}
client_id = "{{ .Data.data.client_id }}"
client_secret = "{{ .Data.data.client_secret }}"
cookie_secret = "{{ .Data.data.cookie_secret }}"
{{- end -}}
spec:
serviceAccountName: maintenance-vault-sync
nodeSelector:
node-role.kubernetes.io/worker: "true"
affinity:
nodeAffinity:
requiredDuringSchedulingIgnoredDuringExecution:
nodeSelectorTerms:
- matchExpressions:
- key: kubernetes.io/arch
operator: In
values: ["amd64","arm64"]
preferredDuringSchedulingIgnoredDuringExecution:
- weight: 100
preference:
matchExpressions:
- key: hardware
operator: In
values: ["rpi5"]
- weight: 100
preference:
matchExpressions:
- key: kubernetes.io/hostname
operator: NotIn
values: ["titan-13","titan-15","titan-17","titan-19"]
containers:
- name: oauth2-proxy
image: quay.io/oauth2-proxy/oauth2-proxy:v7.6.0
imagePullPolicy: IfNotPresent
args:
- --provider=oidc
- --config=/vault/secrets/oidc-config
- --redirect-url=https://backup.bstein.dev/oauth2/callback
- --oidc-issuer-url=https://sso.bstein.dev/realms/atlas
- --scope=openid profile email groups
- --email-domain=*
- --allowed-group=admin
- --allowed-group=/admin
- --allowed-group=maintenance
- --allowed-group=/maintenance
- --set-xauthrequest=true
- --pass-user-headers=true
- --cookie-secure=true
- --cookie-samesite=lax
- --cookie-refresh=20m
- --cookie-expire=168h
- --insecure-oidc-allow-unverified-email=true
- --upstream=http://soteria.maintenance.svc.cluster.local
- --http-address=0.0.0.0:4180
- --skip-provider-button=true
- --approval-prompt=auto
- --skip-jwt-bearer-tokens=true
- --oidc-groups-claim=groups
- --cookie-domain=backup.bstein.dev
ports:
- containerPort: 4180
name: http
readinessProbe:
httpGet:
path: /ping
port: 4180
initialDelaySeconds: 5
periodSeconds: 10
livenessProbe:
httpGet:
path: /ping
port: 4180
initialDelaySeconds: 20
periodSeconds: 20
resources:
requests:
cpu: 25m
memory: 64Mi
limits:
cpu: 250m
memory: 256Mi
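
A rollout smoke test for the proxy, sketched against the endpoints this manifest wires up (`/ping` backs the probes; with `--skip-provider-button` an unauthenticated request to the site should 302 straight to Keycloak):

```sh
kubectl -n maintenance rollout status deploy/oauth2-proxy-soteria
kubectl -n maintenance port-forward svc/oauth2-proxy-soteria 4180:80 &
sleep 2  # give the port-forward a moment
curl -fsS http://127.0.0.1:4180/ping              # expect: OK
curl -sSI https://backup.bstein.dev/ | head -n 3  # expect a 302 with Location on sso.bstein.dev
```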


@@ -0,0 +1,13 @@
# services/maintenance/soteria-certificate.yaml
apiVersion: cert-manager.io/v1
kind: Certificate
metadata:
name: backup-tls
namespace: maintenance
spec:
secretName: backup-tls
issuerRef:
kind: ClusterIssuer
name: letsencrypt
dnsNames:
- backup.bstein.dev
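
To confirm issuance before relying on the ingress, a quick check (cert-manager sets the Ready condition and creates the `backup-tls` secret once the ACME order completes):

```sh
kubectl -n maintenance get certificate backup-tls   # READY should be True
kubectl -n maintenance get secret backup-tls
```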


@@ -0,0 +1,14 @@
# services/maintenance/soteria-configmap.yaml
apiVersion: v1
kind: ConfigMap
metadata:
name: soteria
namespace: maintenance
data:
SOTERIA_BACKUP_DRIVER: longhorn
SOTERIA_LONGHORN_URL: http://longhorn-backend.longhorn-system.svc:9500
SOTERIA_LONGHORN_BACKUP_MODE: incremental
SOTERIA_AUTH_REQUIRED: "true"
SOTERIA_ALLOWED_GROUPS: admin,maintenance
SOTERIA_BACKUP_MAX_AGE_HOURS: "24"
SOTERIA_METRICS_REFRESH_SECONDS: "300"
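
A hedged way to see the `SOTERIA_BACKUP_MAX_AGE_HOURS` threshold in action, assuming `pvc_backup_health` drops to `0` once a PVC's newest backup exceeds that age (24h here); uses the drill doc's port-forward:

```sh
curl -fsS --get 'http://127.0.0.1:8428/api/v1/query' \
  --data-urlencode 'query=max by (namespace,pvc) (pvc_backup_age_hours{driver="longhorn"}) > 24' \
  | jq -r '.data.result[] | "\(.metric.namespace)/\(.metric.pvc)"'
```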


@@ -0,0 +1,76 @@
# services/maintenance/soteria-deployment.yaml
apiVersion: apps/v1
kind: Deployment
metadata:
name: soteria
namespace: maintenance
labels:
app: soteria
spec:
replicas: 1
revisionHistoryLimit: 3
selector:
matchLabels:
app: soteria
template:
metadata:
labels:
app: soteria
spec:
serviceAccountName: soteria
nodeSelector:
kubernetes.io/arch: arm64
node-role.kubernetes.io/worker: "true"
affinity:
nodeAffinity:
preferredDuringSchedulingIgnoredDuringExecution:
- weight: 90
preference:
matchExpressions:
- key: hardware
operator: In
values: ["rpi5"]
- weight: 50
preference:
matchExpressions:
- key: hardware
operator: In
values: ["rpi4"]
containers:
- name: soteria
image: registry.bstein.dev/bstein/soteria:0.1.0-21
imagePullPolicy: Always
envFrom:
- configMapRef:
name: soteria
ports:
- name: http
containerPort: 8080
livenessProbe:
httpGet:
path: /healthz
port: http
initialDelaySeconds: 5
periodSeconds: 10
timeoutSeconds: 2
readinessProbe:
httpGet:
path: /readyz
port: http
initialDelaySeconds: 2
periodSeconds: 5
timeoutSeconds: 2
resources:
requests:
cpu: 50m
memory: 64Mi
limits:
cpu: 200m
memory: 256Mi
securityContext:
allowPrivilegeEscalation: false
capabilities:
drop: ["ALL"]
runAsNonRoot: true
runAsUser: 65532
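
A quick probe check after rollout, going through the service (service port 80 maps to the pod's `http` port):

```sh
kubectl -n maintenance rollout status deploy/soteria
kubectl -n maintenance port-forward svc/soteria 8080:80 &
sleep 2  # give the port-forward a moment
curl -fsS http://127.0.0.1:8080/healthz
curl -fsS http://127.0.0.1:8080/readyz
```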


@@ -0,0 +1,27 @@
# services/maintenance/soteria-ingress.yaml
apiVersion: networking.k8s.io/v1
kind: Ingress
metadata:
name: soteria
namespace: maintenance
annotations:
kubernetes.io/ingress.class: traefik
traefik.ingress.kubernetes.io/router.entrypoints: websecure
traefik.ingress.kubernetes.io/router.tls: "true"
traefik.ingress.kubernetes.io/router.middlewares: ""
spec:
ingressClassName: traefik
tls:
- hosts: ["backup.bstein.dev"]
secretName: backup-tls
rules:
- host: backup.bstein.dev
http:
paths:
- path: /
pathType: Prefix
backend:
service:
name: oauth2-proxy-soteria
port:
number: 80


@@ -0,0 +1,27 @@
# services/maintenance/soteria-networkpolicy.yaml
apiVersion: networking.k8s.io/v1
kind: NetworkPolicy
metadata:
name: soteria-ingress
namespace: maintenance
spec:
podSelector:
matchLabels:
app: soteria
policyTypes:
- Ingress
ingress:
- from:
- podSelector:
matchLabels:
app: oauth2-proxy-soteria
ports:
- protocol: TCP
port: 8080
- from:
- namespaceSelector:
matchLabels:
kubernetes.io/metadata.name: monitoring
ports:
- protocol: TCP
port: 8080
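
A sketch for verifying the policy from an allowed and a non-allowed namespace (assumes `default` matches neither rule, so the second probe should time out):

```sh
kubectl -n monitoring run np-probe --rm -i --restart=Never --image=curlimages/curl -- \
  curl -m 5 -fsS http://soteria.maintenance.svc.cluster.local/metrics >/dev/null && echo allowed
kubectl -n default run np-probe --rm -i --restart=Never --image=curlimages/curl -- \
  curl -m 5 -sS http://soteria.maintenance.svc.cluster.local/metrics || echo blocked
```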


@@ -0,0 +1,29 @@
# services/maintenance/soteria-rbac.yaml
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRole
metadata:
name: soteria
rules:
- apiGroups: [""]
resources: ["persistentvolumeclaims", "persistentvolumes"]
verbs: ["get", "list"]
- apiGroups: [""]
resources: ["secrets"]
verbs: ["get", "list", "create", "update", "delete"]
- apiGroups: ["batch"]
resources: ["jobs"]
verbs: ["get", "list", "create", "delete"]
---
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRoleBinding
metadata:
name: soteria
roleRef:
apiGroup: rbac.authorization.k8s.io
kind: ClusterRole
name: soteria
subjects:
- kind: ServiceAccount
name: soteria
namespace: maintenance
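
Spot-checking the binding with `kubectl auth can-i` (the last check should come back `no`, since the role grants nothing on pods):

```sh
kubectl auth can-i list persistentvolumeclaims --all-namespaces \
  --as=system:serviceaccount:maintenance:soteria
kubectl auth can-i create jobs -n maintenance \
  --as=system:serviceaccount:maintenance:soteria
kubectl auth can-i delete pods -n maintenance \
  --as=system:serviceaccount:maintenance:soteria   # expect: no
```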


@@ -0,0 +1,21 @@
# services/maintenance/soteria-service.yaml
apiVersion: v1
kind: Service
metadata:
name: soteria
namespace: maintenance
labels:
app: soteria
annotations:
prometheus.io/scrape: "true"
prometheus.io/port: "80"
prometheus.io/path: "/metrics"
spec:
type: ClusterIP
selector:
app: soteria
ports:
- name: http
port: 80
targetPort: http
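
The `prometheus.io/*` annotations advertise scraping on service port 80 at `/metrics`; a sketch for eyeballing the exposed series:

```sh
kubectl -n maintenance port-forward svc/soteria 8080:80 &
sleep 2  # give the port-forward a moment
curl -fsS http://127.0.0.1:8080/metrics \
  | grep -E 'soteria_inventory_refresh_timestamp_seconds|pvc_backup_(age_hours|health)' | head
```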


@@ -0,0 +1,9 @@
# services/maintenance/soteria-serviceaccount.yaml
apiVersion: v1
kind: ServiceAccount
metadata:
name: soteria
namespace: maintenance
imagePullSecrets:
- name: harbor-regcred


@@ -1970,7 +1970,7 @@
},
"targets": [
{
"expr": "sort_desc(max by (namespace, pvc) (pvc_backup_age_hours))",
"expr": "sort_desc(max by (namespace, pvc) (pvc_backup_age_hours or on(namespace, pvc) ((1 - pvc_backup_health) * 999)))",
"refId": "A",
"legendFormat": "{{namespace}}/{{pvc}}",
"instant": true
@@ -2034,7 +2034,7 @@
"targetBlank": true
}
],
"description": "Oldest backup age in hours by PVC. This panel is reserved for the upcoming PVC backup health feed and will show no data until those metrics are published."
"description": "Oldest successful backup age in hours by PVC. PVCs with missing or unhealthy backup state are forced to 999h so the red bars stay visible in the overview."
},
{
"id": 30,


@@ -447,6 +447,150 @@ data:
summary: "Legacy cronjob alert disabled"
labels:
severity: info
- uid: maint-soteria-refresh-stale
title: "Soteria inventory refresh stale (>15m)"
condition: C
for: "15m"
data:
- refId: A
relativeTimeRange:
from: 900
to: 0
datasourceUid: atlas-vm
model:
expr: time() - soteria_inventory_refresh_timestamp_seconds
intervalMs: 60000
maxDataPoints: 43200
legendFormat: soteria-refresh-age-seconds
datasource:
type: prometheus
uid: atlas-vm
- refId: B
datasourceUid: __expr__
model:
expression: A
intervalMs: 60000
maxDataPoints: 43200
reducer: last
type: reduce
- refId: C
datasourceUid: __expr__
model:
expression: B
intervalMs: 60000
maxDataPoints: 43200
type: threshold
conditions:
- evaluator:
params: [900]
type: gt
operator:
type: and
reducer:
type: last
type: query
noDataState: Alerting
execErrState: Alerting
annotations:
summary: "Soteria inventory telemetry has not refreshed in >15m"
labels:
severity: warning
- uid: maint-soteria-backup-unhealthy
title: "Soteria reports unhealthy PVC backups"
condition: C
for: "10m"
data:
- refId: A
relativeTimeRange:
from: 600
to: 0
datasourceUid: atlas-vm
model:
expr: sum((1 - pvc_backup_health{driver="longhorn"}) > bool 0) or on() vector(0)
intervalMs: 60000
maxDataPoints: 43200
legendFormat: unhealthy-pvcs
datasource:
type: prometheus
uid: atlas-vm
- refId: B
datasourceUid: __expr__
model:
expression: A
intervalMs: 60000
maxDataPoints: 43200
reducer: last
type: reduce
- refId: C
datasourceUid: __expr__
model:
expression: B
intervalMs: 60000
maxDataPoints: 43200
type: threshold
conditions:
- evaluator:
params: [0]
type: gt
operator:
type: and
reducer:
type: last
type: query
noDataState: OK
execErrState: Alerting
annotations:
summary: "One or more PVCs are stale, missing, or failed per Soteria backup health"
labels:
severity: warning
- uid: maint-soteria-authz-denials
title: "Soteria authorization denials elevated"
condition: C
for: "10m"
data:
- refId: A
relativeTimeRange:
from: 900
to: 0
datasourceUid: atlas-vm
model:
expr: sum(increase(soteria_authz_denials_total[15m])) or on() vector(0)
intervalMs: 60000
maxDataPoints: 43200
legendFormat: soteria-authz-denials-15m
datasource:
type: prometheus
uid: atlas-vm
- refId: B
datasourceUid: __expr__
model:
expression: A
intervalMs: 60000
maxDataPoints: 43200
reducer: last
type: reduce
- refId: C
datasourceUid: __expr__
model:
expression: B
intervalMs: 60000
maxDataPoints: 43200
type: threshold
conditions:
- evaluator:
params: [10]
type: gt
operator:
type: and
reducer:
type: last
type: query
noDataState: OK
execErrState: Alerting
annotations:
summary: "Soteria saw >10 authorization denials in 15m"
labels:
severity: warning
- orgId: 1
name: ariadne
folder: Alerts
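
Once provisioned, the three `maint-soteria-*` rules can be listed through Grafana's Prometheus-compatible rules endpoint; a sketch with placeholder `GRAFANA_URL`/`GRAFANA_TOKEN`:

```sh
curl -fsS -H "Authorization: Bearer ${GRAFANA_TOKEN}" \
  "${GRAFANA_URL}/api/prometheus/grafana/api/v1/rules" \
  | jq -r '.data.groups[].rules[].name' | grep -i soteria
```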


@@ -1979,7 +1979,7 @@ data:
},
"targets": [
{
"expr": "sort_desc(max by (namespace, pvc) (pvc_backup_age_hours))",
"expr": "sort_desc(max by (namespace, pvc) (pvc_backup_age_hours or on(namespace, pvc) ((1 - pvc_backup_health) * 999)))",
"refId": "A",
"legendFormat": "{{namespace}}/{{pvc}}",
"instant": true
@@ -2043,7 +2043,7 @@ data:
"targetBlank": true
}
],
"description": "Oldest backup age in hours by PVC. This panel is reserved for the upcoming PVC backup health feed and will show no data until those metrics are published."
"description": "Oldest successful backup age in hours by PVC. PVCs with missing or unhealthy backup state are forced to 999h so the red bars stay visible in the overview."
},
{
"id": 30,