maintenance(soteria): add protected UI, OIDC bootstrap, and backup health panel wiring

This commit is contained in:
Brad Stein 2026-04-12 11:16:29 -03:00
parent 95bc3953d1
commit 96f923ae4c
12 changed files with 481 additions and 7 deletions

View File

@ -538,7 +538,7 @@ PLATFORM_TEST_SUCCESS_RATE_24H_BY_SUITE = (
f'/ clamp_min((sum by (suite) (increase(platform_quality_gate_runs_total{{suite=~"{PLATFORM_TEST_SUITE_MATCHER}"}}[24h]))), 1)) '
f'and on(suite) ((sum by (suite) (increase(platform_quality_gate_runs_total{{suite=~"{PLATFORM_TEST_SUITE_MATCHER}"}}[24h]))) > 0))'
)
PVC_BACKUP_AGE_HOURS_BY_PVC = "sort_desc(max by (namespace, pvc) (pvc_backup_age_hours))"
PVC_BACKUP_AGE_HOURS_BY_PVC = "sort_desc(max by (namespace, pvc) (pvc_backup_age_hours or on(namespace, pvc) ((1 - pvc_backup_health) * 999)))"
ANANKE_SELECTOR = 'job="ananke-power"'
ANANKE_UPS_DB_NAME = "Pyrphoros"
ANANKE_UPS_DB_NODE = "titan-db"
@ -1627,7 +1627,7 @@ def build_overview():
)
panels[-1]["links"] = link_to("atlas-storage")
panels[-1]["description"] = (
"Oldest backup age in hours by PVC. This panel is reserved for the upcoming PVC backup health feed and will show no data until those metrics are published."
"Oldest successful backup age in hours by PVC. PVCs with missing or unhealthy backup state are forced to 999h so the red bars stay visible in the overview."
)
panels.append(

View File

@ -23,6 +23,7 @@ resources:
- oneoffs/synapse-oidc-secret-ensure-job.yaml
- oneoffs/logs-oidc-secret-ensure-job.yaml
- oneoffs/metis-oidc-secret-ensure-job.yaml
- oneoffs/soteria-oidc-secret-ensure-job.yaml
- oneoffs/metis-ssh-keys-secret-ensure-job.yaml
- oneoffs/harbor-oidc-secret-ensure-job.yaml
- oneoffs/vault-oidc-secret-ensure-job.yaml

View File

@ -0,0 +1,198 @@
# services/keycloak/oneoffs/soteria-oidc-secret-ensure-job.yaml
# One-off job for sso/soteria-oidc-secret-ensure-1.
# Purpose: ensure the Soteria oauth2-proxy OIDC client and Vault secret exist.
# Keep this completed Job around; bump the suffix if it ever needs to be rerun.
apiVersion: batch/v1
kind: Job
metadata:
  name: soteria-oidc-secret-ensure-1
  namespace: sso
spec:
  # Fail fast: a broken run should be inspected, not retried blindly.
  backoffLimit: 0
  template:
    metadata:
      annotations:
        # Vault agent injects the Keycloak admin credentials as a sourceable
        # env file before the container starts (pre-populate only: no sidecar).
        vault.hashicorp.com/agent-inject: "true"
        vault.hashicorp.com/agent-pre-populate-only: "true"
        vault.hashicorp.com/role: "sso-secrets"
        vault.hashicorp.com/agent-inject-secret-keycloak-admin-env.sh: "kv/data/atlas/shared/keycloak-admin"
        vault.hashicorp.com/agent-inject-template-keycloak-admin-env.sh: |
          {{ with secret "kv/data/atlas/shared/keycloak-admin" }}
          export KEYCLOAK_ADMIN="{{ .Data.data.username }}"
          export KEYCLOAK_ADMIN_USER="{{ .Data.data.username }}"
          export KEYCLOAK_ADMIN_PASSWORD="{{ .Data.data.password }}"
          {{ end }}
    spec:
      serviceAccountName: mas-secrets-ensure
      restartPolicy: Never
      affinity:
        nodeAffinity:
          # Hard requirement: workers only. Soft preference: arm64 nodes.
          requiredDuringSchedulingIgnoredDuringExecution:
            nodeSelectorTerms:
              - matchExpressions:
                  - key: node-role.kubernetes.io/worker
                    operator: Exists
          preferredDuringSchedulingIgnoredDuringExecution:
            - weight: 100
              preference:
                matchExpressions:
                  - key: kubernetes.io/arch
                    operator: In
                    values: ["arm64"]
      containers:
        - name: apply
          # NOTE(review): the script below needs curl, jq, and openssl in
          # addition to a POSIX shell — confirm this pinned bitnami/kubectl
          # digest ships all three (the sibling ensure-jobs presumably use the
          # same image, but verify).
          image: bitnami/kubectl@sha256:554ab88b1858e8424c55de37ad417b16f2a0e65d1607aa0f3fe3ce9b9f10b131
          command: ["/bin/sh", "-c"]
          # NOTE(review): `set -o pipefail` is not POSIX sh; this works only if
          # the image's /bin/sh is bash-compatible (dash rejects it) — confirm.
          args:
            - |
              set -euo pipefail
              . /vault/secrets/keycloak-admin-env.sh
              KC_URL="http://keycloak.sso.svc.cluster.local"
              ACCESS_TOKEN=""
              for attempt in 1 2 3 4 5; do
                TOKEN_JSON="$(curl -sS -X POST "$KC_URL/realms/master/protocol/openid-connect/token" \
                  -H 'Content-Type: application/x-www-form-urlencoded' \
                  -d "grant_type=password" \
                  -d "client_id=admin-cli" \
                  -d "username=${KEYCLOAK_ADMIN}" \
                  -d "password=${KEYCLOAK_ADMIN_PASSWORD}" || true)"
                ACCESS_TOKEN="$(echo "$TOKEN_JSON" | jq -r '.access_token' 2>/dev/null || true)"
                if [ -n "$ACCESS_TOKEN" ] && [ "$ACCESS_TOKEN" != "null" ]; then
                  break
                fi
                echo "Keycloak token request failed (attempt ${attempt})" >&2
                sleep $((attempt * 2))
              done
              if [ -z "$ACCESS_TOKEN" ] || [ "$ACCESS_TOKEN" = "null" ]; then
                echo "Failed to fetch Keycloak admin token" >&2
                exit 1
              fi
              CLIENT_QUERY="$(curl -sS -H "Authorization: Bearer ${ACCESS_TOKEN}" \
                "$KC_URL/admin/realms/atlas/clients?clientId=soteria" || true)"
              CLIENT_ID="$(echo "$CLIENT_QUERY" | jq -r '.[0].id' 2>/dev/null || true)"
              if [ -z "$CLIENT_ID" ] || [ "$CLIENT_ID" = "null" ]; then
                create_payload='{"clientId":"soteria","enabled":true,"protocol":"openid-connect","publicClient":false,"standardFlowEnabled":true,"implicitFlowEnabled":false,"directAccessGrantsEnabled":false,"serviceAccountsEnabled":false,"redirectUris":["https://backup.bstein.dev/oauth2/callback"],"webOrigins":["https://backup.bstein.dev"],"rootUrl":"https://backup.bstein.dev","baseUrl":"/"}'
                status="$(curl -sS -o /dev/null -w "%{http_code}" -X POST \
                  -H "Authorization: Bearer ${ACCESS_TOKEN}" \
                  -H 'Content-Type: application/json' \
                  -d "${create_payload}" \
                  "$KC_URL/admin/realms/atlas/clients")"
                if [ "$status" != "201" ] && [ "$status" != "204" ] && [ "$status" != "409" ]; then
                  echo "Keycloak client create failed (status ${status})" >&2
                  exit 1
                fi
                CLIENT_QUERY="$(curl -sS -H "Authorization: Bearer ${ACCESS_TOKEN}" \
                  "$KC_URL/admin/realms/atlas/clients?clientId=soteria" || true)"
                CLIENT_ID="$(echo "$CLIENT_QUERY" | jq -r '.[0].id' 2>/dev/null || true)"
              fi
              if [ -z "$CLIENT_ID" ] || [ "$CLIENT_ID" = "null" ]; then
                echo "Keycloak client soteria not found" >&2
                exit 1
              fi
              SCOPE_ID="$(curl -sS -H "Authorization: Bearer ${ACCESS_TOKEN}" \
                "$KC_URL/admin/realms/atlas/client-scopes?search=groups" | jq -r '.[] | select(.name=="groups") | .id' 2>/dev/null | head -n1 || true)"
              if [ -z "$SCOPE_ID" ] || [ "$SCOPE_ID" = "null" ]; then
                echo "Keycloak client scope groups not found" >&2
                exit 1
              fi
              DEFAULT_SCOPES="$(curl -sS -H "Authorization: Bearer ${ACCESS_TOKEN}" \
                "$KC_URL/admin/realms/atlas/clients/${CLIENT_ID}/default-client-scopes" || true)"
              OPTIONAL_SCOPES="$(curl -sS -H "Authorization: Bearer ${ACCESS_TOKEN}" \
                "$KC_URL/admin/realms/atlas/clients/${CLIENT_ID}/optional-client-scopes" || true)"
              if ! echo "$DEFAULT_SCOPES" | jq -e '.[] | select(.name=="groups")' >/dev/null 2>&1 \
                && ! echo "$OPTIONAL_SCOPES" | jq -e '.[] | select(.name=="groups")' >/dev/null 2>&1; then
                status="$(curl -sS -o /dev/null -w "%{http_code}" -X PUT \
                  -H "Authorization: Bearer ${ACCESS_TOKEN}" \
                  "$KC_URL/admin/realms/atlas/clients/${CLIENT_ID}/optional-client-scopes/${SCOPE_ID}")"
                if [ "$status" != "200" ] && [ "$status" != "201" ] && [ "$status" != "204" ]; then
                  status="$(curl -sS -o /dev/null -w "%{http_code}" -X POST \
                    -H "Authorization: Bearer ${ACCESS_TOKEN}" \
                    "$KC_URL/admin/realms/atlas/clients/${CLIENT_ID}/optional-client-scopes/${SCOPE_ID}")"
                  if [ "$status" != "200" ] && [ "$status" != "201" ] && [ "$status" != "204" ]; then
                    echo "Failed to attach groups client scope to soteria (status ${status})" >&2
                    exit 1
                  fi
                fi
              fi
              update_payload='{"enabled":true,"clientId":"soteria","protocol":"openid-connect","publicClient":false,"standardFlowEnabled":true,"implicitFlowEnabled":false,"directAccessGrantsEnabled":false,"serviceAccountsEnabled":false,"redirectUris":["https://backup.bstein.dev/oauth2/callback"],"webOrigins":["https://backup.bstein.dev"],"rootUrl":"https://backup.bstein.dev","baseUrl":"/"}'
              status="$(curl -sS -o /dev/null -w "%{http_code}" -X PUT \
                -H "Authorization: Bearer ${ACCESS_TOKEN}" \
                -H 'Content-Type: application/json' \
                -d "${update_payload}" \
                "$KC_URL/admin/realms/atlas/clients/${CLIENT_ID}")"
              if [ "$status" != "204" ]; then
                echo "Keycloak client update failed (status ${status})" >&2
                exit 1
              fi
              CLIENT_SECRET="$(curl -sS -H "Authorization: Bearer ${ACCESS_TOKEN}" \
                "$KC_URL/admin/realms/atlas/clients/${CLIENT_ID}/client-secret" | jq -r '.value' 2>/dev/null || true)"
              if [ -z "$CLIENT_SECRET" ] || [ "$CLIENT_SECRET" = "null" ]; then
                echo "Keycloak client secret not found" >&2
                exit 1
              fi
              vault_addr="${VAULT_ADDR:-http://vault.vault.svc.cluster.local:8200}"
              vault_role="${VAULT_ROLE:-sso-secrets}"
              jwt="$(cat /var/run/secrets/kubernetes.io/serviceaccount/token)"
              login_payload="$(jq -nc --arg jwt "${jwt}" --arg role "${vault_role}" '{jwt:$jwt, role:$role}')"
              vault_token="$(curl -sS --request POST --data "${login_payload}" \
                "${vault_addr}/v1/auth/kubernetes/login" | jq -r '.auth.client_token')"
              if [ -z "${vault_token}" ] || [ "${vault_token}" = "null" ]; then
                echo "vault login failed" >&2
                exit 1
              fi
              read_status="$(curl -sS -o /tmp/soteria-oidc-read.json -w "%{http_code}" \
                -H "X-Vault-Token: ${vault_token}" \
                "${vault_addr}/v1/kv/data/atlas/maintenance/soteria-oidc" || true)"
              COOKIE_SECRET=""
              if [ "${read_status}" = "200" ]; then
                COOKIE_SECRET="$(jq -r '.data.data.cookie_secret // empty' /tmp/soteria-oidc-read.json)"
              elif [ "${read_status}" != "404" ]; then
                echo "Vault read failed (status ${read_status})" >&2
                cat /tmp/soteria-oidc-read.json >&2 || true
                exit 1
              fi
              if [ -n "${COOKIE_SECRET}" ]; then
                length="$(printf '%s' "${COOKIE_SECRET}" | wc -c | tr -d ' ')"
                if [ "${length}" != "16" ] && [ "${length}" != "24" ] && [ "${length}" != "32" ]; then
                  COOKIE_SECRET=""
                fi
              fi
              if [ -z "${COOKIE_SECRET}" ]; then
                COOKIE_SECRET="$(openssl rand -hex 16 | tr -d '\n')"
              fi
              payload="$(jq -nc \
                --arg client_id "soteria" \
                --arg client_secret "${CLIENT_SECRET}" \
                --arg cookie_secret "${COOKIE_SECRET}" \
                '{data:{client_id:$client_id,client_secret:$client_secret,cookie_secret:$cookie_secret}}')"
              write_status="$(curl -sS -o /tmp/soteria-oidc-write.json -w "%{http_code}" -X POST \
                -H "X-Vault-Token: ${vault_token}" \
                -H 'Content-Type: application/json' \
                -d "${payload}" "${vault_addr}/v1/kv/data/atlas/maintenance/soteria-oidc")"
              if [ "${write_status}" != "200" ] && [ "${write_status}" != "204" ]; then
                echo "Vault write failed (status ${write_status})" >&2
                cat /tmp/soteria-oidc-write.json >&2 || true
                exit 1
              fi
              verify_status="$(curl -sS -o /tmp/soteria-oidc-verify.json -w "%{http_code}" \
                -H "X-Vault-Token: ${vault_token}" \
                "${vault_addr}/v1/kv/data/atlas/maintenance/soteria-oidc" || true)"
              if [ "${verify_status}" != "200" ]; then
                echo "Vault verify failed (status ${verify_status})" >&2
                cat /tmp/soteria-oidc-verify.json >&2 || true
                exit 1
              fi
              echo "Soteria OIDC secret ready in Vault"

View File

@ -37,6 +37,9 @@ resources:
- node-image-sweeper-serviceaccount.yaml
- node-image-sweeper-daemonset.yaml
- metis-service.yaml
- soteria-ingress.yaml
- soteria-certificate.yaml
- oauth2-proxy-soteria.yaml
- oauth2-proxy-metis.yaml
- metis-certificate.yaml
- metis-ingress.yaml
@ -46,7 +49,7 @@ images:
- name: registry.bstein.dev/bstein/metis
newTag: 0.1.0-9-amd64
- name: registry.bstein.dev/bstein/soteria
newTag: 0.1.0-11 # {"$imagepolicy": "maintenance:soteria:tag"}
newTag: 0.1.0-21 # {"$imagepolicy": "maintenance:soteria:tag"}
configMapGenerator:
- name: disable-k3s-traefik-script
namespace: maintenance

View File

@ -0,0 +1,121 @@
# services/maintenance/oauth2-proxy-soteria.yaml
# oauth2-proxy fronting Soteria at backup.bstein.dev: Service + Deployment.
apiVersion: v1
kind: Service
metadata:
  name: oauth2-proxy-soteria
  namespace: maintenance
  labels:
    app: oauth2-proxy-soteria
spec:
  ports:
    - name: http
      port: 80
      targetPort: 4180
  selector:
    app: oauth2-proxy-soteria
---
apiVersion: apps/v1
kind: Deployment
metadata:
  name: oauth2-proxy-soteria
  namespace: maintenance
  labels:
    app: oauth2-proxy-soteria
spec:
  replicas: 2
  selector:
    matchLabels:
      app: oauth2-proxy-soteria
  template:
    metadata:
      labels:
        app: oauth2-proxy-soteria
      annotations:
        # Vault agent renders the OIDC client credentials as an oauth2-proxy
        # config file; the container reads it via --config below.
        vault.hashicorp.com/agent-inject: "true"
        vault.hashicorp.com/role: "maintenance"
        vault.hashicorp.com/agent-inject-secret-oidc-config: "kv/data/atlas/maintenance/soteria-oidc"
        vault.hashicorp.com/agent-inject-template-oidc-config: |
          {{- with secret "kv/data/atlas/maintenance/soteria-oidc" -}}
          client_id = "{{ .Data.data.client_id }}"
          client_secret = "{{ .Data.data.client_secret }}"
          cookie_secret = "{{ .Data.data.cookie_secret }}"
          {{- end -}}
    spec:
      serviceAccountName: maintenance-vault-sync
      nodeSelector:
        node-role.kubernetes.io/worker: "true"
      affinity:
        nodeAffinity:
          requiredDuringSchedulingIgnoredDuringExecution:
            nodeSelectorTerms:
              - matchExpressions:
                  - key: kubernetes.io/arch
                    operator: In
                    values: ["amd64", "arm64"]
          # Prefer rpi5 hardware and steer away from the listed titan nodes.
          preferredDuringSchedulingIgnoredDuringExecution:
            - weight: 100
              preference:
                matchExpressions:
                  - key: hardware
                    operator: In
                    values: ["rpi5"]
            - weight: 100
              preference:
                matchExpressions:
                  - key: kubernetes.io/hostname
                    operator: NotIn
                    values: ["titan-13", "titan-15", "titan-17", "titan-19"]
      containers:
        - name: oauth2-proxy
          image: quay.io/oauth2-proxy/oauth2-proxy:v7.6.0
          imagePullPolicy: IfNotPresent
          # NOTE(review): --insecure-oidc-allow-unverified-email=true skips
          # email verification checks; confirm this is intentional for this
          # realm before widening access.
          args:
            - --provider=oidc
            - --config=/vault/secrets/oidc-config
            - --redirect-url=https://backup.bstein.dev/oauth2/callback
            - --oidc-issuer-url=https://sso.bstein.dev/realms/atlas
            - --scope=openid profile email groups
            - --email-domain=*
            - --allowed-group=admin
            - --allowed-group=/admin
            - --allowed-group=maintenance
            - --allowed-group=/maintenance
            - --set-xauthrequest=true
            - --pass-access-token=true
            - --set-authorization-header=true
            - --cookie-secure=true
            - --cookie-samesite=lax
            - --cookie-refresh=20m
            - --cookie-expire=168h
            - --insecure-oidc-allow-unverified-email=true
            - --upstream=http://soteria.maintenance.svc.cluster.local
            - --http-address=0.0.0.0:4180
            - --skip-provider-button=true
            - --approval-prompt=auto
            - --skip-jwt-bearer-tokens=true
            - --oidc-groups-claim=groups
            - --cookie-domain=backup.bstein.dev
          ports:
            - containerPort: 4180
              name: http
          readinessProbe:
            httpGet:
              path: /ping
              port: 4180
            initialDelaySeconds: 5
            periodSeconds: 10
          livenessProbe:
            httpGet:
              path: /ping
              port: 4180
            initialDelaySeconds: 20
            periodSeconds: 20
          resources:
            requests:
              cpu: 25m
              memory: 64Mi
            limits:
              cpu: 250m
              memory: 256Mi

View File

@ -0,0 +1,13 @@
# services/maintenance/soteria-certificate.yaml
# TLS certificate for backup.bstein.dev, stored in the backup-tls secret
# that the soteria Ingress references.
apiVersion: cert-manager.io/v1
kind: Certificate
metadata:
  name: backup-tls
  namespace: maintenance
spec:
  secretName: backup-tls
  issuerRef:
    kind: ClusterIssuer
    name: letsencrypt
  dnsNames:
    - backup.bstein.dev

View File

@ -0,0 +1,14 @@
# services/maintenance/soteria-configmap.yaml
# Runtime configuration for the Soteria backup service (consumed via envFrom
# in the soteria Deployment).
apiVersion: v1
kind: ConfigMap
metadata:
  name: soteria
  namespace: maintenance
data:
  SOTERIA_BACKUP_DRIVER: longhorn
  SOTERIA_LONGHORN_URL: http://longhorn-backend.longhorn-system.svc:9500
  SOTERIA_LONGHORN_BACKUP_MODE: incremental
  SOTERIA_AUTH_REQUIRED: "true"
  SOTERIA_ALLOWED_GROUPS: admin,maintenance
  SOTERIA_BACKUP_MAX_AGE_HOURS: "24"
  SOTERIA_METRICS_REFRESH_SECONDS: "300"

View File

@ -0,0 +1,76 @@
# services/maintenance/soteria-deployment.yaml
# Soteria backup service. The image tag here is overridden by the
# kustomization's images transformer (Flux imagepolicy maintenance:soteria).
apiVersion: apps/v1
kind: Deployment
metadata:
  name: soteria
  namespace: maintenance
  labels:
    app: soteria
spec:
  replicas: 1
  revisionHistoryLimit: 3
  selector:
    matchLabels:
      app: soteria
  template:
    metadata:
      labels:
        app: soteria
    spec:
      serviceAccountName: soteria
      # arm64 workers only; prefer rpi5 over rpi4.
      nodeSelector:
        kubernetes.io/arch: arm64
        node-role.kubernetes.io/worker: "true"
      affinity:
        nodeAffinity:
          preferredDuringSchedulingIgnoredDuringExecution:
            - weight: 90
              preference:
                matchExpressions:
                  - key: hardware
                    operator: In
                    values: ["rpi5"]
            - weight: 50
              preference:
                matchExpressions:
                  - key: hardware
                    operator: In
                    values: ["rpi4"]
      containers:
        - name: soteria
          image: registry.bstein.dev/bstein/soteria:0.1.0-21
          imagePullPolicy: Always
          envFrom:
            - configMapRef:
                name: soteria
          ports:
            - name: http
              containerPort: 8080
          livenessProbe:
            httpGet:
              path: /healthz
              port: http
            initialDelaySeconds: 5
            periodSeconds: 10
            timeoutSeconds: 2
          readinessProbe:
            httpGet:
              path: /readyz
              port: http
            initialDelaySeconds: 2
            periodSeconds: 5
            timeoutSeconds: 2
          resources:
            requests:
              cpu: 50m
              memory: 64Mi
            limits:
              cpu: 200m
              memory: 256Mi
          securityContext:
            allowPrivilegeEscalation: false
            capabilities:
              drop: ["ALL"]
            runAsNonRoot: true
            runAsUser: 65532

View File

@ -0,0 +1,27 @@
# services/maintenance/soteria-ingress.yaml
# Public entry for backup.bstein.dev; all traffic goes through
# oauth2-proxy-soteria, which enforces SSO before proxying to Soteria.
apiVersion: networking.k8s.io/v1
kind: Ingress
metadata:
  name: soteria
  namespace: maintenance
  annotations:
    # NOTE(review): kubernetes.io/ingress.class is deprecated and duplicates
    # spec.ingressClassName below — confirm other ingresses in this repo keep
    # both before removing.
    kubernetes.io/ingress.class: traefik
    traefik.ingress.kubernetes.io/router.entrypoints: websecure
    traefik.ingress.kubernetes.io/router.tls: "true"
    # NOTE(review): empty middlewares list — presumably intentional to attach
    # no auth middleware (oauth2-proxy is the backend); verify Traefik accepts
    # the empty value.
    traefik.ingress.kubernetes.io/router.middlewares: ""
spec:
  ingressClassName: traefik
  tls:
    - hosts: ["backup.bstein.dev"]
      secretName: backup-tls
  rules:
    - host: backup.bstein.dev
      http:
        paths:
          - path: /
            pathType: Prefix
            backend:
              service:
                name: oauth2-proxy-soteria
                port:
                  number: 80

View File

@ -0,0 +1,21 @@
# services/maintenance/soteria-service.yaml
# ClusterIP service for Soteria; also carries Prometheus scrape annotations.
apiVersion: v1
kind: Service
metadata:
  name: soteria
  namespace: maintenance
  labels:
    app: soteria
  annotations:
    prometheus.io/scrape: "true"
    # NOTE(review): port "80" matches the Service port, but the container
    # listens on 8080 — if the Prometheus config scrapes pods (not service
    # endpoints) using these annotations, this should be "8080"; confirm.
    prometheus.io/port: "80"
    prometheus.io/path: "/metrics"
spec:
  type: ClusterIP
  selector:
    app: soteria
  ports:
    - name: http
      port: 80
      targetPort: http

View File

@ -1970,7 +1970,7 @@
},
"targets": [
{
"expr": "sort_desc(max by (namespace, pvc) (pvc_backup_age_hours))",
"expr": "sort_desc(max by (namespace, pvc) (pvc_backup_age_hours or on(namespace, pvc) ((1 - pvc_backup_health) * 999)))",
"refId": "A",
"legendFormat": "{{namespace}}/{{pvc}}",
"instant": true
@ -2034,7 +2034,7 @@
"targetBlank": true
}
],
"description": "Oldest backup age in hours by PVC. This panel is reserved for the upcoming PVC backup health feed and will show no data until those metrics are published."
"description": "Oldest successful backup age in hours by PVC. PVCs with missing or unhealthy backup state are forced to 999h so the red bars stay visible in the overview."
},
{
"id": 30,

View File

@ -1979,7 +1979,7 @@ data:
},
"targets": [
{
"expr": "sort_desc(max by (namespace, pvc) (pvc_backup_age_hours))",
"expr": "sort_desc(max by (namespace, pvc) (pvc_backup_age_hours or on(namespace, pvc) ((1 - pvc_backup_health) * 999)))",
"refId": "A",
"legendFormat": "{{namespace}}/{{pvc}}",
"instant": true
@ -2043,7 +2043,7 @@ data:
"targetBlank": true
}
],
"description": "Oldest backup age in hours by PVC. This panel is reserved for the upcoming PVC backup health feed and will show no data until those metrics are published."
"description": "Oldest successful backup age in hours by PVC. PVCs with missing or unhealthy backup state are forced to 999h so the red bars stay visible in the overview."
},
{
"id": 30,