Compare commits
5 Commits
8331411b93
...
82cab1ce2a
| Author | SHA1 | Date | |
|---|---|---|---|
| 82cab1ce2a | |||
| c325744540 | |||
| 241a405c05 | |||
| 6a44a56c38 | |||
| 091e743d0e |
@ -475,7 +475,7 @@ PLATFORM_TEST_SUCCESS_RATE_24H_BY_SUITE = (
|
|||||||
f'/ clamp_min((sum by (suite) (increase(platform_quality_gate_runs_total{{suite=~"{PLATFORM_TEST_SUITE_MATCHER}"}}[24h]))), 1)) '
|
f'/ clamp_min((sum by (suite) (increase(platform_quality_gate_runs_total{{suite=~"{PLATFORM_TEST_SUITE_MATCHER}"}}[24h]))), 1)) '
|
||||||
f'and on(suite) ((sum by (suite) (increase(platform_quality_gate_runs_total{{suite=~"{PLATFORM_TEST_SUITE_MATCHER}"}}[24h]))) > 0))'
|
f'and on(suite) ((sum by (suite) (increase(platform_quality_gate_runs_total{{suite=~"{PLATFORM_TEST_SUITE_MATCHER}"}}[24h]))) > 0))'
|
||||||
)
|
)
|
||||||
PVC_BACKUP_AGE_HOURS_BY_PVC = "sort_desc(max by (namespace, pvc) (pvc_backup_age_hours))"
|
PVC_BACKUP_AGE_HOURS_BY_PVC = "sort_desc(max by (namespace, pvc) (pvc_backup_age_hours or on(namespace, pvc) ((1 - pvc_backup_health) * 999)))"
|
||||||
ANANKE_SELECTOR = 'job="ananke-power"'
|
ANANKE_SELECTOR = 'job="ananke-power"'
|
||||||
ANANKE_UPS_DB_NAME = "Pyrphoros"
|
ANANKE_UPS_DB_NAME = "Pyrphoros"
|
||||||
ANANKE_UPS_DB_NODE = "titan-db"
|
ANANKE_UPS_DB_NODE = "titan-db"
|
||||||
@ -1566,7 +1566,7 @@ def build_overview():
|
|||||||
)
|
)
|
||||||
panels[-1]["links"] = link_to("atlas-storage")
|
panels[-1]["links"] = link_to("atlas-storage")
|
||||||
panels[-1]["description"] = (
|
panels[-1]["description"] = (
|
||||||
"Oldest backup age in hours by PVC. This panel is reserved for the upcoming PVC backup health feed and will show no data until those metrics are published."
|
"Oldest successful backup age in hours by PVC. PVCs with missing or unhealthy backup state are forced to 999h so the red bars stay visible in the overview."
|
||||||
)
|
)
|
||||||
|
|
||||||
panels.append(
|
panels.append(
|
||||||
|
|||||||
@ -23,6 +23,7 @@ resources:
|
|||||||
- oneoffs/synapse-oidc-secret-ensure-job.yaml
|
- oneoffs/synapse-oidc-secret-ensure-job.yaml
|
||||||
- oneoffs/logs-oidc-secret-ensure-job.yaml
|
- oneoffs/logs-oidc-secret-ensure-job.yaml
|
||||||
- oneoffs/metis-oidc-secret-ensure-job.yaml
|
- oneoffs/metis-oidc-secret-ensure-job.yaml
|
||||||
|
- oneoffs/soteria-oidc-secret-ensure-job.yaml
|
||||||
- oneoffs/metis-ssh-keys-secret-ensure-job.yaml
|
- oneoffs/metis-ssh-keys-secret-ensure-job.yaml
|
||||||
- oneoffs/harbor-oidc-secret-ensure-job.yaml
|
- oneoffs/harbor-oidc-secret-ensure-job.yaml
|
||||||
- oneoffs/vault-oidc-secret-ensure-job.yaml
|
- oneoffs/vault-oidc-secret-ensure-job.yaml
|
||||||
|
|||||||
198
services/keycloak/oneoffs/soteria-oidc-secret-ensure-job.yaml
Normal file
198
services/keycloak/oneoffs/soteria-oidc-secret-ensure-job.yaml
Normal file
@ -0,0 +1,198 @@
|
|||||||
|
# services/keycloak/oneoffs/soteria-oidc-secret-ensure-job.yaml
|
||||||
|
# One-off job for sso/soteria-oidc-secret-ensure-1.
|
||||||
|
# Purpose: ensure the Soteria oauth2-proxy OIDC client and Vault secret exist.
|
||||||
|
# Keep this completed Job around; bump the suffix if it ever needs to be rerun.
|
||||||
|
apiVersion: batch/v1
|
||||||
|
kind: Job
|
||||||
|
metadata:
|
||||||
|
name: soteria-oidc-secret-ensure-1
|
||||||
|
namespace: sso
|
||||||
|
spec:
|
||||||
|
backoffLimit: 0
|
||||||
|
template:
|
||||||
|
metadata:
|
||||||
|
annotations:
|
||||||
|
vault.hashicorp.com/agent-inject: "true"
|
||||||
|
vault.hashicorp.com/agent-pre-populate-only: "true"
|
||||||
|
vault.hashicorp.com/role: "sso-secrets"
|
||||||
|
vault.hashicorp.com/agent-inject-secret-keycloak-admin-env.sh: "kv/data/atlas/shared/keycloak-admin"
|
||||||
|
vault.hashicorp.com/agent-inject-template-keycloak-admin-env.sh: |
|
||||||
|
{{ with secret "kv/data/atlas/shared/keycloak-admin" }}
|
||||||
|
export KEYCLOAK_ADMIN="{{ .Data.data.username }}"
|
||||||
|
export KEYCLOAK_ADMIN_USER="{{ .Data.data.username }}"
|
||||||
|
export KEYCLOAK_ADMIN_PASSWORD="{{ .Data.data.password }}"
|
||||||
|
{{ end }}
|
||||||
|
spec:
|
||||||
|
serviceAccountName: mas-secrets-ensure
|
||||||
|
restartPolicy: Never
|
||||||
|
affinity:
|
||||||
|
nodeAffinity:
|
||||||
|
requiredDuringSchedulingIgnoredDuringExecution:
|
||||||
|
nodeSelectorTerms:
|
||||||
|
- matchExpressions:
|
||||||
|
- key: node-role.kubernetes.io/worker
|
||||||
|
operator: Exists
|
||||||
|
preferredDuringSchedulingIgnoredDuringExecution:
|
||||||
|
- weight: 100
|
||||||
|
preference:
|
||||||
|
matchExpressions:
|
||||||
|
- key: kubernetes.io/arch
|
||||||
|
operator: In
|
||||||
|
values: ["arm64"]
|
||||||
|
containers:
|
||||||
|
- name: apply
|
||||||
|
image: bitnami/kubectl@sha256:554ab88b1858e8424c55de37ad417b16f2a0e65d1607aa0f3fe3ce9b9f10b131
|
||||||
|
command: ["/bin/sh", "-c"]
|
||||||
|
args:
|
||||||
|
- |
|
||||||
|
set -euo pipefail
|
||||||
|
. /vault/secrets/keycloak-admin-env.sh
|
||||||
|
KC_URL="http://keycloak.sso.svc.cluster.local"
|
||||||
|
ACCESS_TOKEN=""
|
||||||
|
for attempt in 1 2 3 4 5; do
|
||||||
|
TOKEN_JSON="$(curl -sS -X POST "$KC_URL/realms/master/protocol/openid-connect/token" \
|
||||||
|
-H 'Content-Type: application/x-www-form-urlencoded' \
|
||||||
|
-d "grant_type=password" \
|
||||||
|
-d "client_id=admin-cli" \
|
||||||
|
-d "username=${KEYCLOAK_ADMIN}" \
|
||||||
|
-d "password=${KEYCLOAK_ADMIN_PASSWORD}" || true)"
|
||||||
|
ACCESS_TOKEN="$(echo "$TOKEN_JSON" | jq -r '.access_token' 2>/dev/null || true)"
|
||||||
|
if [ -n "$ACCESS_TOKEN" ] && [ "$ACCESS_TOKEN" != "null" ]; then
|
||||||
|
break
|
||||||
|
fi
|
||||||
|
echo "Keycloak token request failed (attempt ${attempt})" >&2
|
||||||
|
sleep $((attempt * 2))
|
||||||
|
done
|
||||||
|
if [ -z "$ACCESS_TOKEN" ] || [ "$ACCESS_TOKEN" = "null" ]; then
|
||||||
|
echo "Failed to fetch Keycloak admin token" >&2
|
||||||
|
exit 1
|
||||||
|
fi
|
||||||
|
|
||||||
|
CLIENT_QUERY="$(curl -sS -H "Authorization: Bearer ${ACCESS_TOKEN}" \
|
||||||
|
"$KC_URL/admin/realms/atlas/clients?clientId=soteria" || true)"
|
||||||
|
CLIENT_ID="$(echo "$CLIENT_QUERY" | jq -r '.[0].id' 2>/dev/null || true)"
|
||||||
|
|
||||||
|
if [ -z "$CLIENT_ID" ] || [ "$CLIENT_ID" = "null" ]; then
|
||||||
|
create_payload='{"clientId":"soteria","enabled":true,"protocol":"openid-connect","publicClient":false,"standardFlowEnabled":true,"implicitFlowEnabled":false,"directAccessGrantsEnabled":false,"serviceAccountsEnabled":false,"redirectUris":["https://backup.bstein.dev/oauth2/callback"],"webOrigins":["https://backup.bstein.dev"],"rootUrl":"https://backup.bstein.dev","baseUrl":"/"}'
|
||||||
|
status="$(curl -sS -o /dev/null -w "%{http_code}" -X POST \
|
||||||
|
-H "Authorization: Bearer ${ACCESS_TOKEN}" \
|
||||||
|
-H 'Content-Type: application/json' \
|
||||||
|
-d "${create_payload}" \
|
||||||
|
"$KC_URL/admin/realms/atlas/clients")"
|
||||||
|
if [ "$status" != "201" ] && [ "$status" != "204" ] && [ "$status" != "409" ]; then
|
||||||
|
echo "Keycloak client create failed (status ${status})" >&2
|
||||||
|
exit 1
|
||||||
|
fi
|
||||||
|
CLIENT_QUERY="$(curl -sS -H "Authorization: Bearer ${ACCESS_TOKEN}" \
|
||||||
|
"$KC_URL/admin/realms/atlas/clients?clientId=soteria" || true)"
|
||||||
|
CLIENT_ID="$(echo "$CLIENT_QUERY" | jq -r '.[0].id' 2>/dev/null || true)"
|
||||||
|
fi
|
||||||
|
|
||||||
|
if [ -z "$CLIENT_ID" ] || [ "$CLIENT_ID" = "null" ]; then
|
||||||
|
echo "Keycloak client soteria not found" >&2
|
||||||
|
exit 1
|
||||||
|
fi
|
||||||
|
|
||||||
|
SCOPE_ID="$(curl -sS -H "Authorization: Bearer ${ACCESS_TOKEN}" \
|
||||||
|
"$KC_URL/admin/realms/atlas/client-scopes?search=groups" | jq -r '.[] | select(.name=="groups") | .id' 2>/dev/null | head -n1 || true)"
|
||||||
|
if [ -z "$SCOPE_ID" ] || [ "$SCOPE_ID" = "null" ]; then
|
||||||
|
echo "Keycloak client scope groups not found" >&2
|
||||||
|
exit 1
|
||||||
|
fi
|
||||||
|
|
||||||
|
DEFAULT_SCOPES="$(curl -sS -H "Authorization: Bearer ${ACCESS_TOKEN}" \
|
||||||
|
"$KC_URL/admin/realms/atlas/clients/${CLIENT_ID}/default-client-scopes" || true)"
|
||||||
|
OPTIONAL_SCOPES="$(curl -sS -H "Authorization: Bearer ${ACCESS_TOKEN}" \
|
||||||
|
"$KC_URL/admin/realms/atlas/clients/${CLIENT_ID}/optional-client-scopes" || true)"
|
||||||
|
|
||||||
|
if ! echo "$DEFAULT_SCOPES" | jq -e '.[] | select(.name=="groups")' >/dev/null 2>&1 \
|
||||||
|
&& ! echo "$OPTIONAL_SCOPES" | jq -e '.[] | select(.name=="groups")' >/dev/null 2>&1; then
|
||||||
|
status="$(curl -sS -o /dev/null -w "%{http_code}" -X PUT \
|
||||||
|
-H "Authorization: Bearer ${ACCESS_TOKEN}" \
|
||||||
|
"$KC_URL/admin/realms/atlas/clients/${CLIENT_ID}/optional-client-scopes/${SCOPE_ID}")"
|
||||||
|
if [ "$status" != "200" ] && [ "$status" != "201" ] && [ "$status" != "204" ]; then
|
||||||
|
status="$(curl -sS -o /dev/null -w "%{http_code}" -X POST \
|
||||||
|
-H "Authorization: Bearer ${ACCESS_TOKEN}" \
|
||||||
|
"$KC_URL/admin/realms/atlas/clients/${CLIENT_ID}/optional-client-scopes/${SCOPE_ID}")"
|
||||||
|
if [ "$status" != "200" ] && [ "$status" != "201" ] && [ "$status" != "204" ]; then
|
||||||
|
echo "Failed to attach groups client scope to soteria (status ${status})" >&2
|
||||||
|
exit 1
|
||||||
|
fi
|
||||||
|
fi
|
||||||
|
fi
|
||||||
|
|
||||||
|
update_payload='{"enabled":true,"clientId":"soteria","protocol":"openid-connect","publicClient":false,"standardFlowEnabled":true,"implicitFlowEnabled":false,"directAccessGrantsEnabled":false,"serviceAccountsEnabled":false,"redirectUris":["https://backup.bstein.dev/oauth2/callback"],"webOrigins":["https://backup.bstein.dev"],"rootUrl":"https://backup.bstein.dev","baseUrl":"/"}'
|
||||||
|
status="$(curl -sS -o /dev/null -w "%{http_code}" -X PUT \
|
||||||
|
-H "Authorization: Bearer ${ACCESS_TOKEN}" \
|
||||||
|
-H 'Content-Type: application/json' \
|
||||||
|
-d "${update_payload}" \
|
||||||
|
"$KC_URL/admin/realms/atlas/clients/${CLIENT_ID}")"
|
||||||
|
if [ "$status" != "204" ]; then
|
||||||
|
echo "Keycloak client update failed (status ${status})" >&2
|
||||||
|
exit 1
|
||||||
|
fi
|
||||||
|
|
||||||
|
CLIENT_SECRET="$(curl -sS -H "Authorization: Bearer ${ACCESS_TOKEN}" \
|
||||||
|
"$KC_URL/admin/realms/atlas/clients/${CLIENT_ID}/client-secret" | jq -r '.value' 2>/dev/null || true)"
|
||||||
|
if [ -z "$CLIENT_SECRET" ] || [ "$CLIENT_SECRET" = "null" ]; then
|
||||||
|
echo "Keycloak client secret not found" >&2
|
||||||
|
exit 1
|
||||||
|
fi
|
||||||
|
|
||||||
|
vault_addr="${VAULT_ADDR:-http://vault.vault.svc.cluster.local:8200}"
|
||||||
|
vault_role="${VAULT_ROLE:-sso-secrets}"
|
||||||
|
jwt="$(cat /var/run/secrets/kubernetes.io/serviceaccount/token)"
|
||||||
|
login_payload="$(jq -nc --arg jwt "${jwt}" --arg role "${vault_role}" '{jwt:$jwt, role:$role}')"
|
||||||
|
vault_token="$(curl -sS --request POST --data "${login_payload}" \
|
||||||
|
"${vault_addr}/v1/auth/kubernetes/login" | jq -r '.auth.client_token')"
|
||||||
|
if [ -z "${vault_token}" ] || [ "${vault_token}" = "null" ]; then
|
||||||
|
echo "vault login failed" >&2
|
||||||
|
exit 1
|
||||||
|
fi
|
||||||
|
|
||||||
|
read_status="$(curl -sS -o /tmp/soteria-oidc-read.json -w "%{http_code}" \
|
||||||
|
-H "X-Vault-Token: ${vault_token}" \
|
||||||
|
"${vault_addr}/v1/kv/data/atlas/maintenance/soteria-oidc" || true)"
|
||||||
|
COOKIE_SECRET=""
|
||||||
|
if [ "${read_status}" = "200" ]; then
|
||||||
|
COOKIE_SECRET="$(jq -r '.data.data.cookie_secret // empty' /tmp/soteria-oidc-read.json)"
|
||||||
|
elif [ "${read_status}" != "404" ]; then
|
||||||
|
echo "Vault read failed (status ${read_status})" >&2
|
||||||
|
cat /tmp/soteria-oidc-read.json >&2 || true
|
||||||
|
exit 1
|
||||||
|
fi
|
||||||
|
if [ -n "${COOKIE_SECRET}" ]; then
|
||||||
|
length="$(printf '%s' "${COOKIE_SECRET}" | wc -c | tr -d ' ')"
|
||||||
|
if [ "${length}" != "16" ] && [ "${length}" != "24" ] && [ "${length}" != "32" ]; then
|
||||||
|
COOKIE_SECRET=""
|
||||||
|
fi
|
||||||
|
fi
|
||||||
|
if [ -z "${COOKIE_SECRET}" ]; then
|
||||||
|
COOKIE_SECRET="$(openssl rand -hex 16 | tr -d '\n')"
|
||||||
|
fi
|
||||||
|
|
||||||
|
payload="$(jq -nc \
|
||||||
|
--arg client_id "soteria" \
|
||||||
|
--arg client_secret "${CLIENT_SECRET}" \
|
||||||
|
--arg cookie_secret "${COOKIE_SECRET}" \
|
||||||
|
'{data:{client_id:$client_id,client_secret:$client_secret,cookie_secret:$cookie_secret}}')"
|
||||||
|
write_status="$(curl -sS -o /tmp/soteria-oidc-write.json -w "%{http_code}" -X POST \
|
||||||
|
-H "X-Vault-Token: ${vault_token}" \
|
||||||
|
-H 'Content-Type: application/json' \
|
||||||
|
-d "${payload}" "${vault_addr}/v1/kv/data/atlas/maintenance/soteria-oidc")"
|
||||||
|
if [ "${write_status}" != "200" ] && [ "${write_status}" != "204" ]; then
|
||||||
|
echo "Vault write failed (status ${write_status})" >&2
|
||||||
|
cat /tmp/soteria-oidc-write.json >&2 || true
|
||||||
|
exit 1
|
||||||
|
fi
|
||||||
|
|
||||||
|
verify_status="$(curl -sS -o /tmp/soteria-oidc-verify.json -w "%{http_code}" \
|
||||||
|
-H "X-Vault-Token: ${vault_token}" \
|
||||||
|
"${vault_addr}/v1/kv/data/atlas/maintenance/soteria-oidc" || true)"
|
||||||
|
if [ "${verify_status}" != "200" ]; then
|
||||||
|
echo "Vault verify failed (status ${verify_status})" >&2
|
||||||
|
cat /tmp/soteria-oidc-verify.json >&2 || true
|
||||||
|
exit 1
|
||||||
|
fi
|
||||||
|
|
||||||
|
echo "Soteria OIDC secret ready in Vault"
|
||||||
60
services/maintenance/NOTES.md
Normal file
60
services/maintenance/NOTES.md
Normal file
@ -0,0 +1,60 @@
|
|||||||
|
# Soteria PVC Restore Drill (backup.bstein.dev)
|
||||||
|
|
||||||
|
Use this checklist after meaningful Soteria backup, restore, auth, or alerting changes.
|
||||||
|
|
||||||
|
## Production Restore Drill Checklist
|
||||||
|
|
||||||
|
1. Verify baseline health before touching restores.
|
||||||
|
- `flux get kustomizations -n flux-system maintenance`
|
||||||
|
- `kubectl -n maintenance get deploy soteria oauth2-proxy-soteria`
|
||||||
|
2. Confirm operator access and source safety.
|
||||||
|
- Operator must be in Keycloak group `admin` or `maintenance`.
|
||||||
|
- Choose a real source PVC that is expected to be backed up, not a throwaway test PVC.
|
||||||
|
3. Run the UI flow at `https://backup.bstein.dev`.
|
||||||
|
- Sign in via Keycloak.
|
||||||
|
- In `PVC Inventory`, select source namespace and PVC.
|
||||||
|
- Click `Backup now` and wait for success in `Last Action`.
|
||||||
|
- Click `Restore` and pick a completed snapshot.
|
||||||
|
- Set `Target namespace` and a unique `Target PVC name` (`restore-<source-pvc>-<date>`).
|
||||||
|
- Click `Create restore PVC`.
|
||||||
|
4. Validate restore output.
|
||||||
|
- `kubectl -n <target-namespace> get pvc <target-pvc>`
|
||||||
|
- If workload-level validation is required, attach a temporary pod and inspect expected files/data.
|
||||||
|
5. Clean up.
|
||||||
|
- `kubectl -n <target-namespace> delete pvc <target-pvc>`
|
||||||
|
- Remove any detached Longhorn restore volume via the Longhorn UI/API if one remains.
|
||||||
|
|
||||||
|
## Alert Query Verification (`maint-soteria-*`)
|
||||||
|
|
||||||
|
Start a local query endpoint:
|
||||||
|
|
||||||
|
`kubectl -n monitoring port-forward svc/victoria-metrics-k8s-stack 8428:8428`
|
||||||
|
|
||||||
|
Validate each alert expression directly.
|
||||||
|
|
||||||
|
1. `maint-soteria-refresh-stale` (`time() - soteria_inventory_refresh_timestamp_seconds`, threshold `> 900` for 15m).
|
||||||
|
- `curl -fsS --get 'http://127.0.0.1:8428/api/v1/query' --data-urlencode 'query=time() - soteria_inventory_refresh_timestamp_seconds'`
|
||||||
|
- `curl -fsS --get 'http://127.0.0.1:8428/api/v1/query' --data-urlencode 'query=(time() - soteria_inventory_refresh_timestamp_seconds) > bool 900'`
|
||||||
|
- Healthy expectation: the refresh age is below `900` seconds and the threshold query returns `0`.
|
||||||
|
2. `maint-soteria-backup-unhealthy` (`sum((1 - pvc_backup_health{driver="longhorn"}) > bool 0) or on() vector(0)`, threshold `> 0`).
|
||||||
|
- `curl -fsS --get 'http://127.0.0.1:8428/api/v1/query' --data-urlencode 'query=sum((1 - pvc_backup_health{driver="longhorn"}) > bool 0) or on() vector(0)'`
|
||||||
|
- `curl -fsS --get 'http://127.0.0.1:8428/api/v1/query' --data-urlencode 'query=(1 - pvc_backup_health{driver="longhorn"}) > bool 0'`
|
||||||
|
- `curl -fsS --get 'http://127.0.0.1:8428/api/v1/query' --data-urlencode 'query=max by (namespace,pvc) (pvc_backup_age_hours{driver="longhorn"})'`
|
||||||
|
- Healthy expectation: unhealthy count is `0`; no series should be `1` in the per-PVC unhealthy query.
|
||||||
|
3. `maint-soteria-authz-denials` (`sum(increase(soteria_authz_denials_total[15m])) or on() vector(0)`, threshold `> 9` for 10m).
|
||||||
|
- `curl -fsS --get 'http://127.0.0.1:8428/api/v1/query' --data-urlencode 'query=sum(increase(soteria_authz_denials_total[15m])) or on() vector(0)'`
|
||||||
|
- `curl -fsS --get 'http://127.0.0.1:8428/api/v1/query' --data-urlencode 'query=sum by (reason) (increase(soteria_authz_denials_total[15m]))'`
|
||||||
|
- Healthy expectation: total remains below `10` in normal operation; spikes should map to expected `reason` labels.
|
||||||
|
|
||||||
|
## Failure Triage
|
||||||
|
|
||||||
|
- `401/403` on UI or API:
|
||||||
|
- Verify oauth2-proxy group claims include `admin` or `maintenance`.
|
||||||
|
- Restore conflict:
|
||||||
|
- Target PVC already exists; choose a new target PVC name.
|
||||||
|
- `maint-soteria-refresh-stale` firing:
|
||||||
|
- Check Soteria pod health and `/metrics` scrape reachability from `monitoring`.
|
||||||
|
- `maint-soteria-backup-unhealthy` firing:
|
||||||
|
- Inspect `pvc_backup_health` and `pvc_backup_age_hours` to identify stale or missing backups.
|
||||||
|
- `maint-soteria-authz-denials` firing:
|
||||||
|
- Confirm expected OIDC groups and inspect denial `reason` labels for policy or header regressions.
|
||||||
@ -35,6 +35,11 @@ resources:
|
|||||||
- node-image-sweeper-daemonset.yaml
|
- node-image-sweeper-daemonset.yaml
|
||||||
- image-sweeper-cronjob.yaml
|
- image-sweeper-cronjob.yaml
|
||||||
- metis-service.yaml
|
- metis-service.yaml
|
||||||
|
- soteria-networkpolicy.yaml
|
||||||
|
- oauth2-proxy-soteria-networkpolicy.yaml
|
||||||
|
- soteria-ingress.yaml
|
||||||
|
- soteria-certificate.yaml
|
||||||
|
- oauth2-proxy-soteria.yaml
|
||||||
- oauth2-proxy-metis.yaml
|
- oauth2-proxy-metis.yaml
|
||||||
- metis-certificate.yaml
|
- metis-certificate.yaml
|
||||||
- metis-ingress.yaml
|
- metis-ingress.yaml
|
||||||
@ -43,6 +48,8 @@ images:
|
|||||||
newTag: 0.1.0-22 # {"$imagepolicy": "maintenance:ariadne:tag"}
|
newTag: 0.1.0-22 # {"$imagepolicy": "maintenance:ariadne:tag"}
|
||||||
- name: registry.bstein.dev/bstein/metis
|
- name: registry.bstein.dev/bstein/metis
|
||||||
newTag: 0.1.0-9-amd64
|
newTag: 0.1.0-9-amd64
|
||||||
|
- name: registry.bstein.dev/bstein/soteria
|
||||||
|
newTag: 0.1.0-21 # {"$imagepolicy": "maintenance:soteria:tag"}
|
||||||
configMapGenerator:
|
configMapGenerator:
|
||||||
- name: disable-k3s-traefik-script
|
- name: disable-k3s-traefik-script
|
||||||
namespace: maintenance
|
namespace: maintenance
|
||||||
|
|||||||
23
services/maintenance/oauth2-proxy-soteria-networkpolicy.yaml
Normal file
23
services/maintenance/oauth2-proxy-soteria-networkpolicy.yaml
Normal file
@ -0,0 +1,23 @@
|
|||||||
|
# services/maintenance/oauth2-proxy-soteria-networkpolicy.yaml
|
||||||
|
apiVersion: networking.k8s.io/v1
|
||||||
|
kind: NetworkPolicy
|
||||||
|
metadata:
|
||||||
|
name: oauth2-proxy-soteria-ingress
|
||||||
|
namespace: maintenance
|
||||||
|
spec:
|
||||||
|
podSelector:
|
||||||
|
matchLabels:
|
||||||
|
app: oauth2-proxy-soteria
|
||||||
|
policyTypes:
|
||||||
|
- Ingress
|
||||||
|
ingress:
|
||||||
|
- from:
|
||||||
|
- namespaceSelector:
|
||||||
|
matchLabels:
|
||||||
|
kubernetes.io/metadata.name: traefik
|
||||||
|
podSelector:
|
||||||
|
matchLabels:
|
||||||
|
app: traefik
|
||||||
|
ports:
|
||||||
|
- protocol: TCP
|
||||||
|
port: 4180
|
||||||
120
services/maintenance/oauth2-proxy-soteria.yaml
Normal file
120
services/maintenance/oauth2-proxy-soteria.yaml
Normal file
@ -0,0 +1,120 @@
|
|||||||
|
# services/maintenance/oauth2-proxy-soteria.yaml
|
||||||
|
apiVersion: v1
|
||||||
|
kind: Service
|
||||||
|
metadata:
|
||||||
|
name: oauth2-proxy-soteria
|
||||||
|
namespace: maintenance
|
||||||
|
labels:
|
||||||
|
app: oauth2-proxy-soteria
|
||||||
|
spec:
|
||||||
|
ports:
|
||||||
|
- name: http
|
||||||
|
port: 80
|
||||||
|
targetPort: 4180
|
||||||
|
selector:
|
||||||
|
app: oauth2-proxy-soteria
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
apiVersion: apps/v1
|
||||||
|
kind: Deployment
|
||||||
|
metadata:
|
||||||
|
name: oauth2-proxy-soteria
|
||||||
|
namespace: maintenance
|
||||||
|
labels:
|
||||||
|
app: oauth2-proxy-soteria
|
||||||
|
spec:
|
||||||
|
replicas: 2
|
||||||
|
selector:
|
||||||
|
matchLabels:
|
||||||
|
app: oauth2-proxy-soteria
|
||||||
|
template:
|
||||||
|
metadata:
|
||||||
|
labels:
|
||||||
|
app: oauth2-proxy-soteria
|
||||||
|
annotations:
|
||||||
|
vault.hashicorp.com/agent-inject: "true"
|
||||||
|
vault.hashicorp.com/role: "maintenance"
|
||||||
|
vault.hashicorp.com/agent-inject-secret-oidc-config: "kv/data/atlas/maintenance/soteria-oidc"
|
||||||
|
vault.hashicorp.com/agent-inject-template-oidc-config: |
|
||||||
|
{{- with secret "kv/data/atlas/maintenance/soteria-oidc" -}}
|
||||||
|
client_id = "{{ .Data.data.client_id }}"
|
||||||
|
client_secret = "{{ .Data.data.client_secret }}"
|
||||||
|
cookie_secret = "{{ .Data.data.cookie_secret }}"
|
||||||
|
{{- end -}}
|
||||||
|
spec:
|
||||||
|
serviceAccountName: maintenance-vault-sync
|
||||||
|
nodeSelector:
|
||||||
|
node-role.kubernetes.io/worker: "true"
|
||||||
|
affinity:
|
||||||
|
nodeAffinity:
|
||||||
|
requiredDuringSchedulingIgnoredDuringExecution:
|
||||||
|
nodeSelectorTerms:
|
||||||
|
- matchExpressions:
|
||||||
|
- key: kubernetes.io/arch
|
||||||
|
operator: In
|
||||||
|
values: ["amd64","arm64"]
|
||||||
|
preferredDuringSchedulingIgnoredDuringExecution:
|
||||||
|
- weight: 100
|
||||||
|
preference:
|
||||||
|
matchExpressions:
|
||||||
|
- key: hardware
|
||||||
|
operator: In
|
||||||
|
values: ["rpi5"]
|
||||||
|
- weight: 100
|
||||||
|
preference:
|
||||||
|
matchExpressions:
|
||||||
|
- key: kubernetes.io/hostname
|
||||||
|
operator: NotIn
|
||||||
|
values: ["titan-13","titan-15","titan-17","titan-19"]
|
||||||
|
containers:
|
||||||
|
- name: oauth2-proxy
|
||||||
|
image: quay.io/oauth2-proxy/oauth2-proxy:v7.6.0
|
||||||
|
imagePullPolicy: IfNotPresent
|
||||||
|
args:
|
||||||
|
- --provider=oidc
|
||||||
|
- --config=/vault/secrets/oidc-config
|
||||||
|
- --redirect-url=https://backup.bstein.dev/oauth2/callback
|
||||||
|
- --oidc-issuer-url=https://sso.bstein.dev/realms/atlas
|
||||||
|
- --scope=openid profile email groups
|
||||||
|
- --email-domain=*
|
||||||
|
- --allowed-group=admin
|
||||||
|
- --allowed-group=/admin
|
||||||
|
- --allowed-group=maintenance
|
||||||
|
- --allowed-group=/maintenance
|
||||||
|
- --set-xauthrequest=true
|
||||||
|
- --pass-user-headers=true
|
||||||
|
- --cookie-secure=true
|
||||||
|
- --cookie-samesite=lax
|
||||||
|
- --cookie-refresh=20m
|
||||||
|
- --cookie-expire=168h
|
||||||
|
- --insecure-oidc-allow-unverified-email=true
|
||||||
|
- --upstream=http://soteria.maintenance.svc.cluster.local
|
||||||
|
- --http-address=0.0.0.0:4180
|
||||||
|
- --skip-provider-button=true
|
||||||
|
- --approval-prompt=auto
|
||||||
|
- --skip-jwt-bearer-tokens=true
|
||||||
|
- --oidc-groups-claim=groups
|
||||||
|
- --cookie-domain=backup.bstein.dev
|
||||||
|
ports:
|
||||||
|
- containerPort: 4180
|
||||||
|
name: http
|
||||||
|
readinessProbe:
|
||||||
|
httpGet:
|
||||||
|
path: /ping
|
||||||
|
port: 4180
|
||||||
|
initialDelaySeconds: 5
|
||||||
|
periodSeconds: 10
|
||||||
|
livenessProbe:
|
||||||
|
httpGet:
|
||||||
|
path: /ping
|
||||||
|
port: 4180
|
||||||
|
initialDelaySeconds: 20
|
||||||
|
periodSeconds: 20
|
||||||
|
resources:
|
||||||
|
requests:
|
||||||
|
cpu: 25m
|
||||||
|
memory: 64Mi
|
||||||
|
limits:
|
||||||
|
cpu: 250m
|
||||||
|
memory: 256Mi
|
||||||
13
services/maintenance/soteria-certificate.yaml
Normal file
13
services/maintenance/soteria-certificate.yaml
Normal file
@ -0,0 +1,13 @@
|
|||||||
|
# services/maintenance/soteria-certificate.yaml
|
||||||
|
apiVersion: cert-manager.io/v1
|
||||||
|
kind: Certificate
|
||||||
|
metadata:
|
||||||
|
name: backup-tls
|
||||||
|
namespace: maintenance
|
||||||
|
spec:
|
||||||
|
secretName: backup-tls
|
||||||
|
issuerRef:
|
||||||
|
kind: ClusterIssuer
|
||||||
|
name: letsencrypt
|
||||||
|
dnsNames:
|
||||||
|
- backup.bstein.dev
|
||||||
14
services/maintenance/soteria-configmap.yaml
Normal file
14
services/maintenance/soteria-configmap.yaml
Normal file
@ -0,0 +1,14 @@
|
|||||||
|
# services/maintenance/soteria-configmap.yaml
|
||||||
|
apiVersion: v1
|
||||||
|
kind: ConfigMap
|
||||||
|
metadata:
|
||||||
|
name: soteria
|
||||||
|
namespace: maintenance
|
||||||
|
data:
|
||||||
|
SOTERIA_BACKUP_DRIVER: longhorn
|
||||||
|
SOTERIA_LONGHORN_URL: http://longhorn-backend.longhorn-system.svc:9500
|
||||||
|
SOTERIA_LONGHORN_BACKUP_MODE: incremental
|
||||||
|
SOTERIA_AUTH_REQUIRED: "true"
|
||||||
|
SOTERIA_ALLOWED_GROUPS: admin,maintenance
|
||||||
|
SOTERIA_BACKUP_MAX_AGE_HOURS: "24"
|
||||||
|
SOTERIA_METRICS_REFRESH_SECONDS: "300"
|
||||||
76
services/maintenance/soteria-deployment.yaml
Normal file
76
services/maintenance/soteria-deployment.yaml
Normal file
@ -0,0 +1,76 @@
|
|||||||
|
# services/maintenance/soteria-deployment.yaml
|
||||||
|
apiVersion: apps/v1
|
||||||
|
kind: Deployment
|
||||||
|
metadata:
|
||||||
|
name: soteria
|
||||||
|
namespace: maintenance
|
||||||
|
labels:
|
||||||
|
app: soteria
|
||||||
|
spec:
|
||||||
|
replicas: 1
|
||||||
|
revisionHistoryLimit: 3
|
||||||
|
selector:
|
||||||
|
matchLabels:
|
||||||
|
app: soteria
|
||||||
|
template:
|
||||||
|
metadata:
|
||||||
|
labels:
|
||||||
|
app: soteria
|
||||||
|
spec:
|
||||||
|
serviceAccountName: soteria
|
||||||
|
nodeSelector:
|
||||||
|
kubernetes.io/arch: arm64
|
||||||
|
node-role.kubernetes.io/worker: "true"
|
||||||
|
affinity:
|
||||||
|
nodeAffinity:
|
||||||
|
preferredDuringSchedulingIgnoredDuringExecution:
|
||||||
|
- weight: 90
|
||||||
|
preference:
|
||||||
|
matchExpressions:
|
||||||
|
- key: hardware
|
||||||
|
operator: In
|
||||||
|
values: ["rpi5"]
|
||||||
|
- weight: 50
|
||||||
|
preference:
|
||||||
|
matchExpressions:
|
||||||
|
- key: hardware
|
||||||
|
operator: In
|
||||||
|
values: ["rpi4"]
|
||||||
|
containers:
|
||||||
|
- name: soteria
|
||||||
|
image: registry.bstein.dev/bstein/soteria:0.1.0-21
|
||||||
|
imagePullPolicy: Always
|
||||||
|
envFrom:
|
||||||
|
- configMapRef:
|
||||||
|
name: soteria
|
||||||
|
ports:
|
||||||
|
- name: http
|
||||||
|
containerPort: 8080
|
||||||
|
livenessProbe:
|
||||||
|
httpGet:
|
||||||
|
path: /healthz
|
||||||
|
port: http
|
||||||
|
initialDelaySeconds: 5
|
||||||
|
periodSeconds: 10
|
||||||
|
timeoutSeconds: 2
|
||||||
|
readinessProbe:
|
||||||
|
httpGet:
|
||||||
|
path: /readyz
|
||||||
|
port: http
|
||||||
|
initialDelaySeconds: 2
|
||||||
|
periodSeconds: 5
|
||||||
|
timeoutSeconds: 2
|
||||||
|
resources:
|
||||||
|
requests:
|
||||||
|
cpu: 50m
|
||||||
|
memory: 64Mi
|
||||||
|
limits:
|
||||||
|
cpu: 200m
|
||||||
|
memory: 256Mi
|
||||||
|
securityContext:
|
||||||
|
allowPrivilegeEscalation: false
|
||||||
|
capabilities:
|
||||||
|
drop: ["ALL"]
|
||||||
|
runAsNonRoot: true
|
||||||
|
runAsUser: 65532
|
||||||
|
|
||||||
27
services/maintenance/soteria-ingress.yaml
Normal file
27
services/maintenance/soteria-ingress.yaml
Normal file
@ -0,0 +1,27 @@
|
|||||||
|
# services/maintenance/soteria-ingress.yaml
|
||||||
|
apiVersion: networking.k8s.io/v1
|
||||||
|
kind: Ingress
|
||||||
|
metadata:
|
||||||
|
name: soteria
|
||||||
|
namespace: maintenance
|
||||||
|
annotations:
|
||||||
|
kubernetes.io/ingress.class: traefik
|
||||||
|
traefik.ingress.kubernetes.io/router.entrypoints: websecure
|
||||||
|
traefik.ingress.kubernetes.io/router.tls: "true"
|
||||||
|
traefik.ingress.kubernetes.io/router.middlewares: ""
|
||||||
|
spec:
|
||||||
|
ingressClassName: traefik
|
||||||
|
tls:
|
||||||
|
- hosts: ["backup.bstein.dev"]
|
||||||
|
secretName: backup-tls
|
||||||
|
rules:
|
||||||
|
- host: backup.bstein.dev
|
||||||
|
http:
|
||||||
|
paths:
|
||||||
|
- path: /
|
||||||
|
pathType: Prefix
|
||||||
|
backend:
|
||||||
|
service:
|
||||||
|
name: oauth2-proxy-soteria
|
||||||
|
port:
|
||||||
|
number: 80
|
||||||
27
services/maintenance/soteria-networkpolicy.yaml
Normal file
27
services/maintenance/soteria-networkpolicy.yaml
Normal file
@ -0,0 +1,27 @@
|
|||||||
|
# services/maintenance/soteria-networkpolicy.yaml
|
||||||
|
apiVersion: networking.k8s.io/v1
|
||||||
|
kind: NetworkPolicy
|
||||||
|
metadata:
|
||||||
|
name: soteria-ingress
|
||||||
|
namespace: maintenance
|
||||||
|
spec:
|
||||||
|
podSelector:
|
||||||
|
matchLabels:
|
||||||
|
app: soteria
|
||||||
|
policyTypes:
|
||||||
|
- Ingress
|
||||||
|
ingress:
|
||||||
|
- from:
|
||||||
|
- podSelector:
|
||||||
|
matchLabels:
|
||||||
|
app: oauth2-proxy-soteria
|
||||||
|
ports:
|
||||||
|
- protocol: TCP
|
||||||
|
port: 8080
|
||||||
|
- from:
|
||||||
|
- namespaceSelector:
|
||||||
|
matchLabels:
|
||||||
|
kubernetes.io/metadata.name: monitoring
|
||||||
|
ports:
|
||||||
|
- protocol: TCP
|
||||||
|
port: 8080
|
||||||
29
services/maintenance/soteria-rbac.yaml
Normal file
29
services/maintenance/soteria-rbac.yaml
Normal file
@ -0,0 +1,29 @@
|
|||||||
|
# services/maintenance/soteria-rbac.yaml
|
||||||
|
apiVersion: rbac.authorization.k8s.io/v1
|
||||||
|
kind: ClusterRole
|
||||||
|
metadata:
|
||||||
|
name: soteria
|
||||||
|
rules:
|
||||||
|
- apiGroups: [""]
|
||||||
|
resources: ["persistentvolumeclaims", "persistentvolumes"]
|
||||||
|
verbs: ["get", "list"]
|
||||||
|
- apiGroups: [""]
|
||||||
|
resources: ["secrets"]
|
||||||
|
verbs: ["get", "list", "create", "update", "delete"]
|
||||||
|
- apiGroups: ["batch"]
|
||||||
|
resources: ["jobs"]
|
||||||
|
verbs: ["get", "list", "create", "delete"]
|
||||||
|
---
|
||||||
|
apiVersion: rbac.authorization.k8s.io/v1
|
||||||
|
kind: ClusterRoleBinding
|
||||||
|
metadata:
|
||||||
|
name: soteria
|
||||||
|
roleRef:
|
||||||
|
apiGroup: rbac.authorization.k8s.io
|
||||||
|
kind: ClusterRole
|
||||||
|
name: soteria
|
||||||
|
subjects:
|
||||||
|
- kind: ServiceAccount
|
||||||
|
name: soteria
|
||||||
|
namespace: maintenance
|
||||||
|
|
||||||
21
services/maintenance/soteria-service.yaml
Normal file
21
services/maintenance/soteria-service.yaml
Normal file
@ -0,0 +1,21 @@
|
|||||||
|
# services/maintenance/soteria-service.yaml
|
||||||
|
apiVersion: v1
|
||||||
|
kind: Service
|
||||||
|
metadata:
|
||||||
|
name: soteria
|
||||||
|
namespace: maintenance
|
||||||
|
labels:
|
||||||
|
app: soteria
|
||||||
|
annotations:
|
||||||
|
prometheus.io/scrape: "true"
|
||||||
|
prometheus.io/port: "80"
|
||||||
|
prometheus.io/path: "/metrics"
|
||||||
|
spec:
|
||||||
|
type: ClusterIP
|
||||||
|
selector:
|
||||||
|
app: soteria
|
||||||
|
ports:
|
||||||
|
- name: http
|
||||||
|
port: 80
|
||||||
|
targetPort: http
|
||||||
|
|
||||||
9
services/maintenance/soteria-serviceaccount.yaml
Normal file
9
services/maintenance/soteria-serviceaccount.yaml
Normal file
@ -0,0 +1,9 @@
|
|||||||
|
# services/maintenance/soteria-serviceaccount.yaml
|
||||||
|
apiVersion: v1
|
||||||
|
kind: ServiceAccount
|
||||||
|
metadata:
|
||||||
|
name: soteria
|
||||||
|
namespace: maintenance
|
||||||
|
imagePullSecrets:
|
||||||
|
- name: harbor-regcred
|
||||||
|
|
||||||
@ -1970,7 +1970,7 @@
|
|||||||
},
|
},
|
||||||
"targets": [
|
"targets": [
|
||||||
{
|
{
|
||||||
"expr": "sort_desc(max by (namespace, pvc) (pvc_backup_age_hours))",
|
"expr": "sort_desc(max by (namespace, pvc) (pvc_backup_age_hours or on(namespace, pvc) ((1 - pvc_backup_health) * 999)))",
|
||||||
"refId": "A",
|
"refId": "A",
|
||||||
"legendFormat": "{{namespace}}/{{pvc}}",
|
"legendFormat": "{{namespace}}/{{pvc}}",
|
||||||
"instant": true
|
"instant": true
|
||||||
@ -2034,7 +2034,7 @@
|
|||||||
"targetBlank": true
|
"targetBlank": true
|
||||||
}
|
}
|
||||||
],
|
],
|
||||||
"description": "Oldest backup age in hours by PVC. This panel is reserved for the upcoming PVC backup health feed and will show no data until those metrics are published."
|
"description": "Oldest successful backup age in hours by PVC. PVCs with missing or unhealthy backup state are forced to 999h so the red bars stay visible in the overview."
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"id": 30,
|
"id": 30,
|
||||||
|
|||||||
@ -447,6 +447,150 @@ data:
|
|||||||
summary: "Legacy cronjob alert disabled"
|
summary: "Legacy cronjob alert disabled"
|
||||||
labels:
|
labels:
|
||||||
severity: info
|
severity: info
|
||||||
|
- uid: maint-soteria-refresh-stale
|
||||||
|
title: "Soteria inventory refresh stale (>15m)"
|
||||||
|
condition: C
|
||||||
|
for: "15m"
|
||||||
|
data:
|
||||||
|
- refId: A
|
||||||
|
relativeTimeRange:
|
||||||
|
from: 900
|
||||||
|
to: 0
|
||||||
|
datasourceUid: atlas-vm
|
||||||
|
model:
|
||||||
|
expr: time() - soteria_inventory_refresh_timestamp_seconds
|
||||||
|
intervalMs: 60000
|
||||||
|
maxDataPoints: 43200
|
||||||
|
legendFormat: soteria-refresh-age-seconds
|
||||||
|
datasource:
|
||||||
|
type: prometheus
|
||||||
|
uid: atlas-vm
|
||||||
|
- refId: B
|
||||||
|
datasourceUid: __expr__
|
||||||
|
model:
|
||||||
|
expression: A
|
||||||
|
intervalMs: 60000
|
||||||
|
maxDataPoints: 43200
|
||||||
|
reducer: last
|
||||||
|
type: reduce
|
||||||
|
- refId: C
|
||||||
|
datasourceUid: __expr__
|
||||||
|
model:
|
||||||
|
expression: B
|
||||||
|
intervalMs: 60000
|
||||||
|
maxDataPoints: 43200
|
||||||
|
type: threshold
|
||||||
|
conditions:
|
||||||
|
- evaluator:
|
||||||
|
params: [900]
|
||||||
|
type: gt
|
||||||
|
operator:
|
||||||
|
type: and
|
||||||
|
reducer:
|
||||||
|
type: last
|
||||||
|
type: query
|
||||||
|
noDataState: Alerting
|
||||||
|
execErrState: Alerting
|
||||||
|
annotations:
|
||||||
|
summary: "Soteria inventory telemetry has not refreshed in >15m"
|
||||||
|
labels:
|
||||||
|
severity: warning
|
||||||
|
- uid: maint-soteria-backup-unhealthy
|
||||||
|
title: "Soteria reports unhealthy PVC backups"
|
||||||
|
condition: C
|
||||||
|
for: "10m"
|
||||||
|
data:
|
||||||
|
- refId: A
|
||||||
|
relativeTimeRange:
|
||||||
|
from: 600
|
||||||
|
to: 0
|
||||||
|
datasourceUid: atlas-vm
|
||||||
|
model:
|
||||||
|
expr: sum((1 - pvc_backup_health{driver="longhorn"}) > bool 0) or on() vector(0)
|
||||||
|
intervalMs: 60000
|
||||||
|
maxDataPoints: 43200
|
||||||
|
legendFormat: unhealthy-pvcs
|
||||||
|
datasource:
|
||||||
|
type: prometheus
|
||||||
|
uid: atlas-vm
|
||||||
|
- refId: B
|
||||||
|
datasourceUid: __expr__
|
||||||
|
model:
|
||||||
|
expression: A
|
||||||
|
intervalMs: 60000
|
||||||
|
maxDataPoints: 43200
|
||||||
|
reducer: last
|
||||||
|
type: reduce
|
||||||
|
- refId: C
|
||||||
|
datasourceUid: __expr__
|
||||||
|
model:
|
||||||
|
expression: B
|
||||||
|
intervalMs: 60000
|
||||||
|
maxDataPoints: 43200
|
||||||
|
type: threshold
|
||||||
|
conditions:
|
||||||
|
- evaluator:
|
||||||
|
params: [0]
|
||||||
|
type: gt
|
||||||
|
operator:
|
||||||
|
type: and
|
||||||
|
reducer:
|
||||||
|
type: last
|
||||||
|
type: query
|
||||||
|
noDataState: OK
|
||||||
|
execErrState: Alerting
|
||||||
|
annotations:
|
||||||
|
summary: "One or more PVCs are stale, missing, or failed per Soteria backup health"
|
||||||
|
labels:
|
||||||
|
severity: warning
|
||||||
|
- uid: maint-soteria-authz-denials
|
||||||
|
title: "Soteria authorization denials elevated"
|
||||||
|
condition: C
|
||||||
|
for: "10m"
|
||||||
|
data:
|
||||||
|
- refId: A
|
||||||
|
relativeTimeRange:
|
||||||
|
from: 900
|
||||||
|
to: 0
|
||||||
|
datasourceUid: atlas-vm
|
||||||
|
model:
|
||||||
|
expr: sum(increase(soteria_authz_denials_total[15m])) or on() vector(0)
|
||||||
|
intervalMs: 60000
|
||||||
|
maxDataPoints: 43200
|
||||||
|
legendFormat: soteria-authz-denials-15m
|
||||||
|
datasource:
|
||||||
|
type: prometheus
|
||||||
|
uid: atlas-vm
|
||||||
|
- refId: B
|
||||||
|
datasourceUid: __expr__
|
||||||
|
model:
|
||||||
|
expression: A
|
||||||
|
intervalMs: 60000
|
||||||
|
maxDataPoints: 43200
|
||||||
|
reducer: last
|
||||||
|
type: reduce
|
||||||
|
- refId: C
|
||||||
|
datasourceUid: __expr__
|
||||||
|
model:
|
||||||
|
expression: B
|
||||||
|
intervalMs: 60000
|
||||||
|
maxDataPoints: 43200
|
||||||
|
type: threshold
|
||||||
|
conditions:
|
||||||
|
- evaluator:
|
||||||
|
params: [10]
|
||||||
|
type: gt
|
||||||
|
operator:
|
||||||
|
type: and
|
||||||
|
reducer:
|
||||||
|
type: last
|
||||||
|
type: query
|
||||||
|
noDataState: OK
|
||||||
|
execErrState: Alerting
|
||||||
|
annotations:
|
||||||
|
summary: "Soteria saw >10 authorization denials in 15m"
|
||||||
|
labels:
|
||||||
|
severity: warning
|
||||||
- orgId: 1
|
- orgId: 1
|
||||||
name: ariadne
|
name: ariadne
|
||||||
folder: Alerts
|
folder: Alerts
|
||||||
|
|||||||
@ -1979,7 +1979,7 @@ data:
|
|||||||
},
|
},
|
||||||
"targets": [
|
"targets": [
|
||||||
{
|
{
|
||||||
"expr": "sort_desc(max by (namespace, pvc) (pvc_backup_age_hours))",
|
"expr": "sort_desc(max by (namespace, pvc) (pvc_backup_age_hours or on(namespace, pvc) ((1 - pvc_backup_health) * 999)))",
|
||||||
"refId": "A",
|
"refId": "A",
|
||||||
"legendFormat": "{{namespace}}/{{pvc}}",
|
"legendFormat": "{{namespace}}/{{pvc}}",
|
||||||
"instant": true
|
"instant": true
|
||||||
@ -2043,7 +2043,7 @@ data:
|
|||||||
"targetBlank": true
|
"targetBlank": true
|
||||||
}
|
}
|
||||||
],
|
],
|
||||||
"description": "Oldest backup age in hours by PVC. This panel is reserved for the upcoming PVC backup health feed and will show no data until those metrics are published."
|
"description": "Oldest successful backup age in hours by PVC. PVCs with missing or unhealthy backup state are forced to 999h so the red bars stay visible in the overview."
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"id": 30,
|
"id": 30,
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user