Compare commits
No commits in common. "82cab1ce2ac99bf1b6557b5dfa37f05c5d33b56d" and "8331411b93958db13680bda32cf473d2bf18b15b" have entirely different histories.
82cab1ce2a
...
8331411b93
@ -475,7 +475,7 @@ PLATFORM_TEST_SUCCESS_RATE_24H_BY_SUITE = (
|
|||||||
f'/ clamp_min((sum by (suite) (increase(platform_quality_gate_runs_total{{suite=~"{PLATFORM_TEST_SUITE_MATCHER}"}}[24h]))), 1)) '
|
f'/ clamp_min((sum by (suite) (increase(platform_quality_gate_runs_total{{suite=~"{PLATFORM_TEST_SUITE_MATCHER}"}}[24h]))), 1)) '
|
||||||
f'and on(suite) ((sum by (suite) (increase(platform_quality_gate_runs_total{{suite=~"{PLATFORM_TEST_SUITE_MATCHER}"}}[24h]))) > 0))'
|
f'and on(suite) ((sum by (suite) (increase(platform_quality_gate_runs_total{{suite=~"{PLATFORM_TEST_SUITE_MATCHER}"}}[24h]))) > 0))'
|
||||||
)
|
)
|
||||||
PVC_BACKUP_AGE_HOURS_BY_PVC = "sort_desc(max by (namespace, pvc) (pvc_backup_age_hours or on(namespace, pvc) ((1 - pvc_backup_health) * 999)))"
|
PVC_BACKUP_AGE_HOURS_BY_PVC = "sort_desc(max by (namespace, pvc) (pvc_backup_age_hours))"
|
||||||
ANANKE_SELECTOR = 'job="ananke-power"'
|
ANANKE_SELECTOR = 'job="ananke-power"'
|
||||||
ANANKE_UPS_DB_NAME = "Pyrphoros"
|
ANANKE_UPS_DB_NAME = "Pyrphoros"
|
||||||
ANANKE_UPS_DB_NODE = "titan-db"
|
ANANKE_UPS_DB_NODE = "titan-db"
|
||||||
@ -1566,7 +1566,7 @@ def build_overview():
|
|||||||
)
|
)
|
||||||
panels[-1]["links"] = link_to("atlas-storage")
|
panels[-1]["links"] = link_to("atlas-storage")
|
||||||
panels[-1]["description"] = (
|
panels[-1]["description"] = (
|
||||||
"Oldest successful backup age in hours by PVC. PVCs with missing or unhealthy backup state are forced to 999h so the red bars stay visible in the overview."
|
"Oldest backup age in hours by PVC. This panel is reserved for the upcoming PVC backup health feed and will show no data until those metrics are published."
|
||||||
)
|
)
|
||||||
|
|
||||||
panels.append(
|
panels.append(
|
||||||
|
|||||||
@ -23,7 +23,6 @@ resources:
|
|||||||
- oneoffs/synapse-oidc-secret-ensure-job.yaml
|
- oneoffs/synapse-oidc-secret-ensure-job.yaml
|
||||||
- oneoffs/logs-oidc-secret-ensure-job.yaml
|
- oneoffs/logs-oidc-secret-ensure-job.yaml
|
||||||
- oneoffs/metis-oidc-secret-ensure-job.yaml
|
- oneoffs/metis-oidc-secret-ensure-job.yaml
|
||||||
- oneoffs/soteria-oidc-secret-ensure-job.yaml
|
|
||||||
- oneoffs/metis-ssh-keys-secret-ensure-job.yaml
|
- oneoffs/metis-ssh-keys-secret-ensure-job.yaml
|
||||||
- oneoffs/harbor-oidc-secret-ensure-job.yaml
|
- oneoffs/harbor-oidc-secret-ensure-job.yaml
|
||||||
- oneoffs/vault-oidc-secret-ensure-job.yaml
|
- oneoffs/vault-oidc-secret-ensure-job.yaml
|
||||||
|
|||||||
@ -1,198 +0,0 @@
|
|||||||
# services/keycloak/oneoffs/soteria-oidc-secret-ensure-job.yaml
|
|
||||||
# One-off job for sso/soteria-oidc-secret-ensure-1.
|
|
||||||
# Purpose: ensure the Soteria oauth2-proxy OIDC client and Vault secret exist.
|
|
||||||
# Keep this completed Job around; bump the suffix if it ever needs to be rerun.
|
|
||||||
apiVersion: batch/v1
|
|
||||||
kind: Job
|
|
||||||
metadata:
|
|
||||||
name: soteria-oidc-secret-ensure-1
|
|
||||||
namespace: sso
|
|
||||||
spec:
|
|
||||||
backoffLimit: 0
|
|
||||||
template:
|
|
||||||
metadata:
|
|
||||||
annotations:
|
|
||||||
vault.hashicorp.com/agent-inject: "true"
|
|
||||||
vault.hashicorp.com/agent-pre-populate-only: "true"
|
|
||||||
vault.hashicorp.com/role: "sso-secrets"
|
|
||||||
vault.hashicorp.com/agent-inject-secret-keycloak-admin-env.sh: "kv/data/atlas/shared/keycloak-admin"
|
|
||||||
vault.hashicorp.com/agent-inject-template-keycloak-admin-env.sh: |
|
|
||||||
{{ with secret "kv/data/atlas/shared/keycloak-admin" }}
|
|
||||||
export KEYCLOAK_ADMIN="{{ .Data.data.username }}"
|
|
||||||
export KEYCLOAK_ADMIN_USER="{{ .Data.data.username }}"
|
|
||||||
export KEYCLOAK_ADMIN_PASSWORD="{{ .Data.data.password }}"
|
|
||||||
{{ end }}
|
|
||||||
spec:
|
|
||||||
serviceAccountName: mas-secrets-ensure
|
|
||||||
restartPolicy: Never
|
|
||||||
affinity:
|
|
||||||
nodeAffinity:
|
|
||||||
requiredDuringSchedulingIgnoredDuringExecution:
|
|
||||||
nodeSelectorTerms:
|
|
||||||
- matchExpressions:
|
|
||||||
- key: node-role.kubernetes.io/worker
|
|
||||||
operator: Exists
|
|
||||||
preferredDuringSchedulingIgnoredDuringExecution:
|
|
||||||
- weight: 100
|
|
||||||
preference:
|
|
||||||
matchExpressions:
|
|
||||||
- key: kubernetes.io/arch
|
|
||||||
operator: In
|
|
||||||
values: ["arm64"]
|
|
||||||
containers:
|
|
||||||
- name: apply
|
|
||||||
image: bitnami/kubectl@sha256:554ab88b1858e8424c55de37ad417b16f2a0e65d1607aa0f3fe3ce9b9f10b131
|
|
||||||
command: ["/bin/sh", "-c"]
|
|
||||||
args:
|
|
||||||
- |
|
|
||||||
set -euo pipefail
|
|
||||||
. /vault/secrets/keycloak-admin-env.sh
|
|
||||||
KC_URL="http://keycloak.sso.svc.cluster.local"
|
|
||||||
ACCESS_TOKEN=""
|
|
||||||
for attempt in 1 2 3 4 5; do
|
|
||||||
TOKEN_JSON="$(curl -sS -X POST "$KC_URL/realms/master/protocol/openid-connect/token" \
|
|
||||||
-H 'Content-Type: application/x-www-form-urlencoded' \
|
|
||||||
-d "grant_type=password" \
|
|
||||||
-d "client_id=admin-cli" \
|
|
||||||
-d "username=${KEYCLOAK_ADMIN}" \
|
|
||||||
-d "password=${KEYCLOAK_ADMIN_PASSWORD}" || true)"
|
|
||||||
ACCESS_TOKEN="$(echo "$TOKEN_JSON" | jq -r '.access_token' 2>/dev/null || true)"
|
|
||||||
if [ -n "$ACCESS_TOKEN" ] && [ "$ACCESS_TOKEN" != "null" ]; then
|
|
||||||
break
|
|
||||||
fi
|
|
||||||
echo "Keycloak token request failed (attempt ${attempt})" >&2
|
|
||||||
sleep $((attempt * 2))
|
|
||||||
done
|
|
||||||
if [ -z "$ACCESS_TOKEN" ] || [ "$ACCESS_TOKEN" = "null" ]; then
|
|
||||||
echo "Failed to fetch Keycloak admin token" >&2
|
|
||||||
exit 1
|
|
||||||
fi
|
|
||||||
|
|
||||||
CLIENT_QUERY="$(curl -sS -H "Authorization: Bearer ${ACCESS_TOKEN}" \
|
|
||||||
"$KC_URL/admin/realms/atlas/clients?clientId=soteria" || true)"
|
|
||||||
CLIENT_ID="$(echo "$CLIENT_QUERY" | jq -r '.[0].id' 2>/dev/null || true)"
|
|
||||||
|
|
||||||
if [ -z "$CLIENT_ID" ] || [ "$CLIENT_ID" = "null" ]; then
|
|
||||||
create_payload='{"clientId":"soteria","enabled":true,"protocol":"openid-connect","publicClient":false,"standardFlowEnabled":true,"implicitFlowEnabled":false,"directAccessGrantsEnabled":false,"serviceAccountsEnabled":false,"redirectUris":["https://backup.bstein.dev/oauth2/callback"],"webOrigins":["https://backup.bstein.dev"],"rootUrl":"https://backup.bstein.dev","baseUrl":"/"}'
|
|
||||||
status="$(curl -sS -o /dev/null -w "%{http_code}" -X POST \
|
|
||||||
-H "Authorization: Bearer ${ACCESS_TOKEN}" \
|
|
||||||
-H 'Content-Type: application/json' \
|
|
||||||
-d "${create_payload}" \
|
|
||||||
"$KC_URL/admin/realms/atlas/clients")"
|
|
||||||
if [ "$status" != "201" ] && [ "$status" != "204" ] && [ "$status" != "409" ]; then
|
|
||||||
echo "Keycloak client create failed (status ${status})" >&2
|
|
||||||
exit 1
|
|
||||||
fi
|
|
||||||
CLIENT_QUERY="$(curl -sS -H "Authorization: Bearer ${ACCESS_TOKEN}" \
|
|
||||||
"$KC_URL/admin/realms/atlas/clients?clientId=soteria" || true)"
|
|
||||||
CLIENT_ID="$(echo "$CLIENT_QUERY" | jq -r '.[0].id' 2>/dev/null || true)"
|
|
||||||
fi
|
|
||||||
|
|
||||||
if [ -z "$CLIENT_ID" ] || [ "$CLIENT_ID" = "null" ]; then
|
|
||||||
echo "Keycloak client soteria not found" >&2
|
|
||||||
exit 1
|
|
||||||
fi
|
|
||||||
|
|
||||||
SCOPE_ID="$(curl -sS -H "Authorization: Bearer ${ACCESS_TOKEN}" \
|
|
||||||
"$KC_URL/admin/realms/atlas/client-scopes?search=groups" | jq -r '.[] | select(.name=="groups") | .id' 2>/dev/null | head -n1 || true)"
|
|
||||||
if [ -z "$SCOPE_ID" ] || [ "$SCOPE_ID" = "null" ]; then
|
|
||||||
echo "Keycloak client scope groups not found" >&2
|
|
||||||
exit 1
|
|
||||||
fi
|
|
||||||
|
|
||||||
DEFAULT_SCOPES="$(curl -sS -H "Authorization: Bearer ${ACCESS_TOKEN}" \
|
|
||||||
"$KC_URL/admin/realms/atlas/clients/${CLIENT_ID}/default-client-scopes" || true)"
|
|
||||||
OPTIONAL_SCOPES="$(curl -sS -H "Authorization: Bearer ${ACCESS_TOKEN}" \
|
|
||||||
"$KC_URL/admin/realms/atlas/clients/${CLIENT_ID}/optional-client-scopes" || true)"
|
|
||||||
|
|
||||||
if ! echo "$DEFAULT_SCOPES" | jq -e '.[] | select(.name=="groups")' >/dev/null 2>&1 \
|
|
||||||
&& ! echo "$OPTIONAL_SCOPES" | jq -e '.[] | select(.name=="groups")' >/dev/null 2>&1; then
|
|
||||||
status="$(curl -sS -o /dev/null -w "%{http_code}" -X PUT \
|
|
||||||
-H "Authorization: Bearer ${ACCESS_TOKEN}" \
|
|
||||||
"$KC_URL/admin/realms/atlas/clients/${CLIENT_ID}/optional-client-scopes/${SCOPE_ID}")"
|
|
||||||
if [ "$status" != "200" ] && [ "$status" != "201" ] && [ "$status" != "204" ]; then
|
|
||||||
status="$(curl -sS -o /dev/null -w "%{http_code}" -X POST \
|
|
||||||
-H "Authorization: Bearer ${ACCESS_TOKEN}" \
|
|
||||||
"$KC_URL/admin/realms/atlas/clients/${CLIENT_ID}/optional-client-scopes/${SCOPE_ID}")"
|
|
||||||
if [ "$status" != "200" ] && [ "$status" != "201" ] && [ "$status" != "204" ]; then
|
|
||||||
echo "Failed to attach groups client scope to soteria (status ${status})" >&2
|
|
||||||
exit 1
|
|
||||||
fi
|
|
||||||
fi
|
|
||||||
fi
|
|
||||||
|
|
||||||
update_payload='{"enabled":true,"clientId":"soteria","protocol":"openid-connect","publicClient":false,"standardFlowEnabled":true,"implicitFlowEnabled":false,"directAccessGrantsEnabled":false,"serviceAccountsEnabled":false,"redirectUris":["https://backup.bstein.dev/oauth2/callback"],"webOrigins":["https://backup.bstein.dev"],"rootUrl":"https://backup.bstein.dev","baseUrl":"/"}'
|
|
||||||
status="$(curl -sS -o /dev/null -w "%{http_code}" -X PUT \
|
|
||||||
-H "Authorization: Bearer ${ACCESS_TOKEN}" \
|
|
||||||
-H 'Content-Type: application/json' \
|
|
||||||
-d "${update_payload}" \
|
|
||||||
"$KC_URL/admin/realms/atlas/clients/${CLIENT_ID}")"
|
|
||||||
if [ "$status" != "204" ]; then
|
|
||||||
echo "Keycloak client update failed (status ${status})" >&2
|
|
||||||
exit 1
|
|
||||||
fi
|
|
||||||
|
|
||||||
CLIENT_SECRET="$(curl -sS -H "Authorization: Bearer ${ACCESS_TOKEN}" \
|
|
||||||
"$KC_URL/admin/realms/atlas/clients/${CLIENT_ID}/client-secret" | jq -r '.value' 2>/dev/null || true)"
|
|
||||||
if [ -z "$CLIENT_SECRET" ] || [ "$CLIENT_SECRET" = "null" ]; then
|
|
||||||
echo "Keycloak client secret not found" >&2
|
|
||||||
exit 1
|
|
||||||
fi
|
|
||||||
|
|
||||||
vault_addr="${VAULT_ADDR:-http://vault.vault.svc.cluster.local:8200}"
|
|
||||||
vault_role="${VAULT_ROLE:-sso-secrets}"
|
|
||||||
jwt="$(cat /var/run/secrets/kubernetes.io/serviceaccount/token)"
|
|
||||||
login_payload="$(jq -nc --arg jwt "${jwt}" --arg role "${vault_role}" '{jwt:$jwt, role:$role}')"
|
|
||||||
vault_token="$(curl -sS --request POST --data "${login_payload}" \
|
|
||||||
"${vault_addr}/v1/auth/kubernetes/login" | jq -r '.auth.client_token')"
|
|
||||||
if [ -z "${vault_token}" ] || [ "${vault_token}" = "null" ]; then
|
|
||||||
echo "vault login failed" >&2
|
|
||||||
exit 1
|
|
||||||
fi
|
|
||||||
|
|
||||||
read_status="$(curl -sS -o /tmp/soteria-oidc-read.json -w "%{http_code}" \
|
|
||||||
-H "X-Vault-Token: ${vault_token}" \
|
|
||||||
"${vault_addr}/v1/kv/data/atlas/maintenance/soteria-oidc" || true)"
|
|
||||||
COOKIE_SECRET=""
|
|
||||||
if [ "${read_status}" = "200" ]; then
|
|
||||||
COOKIE_SECRET="$(jq -r '.data.data.cookie_secret // empty' /tmp/soteria-oidc-read.json)"
|
|
||||||
elif [ "${read_status}" != "404" ]; then
|
|
||||||
echo "Vault read failed (status ${read_status})" >&2
|
|
||||||
cat /tmp/soteria-oidc-read.json >&2 || true
|
|
||||||
exit 1
|
|
||||||
fi
|
|
||||||
if [ -n "${COOKIE_SECRET}" ]; then
|
|
||||||
length="$(printf '%s' "${COOKIE_SECRET}" | wc -c | tr -d ' ')"
|
|
||||||
if [ "${length}" != "16" ] && [ "${length}" != "24" ] && [ "${length}" != "32" ]; then
|
|
||||||
COOKIE_SECRET=""
|
|
||||||
fi
|
|
||||||
fi
|
|
||||||
if [ -z "${COOKIE_SECRET}" ]; then
|
|
||||||
COOKIE_SECRET="$(openssl rand -hex 16 | tr -d '\n')"
|
|
||||||
fi
|
|
||||||
|
|
||||||
payload="$(jq -nc \
|
|
||||||
--arg client_id "soteria" \
|
|
||||||
--arg client_secret "${CLIENT_SECRET}" \
|
|
||||||
--arg cookie_secret "${COOKIE_SECRET}" \
|
|
||||||
'{data:{client_id:$client_id,client_secret:$client_secret,cookie_secret:$cookie_secret}}')"
|
|
||||||
write_status="$(curl -sS -o /tmp/soteria-oidc-write.json -w "%{http_code}" -X POST \
|
|
||||||
-H "X-Vault-Token: ${vault_token}" \
|
|
||||||
-H 'Content-Type: application/json' \
|
|
||||||
-d "${payload}" "${vault_addr}/v1/kv/data/atlas/maintenance/soteria-oidc")"
|
|
||||||
if [ "${write_status}" != "200" ] && [ "${write_status}" != "204" ]; then
|
|
||||||
echo "Vault write failed (status ${write_status})" >&2
|
|
||||||
cat /tmp/soteria-oidc-write.json >&2 || true
|
|
||||||
exit 1
|
|
||||||
fi
|
|
||||||
|
|
||||||
verify_status="$(curl -sS -o /tmp/soteria-oidc-verify.json -w "%{http_code}" \
|
|
||||||
-H "X-Vault-Token: ${vault_token}" \
|
|
||||||
"${vault_addr}/v1/kv/data/atlas/maintenance/soteria-oidc" || true)"
|
|
||||||
if [ "${verify_status}" != "200" ]; then
|
|
||||||
echo "Vault verify failed (status ${verify_status})" >&2
|
|
||||||
cat /tmp/soteria-oidc-verify.json >&2 || true
|
|
||||||
exit 1
|
|
||||||
fi
|
|
||||||
|
|
||||||
echo "Soteria OIDC secret ready in Vault"
|
|
||||||
@ -1,60 +0,0 @@
|
|||||||
# Soteria PVC Restore Drill (backup.bstein.dev)
|
|
||||||
|
|
||||||
Use this checklist after meaningful Soteria backup, restore, auth, or alerting changes.
|
|
||||||
|
|
||||||
## Production Restore Drill Checklist
|
|
||||||
|
|
||||||
1. Verify baseline health before touching restores.
|
|
||||||
- `flux get kustomizations -n flux-system maintenance`
|
|
||||||
- `kubectl -n maintenance get deploy soteria oauth2-proxy-soteria`
|
|
||||||
2. Confirm operator access and source safety.
|
|
||||||
- Operator must be in Keycloak group `admin` or `maintenance`.
|
|
||||||
- Choose a real source PVC that is expected to be backed up, not a throwaway test PVC.
|
|
||||||
3. Run the UI flow at `https://backup.bstein.dev`.
|
|
||||||
- Sign in via Keycloak.
|
|
||||||
- In `PVC Inventory`, select source namespace and PVC.
|
|
||||||
- Click `Backup now` and wait for success in `Last Action`.
|
|
||||||
- Click `Restore` and pick a completed snapshot.
|
|
||||||
- Set `Target namespace` and unique `Target PVC name` (`restore-<source-pvc>-<date>`).
|
|
||||||
- Click `Create restore PVC`.
|
|
||||||
4. Validate restore output.
|
|
||||||
- `kubectl -n <target-namespace> get pvc <target-pvc>`
|
|
||||||
- If workload-level validation is required, attach a temporary pod and inspect expected files/data.
|
|
||||||
5. Clean up.
|
|
||||||
- `kubectl -n <target-namespace> delete pvc <target-pvc>`
|
|
||||||
- Remove detached restore Longhorn volume from Longhorn UI/API if one remains.
|
|
||||||
|
|
||||||
## Alert Query Verification (`maint-soteria-*`)
|
|
||||||
|
|
||||||
Start a local query endpoint:
|
|
||||||
|
|
||||||
`kubectl -n monitoring port-forward svc/victoria-metrics-k8s-stack 8428:8428`
|
|
||||||
|
|
||||||
Validate each alert expression directly.
|
|
||||||
|
|
||||||
1. `maint-soteria-refresh-stale` (`time() - soteria_inventory_refresh_timestamp_seconds`, threshold `> 900`).
|
|
||||||
- `curl -fsS --get 'http://127.0.0.1:8428/api/v1/query' --data-urlencode 'query=time() - soteria_inventory_refresh_timestamp_seconds'`
|
|
||||||
- `curl -fsS --get 'http://127.0.0.1:8428/api/v1/query' --data-urlencode 'query=(time() - soteria_inventory_refresh_timestamp_seconds) > bool 900'`
|
|
||||||
- Healthy expectation: age is below `900` and threshold query returns `0`.
|
|
||||||
2. `maint-soteria-backup-unhealthy` (`sum((1 - pvc_backup_health{driver="longhorn"}) > bool 0) or on() vector(0)`, threshold `> 0`).
|
|
||||||
- `curl -fsS --get 'http://127.0.0.1:8428/api/v1/query' --data-urlencode 'query=sum((1 - pvc_backup_health{driver="longhorn"}) > bool 0) or on() vector(0)'`
|
|
||||||
- `curl -fsS --get 'http://127.0.0.1:8428/api/v1/query' --data-urlencode 'query=(1 - pvc_backup_health{driver="longhorn"}) > bool 0'`
|
|
||||||
- `curl -fsS --get 'http://127.0.0.1:8428/api/v1/query' --data-urlencode 'query=max by (namespace,pvc) (pvc_backup_age_hours{driver="longhorn"})'`
|
|
||||||
- Healthy expectation: unhealthy count is `0`; no series should be `1` in the per-PVC unhealthy query.
|
|
||||||
3. `maint-soteria-authz-denials` (`sum(increase(soteria_authz_denials_total[15m])) or on() vector(0)`, threshold `> 9` for 10m).
|
|
||||||
- `curl -fsS --get 'http://127.0.0.1:8428/api/v1/query' --data-urlencode 'query=sum(increase(soteria_authz_denials_total[15m])) or on() vector(0)'`
|
|
||||||
- `curl -fsS --get 'http://127.0.0.1:8428/api/v1/query' --data-urlencode 'query=sum by (reason) (increase(soteria_authz_denials_total[15m]))'`
|
|
||||||
- Healthy expectation: total remains below `10` in normal operation; spikes should map to expected `reason` labels.
|
|
||||||
|
|
||||||
## Failure Triage
|
|
||||||
|
|
||||||
- `401/403` on UI or API:
|
|
||||||
- Verify oauth2-proxy group claims include `admin` or `maintenance`.
|
|
||||||
- Restore conflict:
|
|
||||||
- Target PVC already exists; choose a new target PVC name.
|
|
||||||
- `maint-soteria-refresh-stale` firing:
|
|
||||||
- Check Soteria pod health and `/metrics` scrape reachability from `monitoring`.
|
|
||||||
- `maint-soteria-backup-unhealthy` firing:
|
|
||||||
- Inspect `pvc_backup_health` and `pvc_backup_age_hours` to identify stale or missing backups.
|
|
||||||
- `maint-soteria-authz-denials` firing:
|
|
||||||
- Confirm expected OIDC groups and inspect denial `reason` labels for policy or header regressions.
|
|
||||||
@ -35,11 +35,6 @@ resources:
|
|||||||
- node-image-sweeper-daemonset.yaml
|
- node-image-sweeper-daemonset.yaml
|
||||||
- image-sweeper-cronjob.yaml
|
- image-sweeper-cronjob.yaml
|
||||||
- metis-service.yaml
|
- metis-service.yaml
|
||||||
- soteria-networkpolicy.yaml
|
|
||||||
- oauth2-proxy-soteria-networkpolicy.yaml
|
|
||||||
- soteria-ingress.yaml
|
|
||||||
- soteria-certificate.yaml
|
|
||||||
- oauth2-proxy-soteria.yaml
|
|
||||||
- oauth2-proxy-metis.yaml
|
- oauth2-proxy-metis.yaml
|
||||||
- metis-certificate.yaml
|
- metis-certificate.yaml
|
||||||
- metis-ingress.yaml
|
- metis-ingress.yaml
|
||||||
@ -48,8 +43,6 @@ images:
|
|||||||
newTag: 0.1.0-22 # {"$imagepolicy": "maintenance:ariadne:tag"}
|
newTag: 0.1.0-22 # {"$imagepolicy": "maintenance:ariadne:tag"}
|
||||||
- name: registry.bstein.dev/bstein/metis
|
- name: registry.bstein.dev/bstein/metis
|
||||||
newTag: 0.1.0-9-amd64
|
newTag: 0.1.0-9-amd64
|
||||||
- name: registry.bstein.dev/bstein/soteria
|
|
||||||
newTag: 0.1.0-21 # {"$imagepolicy": "maintenance:soteria:tag"}
|
|
||||||
configMapGenerator:
|
configMapGenerator:
|
||||||
- name: disable-k3s-traefik-script
|
- name: disable-k3s-traefik-script
|
||||||
namespace: maintenance
|
namespace: maintenance
|
||||||
|
|||||||
@ -1,23 +0,0 @@
|
|||||||
# services/maintenance/oauth2-proxy-soteria-networkpolicy.yaml
|
|
||||||
apiVersion: networking.k8s.io/v1
|
|
||||||
kind: NetworkPolicy
|
|
||||||
metadata:
|
|
||||||
name: oauth2-proxy-soteria-ingress
|
|
||||||
namespace: maintenance
|
|
||||||
spec:
|
|
||||||
podSelector:
|
|
||||||
matchLabels:
|
|
||||||
app: oauth2-proxy-soteria
|
|
||||||
policyTypes:
|
|
||||||
- Ingress
|
|
||||||
ingress:
|
|
||||||
- from:
|
|
||||||
- namespaceSelector:
|
|
||||||
matchLabels:
|
|
||||||
kubernetes.io/metadata.name: traefik
|
|
||||||
podSelector:
|
|
||||||
matchLabels:
|
|
||||||
app: traefik
|
|
||||||
ports:
|
|
||||||
- protocol: TCP
|
|
||||||
port: 4180
|
|
||||||
@ -1,120 +0,0 @@
|
|||||||
# services/maintenance/oauth2-proxy-soteria.yaml
|
|
||||||
apiVersion: v1
|
|
||||||
kind: Service
|
|
||||||
metadata:
|
|
||||||
name: oauth2-proxy-soteria
|
|
||||||
namespace: maintenance
|
|
||||||
labels:
|
|
||||||
app: oauth2-proxy-soteria
|
|
||||||
spec:
|
|
||||||
ports:
|
|
||||||
- name: http
|
|
||||||
port: 80
|
|
||||||
targetPort: 4180
|
|
||||||
selector:
|
|
||||||
app: oauth2-proxy-soteria
|
|
||||||
|
|
||||||
---
|
|
||||||
|
|
||||||
apiVersion: apps/v1
|
|
||||||
kind: Deployment
|
|
||||||
metadata:
|
|
||||||
name: oauth2-proxy-soteria
|
|
||||||
namespace: maintenance
|
|
||||||
labels:
|
|
||||||
app: oauth2-proxy-soteria
|
|
||||||
spec:
|
|
||||||
replicas: 2
|
|
||||||
selector:
|
|
||||||
matchLabels:
|
|
||||||
app: oauth2-proxy-soteria
|
|
||||||
template:
|
|
||||||
metadata:
|
|
||||||
labels:
|
|
||||||
app: oauth2-proxy-soteria
|
|
||||||
annotations:
|
|
||||||
vault.hashicorp.com/agent-inject: "true"
|
|
||||||
vault.hashicorp.com/role: "maintenance"
|
|
||||||
vault.hashicorp.com/agent-inject-secret-oidc-config: "kv/data/atlas/maintenance/soteria-oidc"
|
|
||||||
vault.hashicorp.com/agent-inject-template-oidc-config: |
|
|
||||||
{{- with secret "kv/data/atlas/maintenance/soteria-oidc" -}}
|
|
||||||
client_id = "{{ .Data.data.client_id }}"
|
|
||||||
client_secret = "{{ .Data.data.client_secret }}"
|
|
||||||
cookie_secret = "{{ .Data.data.cookie_secret }}"
|
|
||||||
{{- end -}}
|
|
||||||
spec:
|
|
||||||
serviceAccountName: maintenance-vault-sync
|
|
||||||
nodeSelector:
|
|
||||||
node-role.kubernetes.io/worker: "true"
|
|
||||||
affinity:
|
|
||||||
nodeAffinity:
|
|
||||||
requiredDuringSchedulingIgnoredDuringExecution:
|
|
||||||
nodeSelectorTerms:
|
|
||||||
- matchExpressions:
|
|
||||||
- key: kubernetes.io/arch
|
|
||||||
operator: In
|
|
||||||
values: ["amd64","arm64"]
|
|
||||||
preferredDuringSchedulingIgnoredDuringExecution:
|
|
||||||
- weight: 100
|
|
||||||
preference:
|
|
||||||
matchExpressions:
|
|
||||||
- key: hardware
|
|
||||||
operator: In
|
|
||||||
values: ["rpi5"]
|
|
||||||
- weight: 100
|
|
||||||
preference:
|
|
||||||
matchExpressions:
|
|
||||||
- key: kubernetes.io/hostname
|
|
||||||
operator: NotIn
|
|
||||||
values: ["titan-13","titan-15","titan-17","titan-19"]
|
|
||||||
containers:
|
|
||||||
- name: oauth2-proxy
|
|
||||||
image: quay.io/oauth2-proxy/oauth2-proxy:v7.6.0
|
|
||||||
imagePullPolicy: IfNotPresent
|
|
||||||
args:
|
|
||||||
- --provider=oidc
|
|
||||||
- --config=/vault/secrets/oidc-config
|
|
||||||
- --redirect-url=https://backup.bstein.dev/oauth2/callback
|
|
||||||
- --oidc-issuer-url=https://sso.bstein.dev/realms/atlas
|
|
||||||
- --scope=openid profile email groups
|
|
||||||
- --email-domain=*
|
|
||||||
- --allowed-group=admin
|
|
||||||
- --allowed-group=/admin
|
|
||||||
- --allowed-group=maintenance
|
|
||||||
- --allowed-group=/maintenance
|
|
||||||
- --set-xauthrequest=true
|
|
||||||
- --pass-user-headers=true
|
|
||||||
- --cookie-secure=true
|
|
||||||
- --cookie-samesite=lax
|
|
||||||
- --cookie-refresh=20m
|
|
||||||
- --cookie-expire=168h
|
|
||||||
- --insecure-oidc-allow-unverified-email=true
|
|
||||||
- --upstream=http://soteria.maintenance.svc.cluster.local
|
|
||||||
- --http-address=0.0.0.0:4180
|
|
||||||
- --skip-provider-button=true
|
|
||||||
- --approval-prompt=auto
|
|
||||||
- --skip-jwt-bearer-tokens=true
|
|
||||||
- --oidc-groups-claim=groups
|
|
||||||
- --cookie-domain=backup.bstein.dev
|
|
||||||
ports:
|
|
||||||
- containerPort: 4180
|
|
||||||
name: http
|
|
||||||
readinessProbe:
|
|
||||||
httpGet:
|
|
||||||
path: /ping
|
|
||||||
port: 4180
|
|
||||||
initialDelaySeconds: 5
|
|
||||||
periodSeconds: 10
|
|
||||||
livenessProbe:
|
|
||||||
httpGet:
|
|
||||||
path: /ping
|
|
||||||
port: 4180
|
|
||||||
initialDelaySeconds: 20
|
|
||||||
periodSeconds: 20
|
|
||||||
resources:
|
|
||||||
requests:
|
|
||||||
cpu: 25m
|
|
||||||
memory: 64Mi
|
|
||||||
limits:
|
|
||||||
cpu: 250m
|
|
||||||
memory: 256Mi
|
|
||||||
@ -1,13 +0,0 @@
|
|||||||
# services/maintenance/soteria-certificate.yaml
|
|
||||||
apiVersion: cert-manager.io/v1
|
|
||||||
kind: Certificate
|
|
||||||
metadata:
|
|
||||||
name: backup-tls
|
|
||||||
namespace: maintenance
|
|
||||||
spec:
|
|
||||||
secretName: backup-tls
|
|
||||||
issuerRef:
|
|
||||||
kind: ClusterIssuer
|
|
||||||
name: letsencrypt
|
|
||||||
dnsNames:
|
|
||||||
- backup.bstein.dev
|
|
||||||
@ -1,14 +0,0 @@
|
|||||||
# services/maintenance/soteria-configmap.yaml
|
|
||||||
apiVersion: v1
|
|
||||||
kind: ConfigMap
|
|
||||||
metadata:
|
|
||||||
name: soteria
|
|
||||||
namespace: maintenance
|
|
||||||
data:
|
|
||||||
SOTERIA_BACKUP_DRIVER: longhorn
|
|
||||||
SOTERIA_LONGHORN_URL: http://longhorn-backend.longhorn-system.svc:9500
|
|
||||||
SOTERIA_LONGHORN_BACKUP_MODE: incremental
|
|
||||||
SOTERIA_AUTH_REQUIRED: "true"
|
|
||||||
SOTERIA_ALLOWED_GROUPS: admin,maintenance
|
|
||||||
SOTERIA_BACKUP_MAX_AGE_HOURS: "24"
|
|
||||||
SOTERIA_METRICS_REFRESH_SECONDS: "300"
|
|
||||||
@ -1,76 +0,0 @@
|
|||||||
# services/maintenance/soteria-deployment.yaml
|
|
||||||
apiVersion: apps/v1
|
|
||||||
kind: Deployment
|
|
||||||
metadata:
|
|
||||||
name: soteria
|
|
||||||
namespace: maintenance
|
|
||||||
labels:
|
|
||||||
app: soteria
|
|
||||||
spec:
|
|
||||||
replicas: 1
|
|
||||||
revisionHistoryLimit: 3
|
|
||||||
selector:
|
|
||||||
matchLabels:
|
|
||||||
app: soteria
|
|
||||||
template:
|
|
||||||
metadata:
|
|
||||||
labels:
|
|
||||||
app: soteria
|
|
||||||
spec:
|
|
||||||
serviceAccountName: soteria
|
|
||||||
nodeSelector:
|
|
||||||
kubernetes.io/arch: arm64
|
|
||||||
node-role.kubernetes.io/worker: "true"
|
|
||||||
affinity:
|
|
||||||
nodeAffinity:
|
|
||||||
preferredDuringSchedulingIgnoredDuringExecution:
|
|
||||||
- weight: 90
|
|
||||||
preference:
|
|
||||||
matchExpressions:
|
|
||||||
- key: hardware
|
|
||||||
operator: In
|
|
||||||
values: ["rpi5"]
|
|
||||||
- weight: 50
|
|
||||||
preference:
|
|
||||||
matchExpressions:
|
|
||||||
- key: hardware
|
|
||||||
operator: In
|
|
||||||
values: ["rpi4"]
|
|
||||||
containers:
|
|
||||||
- name: soteria
|
|
||||||
image: registry.bstein.dev/bstein/soteria:0.1.0-21
|
|
||||||
imagePullPolicy: Always
|
|
||||||
envFrom:
|
|
||||||
- configMapRef:
|
|
||||||
name: soteria
|
|
||||||
ports:
|
|
||||||
- name: http
|
|
||||||
containerPort: 8080
|
|
||||||
livenessProbe:
|
|
||||||
httpGet:
|
|
||||||
path: /healthz
|
|
||||||
port: http
|
|
||||||
initialDelaySeconds: 5
|
|
||||||
periodSeconds: 10
|
|
||||||
timeoutSeconds: 2
|
|
||||||
readinessProbe:
|
|
||||||
httpGet:
|
|
||||||
path: /readyz
|
|
||||||
port: http
|
|
||||||
initialDelaySeconds: 2
|
|
||||||
periodSeconds: 5
|
|
||||||
timeoutSeconds: 2
|
|
||||||
resources:
|
|
||||||
requests:
|
|
||||||
cpu: 50m
|
|
||||||
memory: 64Mi
|
|
||||||
limits:
|
|
||||||
cpu: 200m
|
|
||||||
memory: 256Mi
|
|
||||||
securityContext:
|
|
||||||
allowPrivilegeEscalation: false
|
|
||||||
capabilities:
|
|
||||||
drop: ["ALL"]
|
|
||||||
runAsNonRoot: true
|
|
||||||
runAsUser: 65532
|
|
||||||
|
|
||||||
@ -1,27 +0,0 @@
|
|||||||
# services/maintenance/soteria-ingress.yaml
|
|
||||||
apiVersion: networking.k8s.io/v1
|
|
||||||
kind: Ingress
|
|
||||||
metadata:
|
|
||||||
name: soteria
|
|
||||||
namespace: maintenance
|
|
||||||
annotations:
|
|
||||||
kubernetes.io/ingress.class: traefik
|
|
||||||
traefik.ingress.kubernetes.io/router.entrypoints: websecure
|
|
||||||
traefik.ingress.kubernetes.io/router.tls: "true"
|
|
||||||
traefik.ingress.kubernetes.io/router.middlewares: ""
|
|
||||||
spec:
|
|
||||||
ingressClassName: traefik
|
|
||||||
tls:
|
|
||||||
- hosts: ["backup.bstein.dev"]
|
|
||||||
secretName: backup-tls
|
|
||||||
rules:
|
|
||||||
- host: backup.bstein.dev
|
|
||||||
http:
|
|
||||||
paths:
|
|
||||||
- path: /
|
|
||||||
pathType: Prefix
|
|
||||||
backend:
|
|
||||||
service:
|
|
||||||
name: oauth2-proxy-soteria
|
|
||||||
port:
|
|
||||||
number: 80
|
|
||||||
@ -1,27 +0,0 @@
|
|||||||
# services/maintenance/soteria-networkpolicy.yaml
|
|
||||||
apiVersion: networking.k8s.io/v1
|
|
||||||
kind: NetworkPolicy
|
|
||||||
metadata:
|
|
||||||
name: soteria-ingress
|
|
||||||
namespace: maintenance
|
|
||||||
spec:
|
|
||||||
podSelector:
|
|
||||||
matchLabels:
|
|
||||||
app: soteria
|
|
||||||
policyTypes:
|
|
||||||
- Ingress
|
|
||||||
ingress:
|
|
||||||
- from:
|
|
||||||
- podSelector:
|
|
||||||
matchLabels:
|
|
||||||
app: oauth2-proxy-soteria
|
|
||||||
ports:
|
|
||||||
- protocol: TCP
|
|
||||||
port: 8080
|
|
||||||
- from:
|
|
||||||
- namespaceSelector:
|
|
||||||
matchLabels:
|
|
||||||
kubernetes.io/metadata.name: monitoring
|
|
||||||
ports:
|
|
||||||
- protocol: TCP
|
|
||||||
port: 8080
|
|
||||||
@ -1,29 +0,0 @@
|
|||||||
# services/maintenance/soteria-rbac.yaml
|
|
||||||
apiVersion: rbac.authorization.k8s.io/v1
|
|
||||||
kind: ClusterRole
|
|
||||||
metadata:
|
|
||||||
name: soteria
|
|
||||||
rules:
|
|
||||||
- apiGroups: [""]
|
|
||||||
resources: ["persistentvolumeclaims", "persistentvolumes"]
|
|
||||||
verbs: ["get", "list"]
|
|
||||||
- apiGroups: [""]
|
|
||||||
resources: ["secrets"]
|
|
||||||
verbs: ["get", "list", "create", "update", "delete"]
|
|
||||||
- apiGroups: ["batch"]
|
|
||||||
resources: ["jobs"]
|
|
||||||
verbs: ["get", "list", "create", "delete"]
|
|
||||||
---
|
|
||||||
apiVersion: rbac.authorization.k8s.io/v1
|
|
||||||
kind: ClusterRoleBinding
|
|
||||||
metadata:
|
|
||||||
name: soteria
|
|
||||||
roleRef:
|
|
||||||
apiGroup: rbac.authorization.k8s.io
|
|
||||||
kind: ClusterRole
|
|
||||||
name: soteria
|
|
||||||
subjects:
|
|
||||||
- kind: ServiceAccount
|
|
||||||
name: soteria
|
|
||||||
namespace: maintenance
|
|
||||||
|
|
||||||
@ -1,21 +0,0 @@
|
|||||||
# services/maintenance/soteria-service.yaml
|
|
||||||
apiVersion: v1
|
|
||||||
kind: Service
|
|
||||||
metadata:
|
|
||||||
name: soteria
|
|
||||||
namespace: maintenance
|
|
||||||
labels:
|
|
||||||
app: soteria
|
|
||||||
annotations:
|
|
||||||
prometheus.io/scrape: "true"
|
|
||||||
prometheus.io/port: "80"
|
|
||||||
prometheus.io/path: "/metrics"
|
|
||||||
spec:
|
|
||||||
type: ClusterIP
|
|
||||||
selector:
|
|
||||||
app: soteria
|
|
||||||
ports:
|
|
||||||
- name: http
|
|
||||||
port: 80
|
|
||||||
targetPort: http
|
|
||||||
|
|
||||||
@ -1,9 +0,0 @@
|
|||||||
# services/maintenance/soteria-serviceaccount.yaml
|
|
||||||
apiVersion: v1
|
|
||||||
kind: ServiceAccount
|
|
||||||
metadata:
|
|
||||||
name: soteria
|
|
||||||
namespace: maintenance
|
|
||||||
imagePullSecrets:
|
|
||||||
- name: harbor-regcred
|
|
||||||
|
|
||||||
@ -1970,7 +1970,7 @@
|
|||||||
},
|
},
|
||||||
"targets": [
|
"targets": [
|
||||||
{
|
{
|
||||||
"expr": "sort_desc(max by (namespace, pvc) (pvc_backup_age_hours or on(namespace, pvc) ((1 - pvc_backup_health) * 999)))",
|
"expr": "sort_desc(max by (namespace, pvc) (pvc_backup_age_hours))",
|
||||||
"refId": "A",
|
"refId": "A",
|
||||||
"legendFormat": "{{namespace}}/{{pvc}}",
|
"legendFormat": "{{namespace}}/{{pvc}}",
|
||||||
"instant": true
|
"instant": true
|
||||||
@ -2034,7 +2034,7 @@
|
|||||||
"targetBlank": true
|
"targetBlank": true
|
||||||
}
|
}
|
||||||
],
|
],
|
||||||
"description": "Oldest successful backup age in hours by PVC. PVCs with missing or unhealthy backup state are forced to 999h so the red bars stay visible in the overview."
|
"description": "Oldest backup age in hours by PVC. This panel is reserved for the upcoming PVC backup health feed and will show no data until those metrics are published."
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"id": 30,
|
"id": 30,
|
||||||
|
|||||||
@ -447,150 +447,6 @@ data:
|
|||||||
summary: "Legacy cronjob alert disabled"
|
summary: "Legacy cronjob alert disabled"
|
||||||
labels:
|
labels:
|
||||||
severity: info
|
severity: info
|
||||||
- uid: maint-soteria-refresh-stale
|
|
||||||
title: "Soteria inventory refresh stale (>15m)"
|
|
||||||
condition: C
|
|
||||||
for: "15m"
|
|
||||||
data:
|
|
||||||
- refId: A
|
|
||||||
relativeTimeRange:
|
|
||||||
from: 900
|
|
||||||
to: 0
|
|
||||||
datasourceUid: atlas-vm
|
|
||||||
model:
|
|
||||||
expr: time() - soteria_inventory_refresh_timestamp_seconds
|
|
||||||
intervalMs: 60000
|
|
||||||
maxDataPoints: 43200
|
|
||||||
legendFormat: soteria-refresh-age-seconds
|
|
||||||
datasource:
|
|
||||||
type: prometheus
|
|
||||||
uid: atlas-vm
|
|
||||||
- refId: B
|
|
||||||
datasourceUid: __expr__
|
|
||||||
model:
|
|
||||||
expression: A
|
|
||||||
intervalMs: 60000
|
|
||||||
maxDataPoints: 43200
|
|
||||||
reducer: last
|
|
||||||
type: reduce
|
|
||||||
- refId: C
|
|
||||||
datasourceUid: __expr__
|
|
||||||
model:
|
|
||||||
expression: B
|
|
||||||
intervalMs: 60000
|
|
||||||
maxDataPoints: 43200
|
|
||||||
type: threshold
|
|
||||||
conditions:
|
|
||||||
- evaluator:
|
|
||||||
params: [900]
|
|
||||||
type: gt
|
|
||||||
operator:
|
|
||||||
type: and
|
|
||||||
reducer:
|
|
||||||
type: last
|
|
||||||
type: query
|
|
||||||
noDataState: Alerting
|
|
||||||
execErrState: Alerting
|
|
||||||
annotations:
|
|
||||||
summary: "Soteria inventory telemetry has not refreshed in >15m"
|
|
||||||
labels:
|
|
||||||
severity: warning
|
|
||||||
- uid: maint-soteria-backup-unhealthy
|
|
||||||
title: "Soteria reports unhealthy PVC backups"
|
|
||||||
condition: C
|
|
||||||
for: "10m"
|
|
||||||
data:
|
|
||||||
- refId: A
|
|
||||||
relativeTimeRange:
|
|
||||||
from: 600
|
|
||||||
to: 0
|
|
||||||
datasourceUid: atlas-vm
|
|
||||||
model:
|
|
||||||
expr: sum((1 - pvc_backup_health{driver="longhorn"}) > bool 0) or on() vector(0)
|
|
||||||
intervalMs: 60000
|
|
||||||
maxDataPoints: 43200
|
|
||||||
legendFormat: unhealthy-pvcs
|
|
||||||
datasource:
|
|
||||||
type: prometheus
|
|
||||||
uid: atlas-vm
|
|
||||||
- refId: B
|
|
||||||
datasourceUid: __expr__
|
|
||||||
model:
|
|
||||||
expression: A
|
|
||||||
intervalMs: 60000
|
|
||||||
maxDataPoints: 43200
|
|
||||||
reducer: last
|
|
||||||
type: reduce
|
|
||||||
- refId: C
|
|
||||||
datasourceUid: __expr__
|
|
||||||
model:
|
|
||||||
expression: B
|
|
||||||
intervalMs: 60000
|
|
||||||
maxDataPoints: 43200
|
|
||||||
type: threshold
|
|
||||||
conditions:
|
|
||||||
- evaluator:
|
|
||||||
params: [0]
|
|
||||||
type: gt
|
|
||||||
operator:
|
|
||||||
type: and
|
|
||||||
reducer:
|
|
||||||
type: last
|
|
||||||
type: query
|
|
||||||
noDataState: OK
|
|
||||||
execErrState: Alerting
|
|
||||||
annotations:
|
|
||||||
summary: "One or more PVCs are stale, missing, or failed per Soteria backup health"
|
|
||||||
labels:
|
|
||||||
severity: warning
|
|
||||||
- uid: maint-soteria-authz-denials
|
|
||||||
title: "Soteria authorization denials elevated"
|
|
||||||
condition: C
|
|
||||||
for: "10m"
|
|
||||||
data:
|
|
||||||
- refId: A
|
|
||||||
relativeTimeRange:
|
|
||||||
from: 900
|
|
||||||
to: 0
|
|
||||||
datasourceUid: atlas-vm
|
|
||||||
model:
|
|
||||||
expr: sum(increase(soteria_authz_denials_total[15m])) or on() vector(0)
|
|
||||||
intervalMs: 60000
|
|
||||||
maxDataPoints: 43200
|
|
||||||
legendFormat: soteria-authz-denials-15m
|
|
||||||
datasource:
|
|
||||||
type: prometheus
|
|
||||||
uid: atlas-vm
|
|
||||||
- refId: B
|
|
||||||
datasourceUid: __expr__
|
|
||||||
model:
|
|
||||||
expression: A
|
|
||||||
intervalMs: 60000
|
|
||||||
maxDataPoints: 43200
|
|
||||||
reducer: last
|
|
||||||
type: reduce
|
|
||||||
- refId: C
|
|
||||||
datasourceUid: __expr__
|
|
||||||
model:
|
|
||||||
expression: B
|
|
||||||
intervalMs: 60000
|
|
||||||
maxDataPoints: 43200
|
|
||||||
type: threshold
|
|
||||||
conditions:
|
|
||||||
- evaluator:
|
|
||||||
params: [10]
|
|
||||||
type: gt
|
|
||||||
operator:
|
|
||||||
type: and
|
|
||||||
reducer:
|
|
||||||
type: last
|
|
||||||
type: query
|
|
||||||
noDataState: OK
|
|
||||||
execErrState: Alerting
|
|
||||||
annotations:
|
|
||||||
summary: "Soteria saw >10 authorization denials in 15m"
|
|
||||||
labels:
|
|
||||||
severity: warning
|
|
||||||
- orgId: 1
|
- orgId: 1
|
||||||
name: ariadne
|
name: ariadne
|
||||||
folder: Alerts
|
folder: Alerts
|
||||||
|
|||||||
@ -1979,7 +1979,7 @@ data:
|
|||||||
},
|
},
|
||||||
"targets": [
|
"targets": [
|
||||||
{
|
{
|
||||||
"expr": "sort_desc(max by (namespace, pvc) (pvc_backup_age_hours or on(namespace, pvc) ((1 - pvc_backup_health) * 999)))",
|
"expr": "sort_desc(max by (namespace, pvc) (pvc_backup_age_hours))",
|
||||||
"refId": "A",
|
"refId": "A",
|
||||||
"legendFormat": "{{namespace}}/{{pvc}}",
|
"legendFormat": "{{namespace}}/{{pvc}}",
|
||||||
"instant": true
|
"instant": true
|
||||||
@ -2043,7 +2043,7 @@ data:
|
|||||||
"targetBlank": true
|
"targetBlank": true
|
||||||
}
|
}
|
||||||
],
|
],
|
||||||
"description": "Oldest successful backup age in hours by PVC. PVCs with missing or unhealthy backup state are forced to 999h so the red bars stay visible in the overview."
|
"description": "Oldest backup age in hours by PVC. This panel is reserved for the upcoming PVC backup health feed and will show no data until those metrics are published."
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"id": 30,
|
"id": 30,
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user