diff --git a/scripts/dashboards_render_atlas.py b/scripts/dashboards_render_atlas.py
index 1f1db361..ca99bf98 100644
--- a/scripts/dashboards_render_atlas.py
+++ b/scripts/dashboards_render_atlas.py
@@ -475,7 +475,9 @@ PLATFORM_TEST_SUCCESS_RATE_24H_BY_SUITE = (
     f'/ clamp_min((sum by (suite) (increase(platform_quality_gate_runs_total{{suite=~"{PLATFORM_TEST_SUITE_MATCHER}"}}[24h]))), 1)) '
     f'and on(suite) ((sum by (suite) (increase(platform_quality_gate_runs_total{{suite=~"{PLATFORM_TEST_SUITE_MATCHER}"}}[24h]))) > 0))'
 )
-PVC_BACKUP_AGE_HOURS_BY_PVC = "sort_desc(max by (namespace, pvc) (pvc_backup_age_hours))"
+# PVCs with no age sample fall back to (1 - pvc_backup_health) * 999: healthy
+# PVCs chart 0h, unhealthy ones pin at 999h so they stay visible on the panel.
+PVC_BACKUP_AGE_HOURS_BY_PVC = "sort_desc(max by (namespace, pvc) (pvc_backup_age_hours or on(namespace, pvc) ((1 - pvc_backup_health) * 999)))"
 ANANKE_SELECTOR = 'job="ananke-power"'
 ANANKE_UPS_DB_NAME = "Pyrphoros"
 ANANKE_UPS_DB_NODE = "titan-db"
@@ -1566,7 +1566,7 @@ def build_overview():
     )
     panels[-1]["links"] = link_to("atlas-storage")
     panels[-1]["description"] = (
-        "Oldest backup age in hours by PVC. This panel is reserved for the upcoming PVC backup health feed and will show no data until those metrics are published."
+        "Oldest successful backup age in hours by PVC. PVCs with missing or unhealthy backup state are forced to 999h so the red bars stay visible in the overview."
     )
 
     panels.append(
diff --git a/services/keycloak/kustomization.yaml b/services/keycloak/kustomization.yaml
index c532a6fa..07b372bd 100644
--- a/services/keycloak/kustomization.yaml
+++ b/services/keycloak/kustomization.yaml
@@ -23,6 +23,7 @@ resources:
   - oneoffs/synapse-oidc-secret-ensure-job.yaml
   - oneoffs/logs-oidc-secret-ensure-job.yaml
   - oneoffs/metis-oidc-secret-ensure-job.yaml
+  - oneoffs/soteria-oidc-secret-ensure-job.yaml
   - oneoffs/metis-ssh-keys-secret-ensure-job.yaml
   - oneoffs/harbor-oidc-secret-ensure-job.yaml
   - oneoffs/vault-oidc-secret-ensure-job.yaml
diff --git a/services/keycloak/oneoffs/soteria-oidc-secret-ensure-job.yaml b/services/keycloak/oneoffs/soteria-oidc-secret-ensure-job.yaml
new file mode 100644
index 00000000..d0616e29
--- /dev/null
+++ b/services/keycloak/oneoffs/soteria-oidc-secret-ensure-job.yaml
@@ -0,0 +1,209 @@
+# services/keycloak/oneoffs/soteria-oidc-secret-ensure-job.yaml
+# One-off job for sso/soteria-oidc-secret-ensure-1.
+# Purpose: ensure the Soteria oauth2-proxy OIDC client and Vault secret exist.
+# Keep this completed Job around; bump the suffix if it ever needs to be rerun.
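+# Flow: fetch a Keycloak admin token, ensure the "soteria" client exists in the
+# atlas realm with the groups scope attached, then mirror the client secret and
+# an oauth2-proxy cookie secret into Vault at kv/atlas/maintenance/soteria-oidc.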
+apiVersion: batch/v1
+kind: Job
+metadata:
+  name: soteria-oidc-secret-ensure-1
+  namespace: sso
+spec:
+  backoffLimit: 0
+  template:
+    metadata:
+      annotations:
+        vault.hashicorp.com/agent-inject: "true"
+        vault.hashicorp.com/agent-pre-populate-only: "true"
+        vault.hashicorp.com/role: "sso-secrets"
+        vault.hashicorp.com/agent-inject-secret-keycloak-admin-env.sh: "kv/data/atlas/shared/keycloak-admin"
+        vault.hashicorp.com/agent-inject-template-keycloak-admin-env.sh: |
+          {{ with secret "kv/data/atlas/shared/keycloak-admin" }}
+          export KEYCLOAK_ADMIN="{{ .Data.data.username }}"
+          export KEYCLOAK_ADMIN_USER="{{ .Data.data.username }}"
+          export KEYCLOAK_ADMIN_PASSWORD="{{ .Data.data.password }}"
+          {{ end }}
+    spec:
+      serviceAccountName: mas-secrets-ensure
+      restartPolicy: Never
+      affinity:
+        nodeAffinity:
+          requiredDuringSchedulingIgnoredDuringExecution:
+            nodeSelectorTerms:
+              - matchExpressions:
+                  - key: node-role.kubernetes.io/worker
+                    operator: Exists
+          preferredDuringSchedulingIgnoredDuringExecution:
+            - weight: 100
+              preference:
+                matchExpressions:
+                  - key: kubernetes.io/arch
+                    operator: In
+                    values: ["arm64"]
+      containers:
+        - name: apply
+          image: bitnami/kubectl@sha256:554ab88b1858e8424c55de37ad417b16f2a0e65d1607aa0f3fe3ce9b9f10b131
+          command: ["/bin/sh", "-c"]
+          args:
+            - |
+              set -euo pipefail
+              . /vault/secrets/keycloak-admin-env.sh
+              KC_URL="http://keycloak.sso.svc.cluster.local"
+              ACCESS_TOKEN=""
+              # Fetch an admin token from the master realm, retrying while Keycloak settles.
+              for attempt in 1 2 3 4 5; do
+                TOKEN_JSON="$(curl -sS -X POST "$KC_URL/realms/master/protocol/openid-connect/token" \
+                  -H 'Content-Type: application/x-www-form-urlencoded' \
+                  -d "grant_type=password" \
+                  -d "client_id=admin-cli" \
+                  -d "username=${KEYCLOAK_ADMIN}" \
+                  -d "password=${KEYCLOAK_ADMIN_PASSWORD}" || true)"
+                ACCESS_TOKEN="$(echo "$TOKEN_JSON" | jq -r '.access_token' 2>/dev/null || true)"
+                if [ -n "$ACCESS_TOKEN" ] && [ "$ACCESS_TOKEN" != "null" ]; then
+                  break
+                fi
+                echo "Keycloak token request failed (attempt ${attempt})" >&2
+                sleep $((attempt * 2))
+              done
+              if [ -z "$ACCESS_TOKEN" ] || [ "$ACCESS_TOKEN" = "null" ]; then
+                echo "Failed to fetch Keycloak admin token" >&2
+                exit 1
+              fi
+
+              # Ensure the soteria client exists in the atlas realm.
+              CLIENT_QUERY="$(curl -sS -H "Authorization: Bearer ${ACCESS_TOKEN}" \
+                "$KC_URL/admin/realms/atlas/clients?clientId=soteria" || true)"
+              CLIENT_ID="$(echo "$CLIENT_QUERY" | jq -r '.[0].id' 2>/dev/null || true)"
+
+              if [ -z "$CLIENT_ID" ] || [ "$CLIENT_ID" = "null" ]; then
+                create_payload='{"clientId":"soteria","enabled":true,"protocol":"openid-connect","publicClient":false,"standardFlowEnabled":true,"implicitFlowEnabled":false,"directAccessGrantsEnabled":false,"serviceAccountsEnabled":false,"redirectUris":["https://backup.bstein.dev/oauth2/callback"],"webOrigins":["https://backup.bstein.dev"],"rootUrl":"https://backup.bstein.dev","baseUrl":"/"}'
+                status="$(curl -sS -o /dev/null -w "%{http_code}" -X POST \
+                  -H "Authorization: Bearer ${ACCESS_TOKEN}" \
+                  -H 'Content-Type: application/json' \
+                  -d "${create_payload}" \
+                  "$KC_URL/admin/realms/atlas/clients")"
+                if [ "$status" != "201" ] && [ "$status" != "204" ] && [ "$status" != "409" ]; then
+                  echo "Keycloak client create failed (status ${status})" >&2
+                  exit 1
+                fi
+                CLIENT_QUERY="$(curl -sS -H "Authorization: Bearer ${ACCESS_TOKEN}" \
+                  "$KC_URL/admin/realms/atlas/clients?clientId=soteria" || true)"
+                CLIENT_ID="$(echo "$CLIENT_QUERY" | jq -r '.[0].id' 2>/dev/null || true)"
+              fi
+
+              if [ -z "$CLIENT_ID" ] || [ "$CLIENT_ID" = "null" ]; then
+                echo "Keycloak client soteria not found" >&2
+                exit 1
+              fi
+
+              # Attach the shared "groups" client scope unless it is already assigned.
+              SCOPE_ID="$(curl -sS -H "Authorization: Bearer ${ACCESS_TOKEN}" \
+                "$KC_URL/admin/realms/atlas/client-scopes?search=groups" | jq -r '.[] | select(.name=="groups") | .id' 2>/dev/null | head -n1 || true)"
+              if [ -z "$SCOPE_ID" ] || [ "$SCOPE_ID" = "null" ]; then
+                echo "Keycloak client scope groups not found" >&2
+                exit 1
+              fi
+
+              DEFAULT_SCOPES="$(curl -sS -H "Authorization: Bearer ${ACCESS_TOKEN}" \
+                "$KC_URL/admin/realms/atlas/clients/${CLIENT_ID}/default-client-scopes" || true)"
+              OPTIONAL_SCOPES="$(curl -sS -H "Authorization: Bearer ${ACCESS_TOKEN}" \
+                "$KC_URL/admin/realms/atlas/clients/${CLIENT_ID}/optional-client-scopes" || true)"
+
+              if ! echo "$DEFAULT_SCOPES" | jq -e '.[] | select(.name=="groups")' >/dev/null 2>&1 \
+                && ! echo "$OPTIONAL_SCOPES" | jq -e '.[] | select(.name=="groups")' >/dev/null 2>&1; then
+                status="$(curl -sS -o /dev/null -w "%{http_code}" -X PUT \
+                  -H "Authorization: Bearer ${ACCESS_TOKEN}" \
+                  "$KC_URL/admin/realms/atlas/clients/${CLIENT_ID}/optional-client-scopes/${SCOPE_ID}")"
+                if [ "$status" != "200" ] && [ "$status" != "201" ] && [ "$status" != "204" ]; then
+                  status="$(curl -sS -o /dev/null -w "%{http_code}" -X POST \
+                    -H "Authorization: Bearer ${ACCESS_TOKEN}" \
+                    "$KC_URL/admin/realms/atlas/clients/${CLIENT_ID}/optional-client-scopes/${SCOPE_ID}")"
+                  if [ "$status" != "200" ] && [ "$status" != "201" ] && [ "$status" != "204" ]; then
+                    echo "Failed to attach groups client scope to soteria (status ${status})" >&2
+                    exit 1
+                  fi
+                fi
+              fi
+
+              # Re-assert the client config so reruns converge on the desired settings.
+              update_payload='{"enabled":true,"clientId":"soteria","protocol":"openid-connect","publicClient":false,"standardFlowEnabled":true,"implicitFlowEnabled":false,"directAccessGrantsEnabled":false,"serviceAccountsEnabled":false,"redirectUris":["https://backup.bstein.dev/oauth2/callback"],"webOrigins":["https://backup.bstein.dev"],"rootUrl":"https://backup.bstein.dev","baseUrl":"/"}'
+              status="$(curl -sS -o /dev/null -w "%{http_code}" -X PUT \
+                -H "Authorization: Bearer ${ACCESS_TOKEN}" \
+                -H 'Content-Type: application/json' \
+                -d "${update_payload}" \
+                "$KC_URL/admin/realms/atlas/clients/${CLIENT_ID}")"
+              if [ "$status" != "204" ]; then
+                echo "Keycloak client update failed (status ${status})" >&2
+                exit 1
+              fi
+
+              CLIENT_SECRET="$(curl -sS -H "Authorization: Bearer ${ACCESS_TOKEN}" \
+                "$KC_URL/admin/realms/atlas/clients/${CLIENT_ID}/client-secret" | jq -r '.value' 2>/dev/null || true)"
+              if [ -z "$CLIENT_SECRET" ] || [ "$CLIENT_SECRET" = "null" ]; then
+                echo "Keycloak client secret not found" >&2
+                exit 1
+              fi
+
+              # Log in to Vault via Kubernetes auth using the pod service-account token.
+              vault_addr="${VAULT_ADDR:-http://vault.vault.svc.cluster.local:8200}"
+              vault_role="${VAULT_ROLE:-sso-secrets}"
+              jwt="$(cat /var/run/secrets/kubernetes.io/serviceaccount/token)"
+              login_payload="$(jq -nc --arg jwt "${jwt}" --arg role "${vault_role}" '{jwt:$jwt, role:$role}')"
+              vault_token="$(curl -sS --request POST --data "${login_payload}" \
+                "${vault_addr}/v1/auth/kubernetes/login" | jq -r '.auth.client_token')"
+              if [ -z "${vault_token}" ] || [ "${vault_token}" = "null" ]; then
+                echo "vault login failed" >&2
+                exit 1
+              fi
+
+              # Reuse a previously stored cookie secret so live oauth2-proxy sessions survive reruns.
+              read_status="$(curl -sS -o /tmp/soteria-oidc-read.json -w "%{http_code}" \
+                -H "X-Vault-Token: ${vault_token}" \
+                "${vault_addr}/v1/kv/data/atlas/maintenance/soteria-oidc" || true)"
+              COOKIE_SECRET=""
+              if [ "${read_status}" = "200" ]; then
+                COOKIE_SECRET="$(jq -r '.data.data.cookie_secret // empty' /tmp/soteria-oidc-read.json)"
+              elif [ "${read_status}" != "404" ]; then
+                echo "Vault read failed (status ${read_status})" >&2
+                cat /tmp/soteria-oidc-read.json >&2 || true
+                exit 1
+              fi
+              # oauth2-proxy only accepts cookie secrets of 16, 24, or 32 bytes; regenerate otherwise.
+              if [ -n "${COOKIE_SECRET}" ]; then
+                length="$(printf '%s' "${COOKIE_SECRET}" | wc -c | tr -d ' ')"
+                if [ "${length}" != "16" ] && [ "${length}" != "24" ] && [ "${length}" != "32" ]; then
+                  COOKIE_SECRET=""
+                fi
+              fi
+              if [ -z "${COOKIE_SECRET}" ]; then
+                COOKIE_SECRET="$(openssl rand -hex 16 | tr -d '\n')"
+              fi
+
+              # Write the secret to KV v2 and verify it reads back.
+              payload="$(jq -nc \
+                --arg client_id "soteria" \
+                --arg client_secret "${CLIENT_SECRET}" \
+                --arg cookie_secret "${COOKIE_SECRET}" \
+                '{data:{client_id:$client_id,client_secret:$client_secret,cookie_secret:$cookie_secret}}')"
+              write_status="$(curl -sS -o /tmp/soteria-oidc-write.json -w "%{http_code}" -X POST \
+                -H "X-Vault-Token: ${vault_token}" \
+                -H 'Content-Type: application/json' \
+                -d "${payload}" "${vault_addr}/v1/kv/data/atlas/maintenance/soteria-oidc")"
+              if [ "${write_status}" != "200" ] && [ "${write_status}" != "204" ]; then
+                echo "Vault write failed (status ${write_status})" >&2
+                cat /tmp/soteria-oidc-write.json >&2 || true
+                exit 1
+              fi
+
+              verify_status="$(curl -sS -o /tmp/soteria-oidc-verify.json -w "%{http_code}" \
+                -H "X-Vault-Token: ${vault_token}" \
+                "${vault_addr}/v1/kv/data/atlas/maintenance/soteria-oidc" || true)"
+              if [ "${verify_status}" != "200" ]; then
+                echo "Vault verify failed (status ${verify_status})" >&2
+                cat /tmp/soteria-oidc-verify.json >&2 || true
+                exit 1
+              fi
+
+              echo "Soteria OIDC secret ready in Vault"
diff --git a/services/maintenance/kustomization.yaml b/services/maintenance/kustomization.yaml
index a7b6d82a..56892aa5 100644
--- a/services/maintenance/kustomization.yaml
+++ b/services/maintenance/kustomization.yaml
@@ -35,6 +35,12 @@ resources:
   - node-image-sweeper-daemonset.yaml
   - image-sweeper-cronjob.yaml
   - metis-service.yaml
+  - soteria-configmap.yaml
+  - soteria-deployment.yaml
+  - soteria-service.yaml
+  - soteria-ingress.yaml
+  - soteria-certificate.yaml
+  - oauth2-proxy-soteria.yaml
   - oauth2-proxy-metis.yaml
   - metis-certificate.yaml
   - metis-ingress.yaml
@@ -43,6 +49,8 @@ images:
     newTag: 0.1.0-22 # {"$imagepolicy": "maintenance:ariadne:tag"}
   - name: registry.bstein.dev/bstein/metis
     newTag: 0.1.0-9-amd64
+  - name: registry.bstein.dev/bstein/soteria
+    newTag: 0.1.0-21 # {"$imagepolicy": "maintenance:soteria:tag"}
 configMapGenerator:
   - name: disable-k3s-traefik-script
     namespace: maintenance
diff --git a/services/maintenance/oauth2-proxy-soteria.yaml b/services/maintenance/oauth2-proxy-soteria.yaml
new file mode 100644
index 00000000..cd6dabe7
--- /dev/null
+++ b/services/maintenance/oauth2-proxy-soteria.yaml
@@ -0,0 +1,123 @@
+# services/maintenance/oauth2-proxy-soteria.yaml
+apiVersion: v1
+kind: Service
+metadata:
+  name: oauth2-proxy-soteria
+  namespace: maintenance
+  labels:
+    app: oauth2-proxy-soteria
+spec:
+  ports:
+    - name: http
+      port: 80
+      targetPort: 4180
+  selector:
+    app: oauth2-proxy-soteria
+
+---
+
+apiVersion: apps/v1
+kind: Deployment
+metadata:
+  name: oauth2-proxy-soteria
+  namespace: maintenance
+  labels:
+    app: oauth2-proxy-soteria
+spec:
+  replicas: 2
+  selector:
+    matchLabels:
+      app: oauth2-proxy-soteria
+  template:
+    metadata:
+      labels:
+        app: oauth2-proxy-soteria
+      annotations:
+        vault.hashicorp.com/agent-inject: "true"
+        vault.hashicorp.com/role: "maintenance"
+        vault.hashicorp.com/agent-inject-secret-oidc-config: "kv/data/atlas/maintenance/soteria-oidc"
+        vault.hashicorp.com/agent-inject-template-oidc-config: |
+          {{- with secret "kv/data/atlas/maintenance/soteria-oidc" -}}
+          client_id = "{{ .Data.data.client_id }}"
+          client_secret = "{{ .Data.data.client_secret }}"
+          cookie_secret = "{{ .Data.data.cookie_secret }}"
+          {{- end -}}
+    spec:
+      serviceAccountName: maintenance-vault-sync
+      # maintenance-vault-sync is assumed to be bound to the Vault "maintenance"
+      # role so the agent sidecar can render the soteria-oidc template above.
+ nodeSelector: + node-role.kubernetes.io/worker: "true" + affinity: + nodeAffinity: + requiredDuringSchedulingIgnoredDuringExecution: + nodeSelectorTerms: + - matchExpressions: + - key: kubernetes.io/arch + operator: In + values: ["amd64","arm64"] + preferredDuringSchedulingIgnoredDuringExecution: + - weight: 100 + preference: + matchExpressions: + - key: hardware + operator: In + values: ["rpi5"] + - weight: 100 + preference: + matchExpressions: + - key: kubernetes.io/hostname + operator: NotIn + values: ["titan-13","titan-15","titan-17","titan-19"] + containers: + - name: oauth2-proxy + image: quay.io/oauth2-proxy/oauth2-proxy:v7.6.0 + imagePullPolicy: IfNotPresent + args: + - --provider=oidc + - --config=/vault/secrets/oidc-config + - --redirect-url=https://backup.bstein.dev/oauth2/callback + - --oidc-issuer-url=https://sso.bstein.dev/realms/atlas + - --scope=openid profile email groups + - --email-domain=* + - --allowed-group=admin + - --allowed-group=/admin + - --allowed-group=maintenance + - --allowed-group=/maintenance + - --set-xauthrequest=true + - --pass-access-token=true + - --set-authorization-header=true + - --cookie-secure=true + - --cookie-samesite=lax + - --cookie-refresh=20m + - --cookie-expire=168h + - --insecure-oidc-allow-unverified-email=true + - --upstream=http://soteria.maintenance.svc.cluster.local + - --http-address=0.0.0.0:4180 + - --skip-provider-button=true + - --approval-prompt=auto + - --skip-jwt-bearer-tokens=true + - --oidc-groups-claim=groups + - --cookie-domain=backup.bstein.dev + ports: + - containerPort: 4180 + name: http + readinessProbe: + httpGet: + path: /ping + port: 4180 + initialDelaySeconds: 5 + periodSeconds: 10 + livenessProbe: + httpGet: + path: /ping + port: 4180 + initialDelaySeconds: 20 + periodSeconds: 20 + resources: + requests: + cpu: 25m + memory: 64Mi + limits: + cpu: 250m + memory: 256Mi diff --git a/services/maintenance/soteria-certificate.yaml b/services/maintenance/soteria-certificate.yaml new file mode 100644 index 00000000..0c328fca --- /dev/null +++ b/services/maintenance/soteria-certificate.yaml @@ -0,0 +1,13 @@ +# services/maintenance/soteria-certificate.yaml +apiVersion: cert-manager.io/v1 +kind: Certificate +metadata: + name: backup-tls + namespace: maintenance +spec: + secretName: backup-tls + issuerRef: + kind: ClusterIssuer + name: letsencrypt + dnsNames: + - backup.bstein.dev diff --git a/services/maintenance/soteria-configmap.yaml b/services/maintenance/soteria-configmap.yaml new file mode 100644 index 00000000..fdf666e9 --- /dev/null +++ b/services/maintenance/soteria-configmap.yaml @@ -0,0 +1,14 @@ +# services/maintenance/soteria-configmap.yaml +apiVersion: v1 +kind: ConfigMap +metadata: + name: soteria + namespace: maintenance +data: + SOTERIA_BACKUP_DRIVER: longhorn + SOTERIA_LONGHORN_URL: http://longhorn-backend.longhorn-system.svc:9500 + SOTERIA_LONGHORN_BACKUP_MODE: incremental + SOTERIA_AUTH_REQUIRED: "true" + SOTERIA_ALLOWED_GROUPS: admin,maintenance + SOTERIA_BACKUP_MAX_AGE_HOURS: "24" + SOTERIA_METRICS_REFRESH_SECONDS: "300" diff --git a/services/maintenance/soteria-deployment.yaml b/services/maintenance/soteria-deployment.yaml new file mode 100644 index 00000000..75fe8fd4 --- /dev/null +++ b/services/maintenance/soteria-deployment.yaml @@ -0,0 +1,76 @@ +# services/maintenance/soteria-deployment.yaml +apiVersion: apps/v1 +kind: Deployment +metadata: + name: soteria + namespace: maintenance + labels: + app: soteria +spec: + replicas: 1 + revisionHistoryLimit: 3 + selector: + matchLabels: + app: soteria + 
template: + metadata: + labels: + app: soteria + spec: + serviceAccountName: soteria + nodeSelector: + kubernetes.io/arch: arm64 + node-role.kubernetes.io/worker: "true" + affinity: + nodeAffinity: + preferredDuringSchedulingIgnoredDuringExecution: + - weight: 90 + preference: + matchExpressions: + - key: hardware + operator: In + values: ["rpi5"] + - weight: 50 + preference: + matchExpressions: + - key: hardware + operator: In + values: ["rpi4"] + containers: + - name: soteria + image: registry.bstein.dev/bstein/soteria:0.1.0-21 + imagePullPolicy: Always + envFrom: + - configMapRef: + name: soteria + ports: + - name: http + containerPort: 8080 + livenessProbe: + httpGet: + path: /healthz + port: http + initialDelaySeconds: 5 + periodSeconds: 10 + timeoutSeconds: 2 + readinessProbe: + httpGet: + path: /readyz + port: http + initialDelaySeconds: 2 + periodSeconds: 5 + timeoutSeconds: 2 + resources: + requests: + cpu: 50m + memory: 64Mi + limits: + cpu: 200m + memory: 256Mi + securityContext: + allowPrivilegeEscalation: false + capabilities: + drop: ["ALL"] + runAsNonRoot: true + runAsUser: 65532 + diff --git a/services/maintenance/soteria-ingress.yaml b/services/maintenance/soteria-ingress.yaml new file mode 100644 index 00000000..ab4b52b9 --- /dev/null +++ b/services/maintenance/soteria-ingress.yaml @@ -0,0 +1,27 @@ +# services/maintenance/soteria-ingress.yaml +apiVersion: networking.k8s.io/v1 +kind: Ingress +metadata: + name: soteria + namespace: maintenance + annotations: + kubernetes.io/ingress.class: traefik + traefik.ingress.kubernetes.io/router.entrypoints: websecure + traefik.ingress.kubernetes.io/router.tls: "true" + traefik.ingress.kubernetes.io/router.middlewares: "" +spec: + ingressClassName: traefik + tls: + - hosts: ["backup.bstein.dev"] + secretName: backup-tls + rules: + - host: backup.bstein.dev + http: + paths: + - path: / + pathType: Prefix + backend: + service: + name: oauth2-proxy-soteria + port: + number: 80 diff --git a/services/maintenance/soteria-service.yaml b/services/maintenance/soteria-service.yaml new file mode 100644 index 00000000..6aef612e --- /dev/null +++ b/services/maintenance/soteria-service.yaml @@ -0,0 +1,21 @@ +# services/maintenance/soteria-service.yaml +apiVersion: v1 +kind: Service +metadata: + name: soteria + namespace: maintenance + labels: + app: soteria + annotations: + prometheus.io/scrape: "true" + prometheus.io/port: "80" + prometheus.io/path: "/metrics" +spec: + type: ClusterIP + selector: + app: soteria + ports: + - name: http + port: 80 + targetPort: http + diff --git a/services/monitoring/dashboards/atlas-overview.json b/services/monitoring/dashboards/atlas-overview.json index edf2dd85..d1557b7d 100644 --- a/services/monitoring/dashboards/atlas-overview.json +++ b/services/monitoring/dashboards/atlas-overview.json @@ -1970,7 +1970,7 @@ }, "targets": [ { - "expr": "sort_desc(max by (namespace, pvc) (pvc_backup_age_hours))", + "expr": "sort_desc(max by (namespace, pvc) (pvc_backup_age_hours or on(namespace, pvc) ((1 - pvc_backup_health) * 999)))", "refId": "A", "legendFormat": "{{namespace}}/{{pvc}}", "instant": true @@ -2034,7 +2034,7 @@ "targetBlank": true } ], - "description": "Oldest backup age in hours by PVC. This panel is reserved for the upcoming PVC backup health feed and will show no data until those metrics are published." + "description": "Oldest successful backup age in hours by PVC. PVCs with missing or unhealthy backup state are forced to 999h so the red bars stay visible in the overview." 
}, { "id": 30, diff --git a/services/monitoring/grafana-dashboard-overview.yaml b/services/monitoring/grafana-dashboard-overview.yaml index 3a3659a5..43844351 100644 --- a/services/monitoring/grafana-dashboard-overview.yaml +++ b/services/monitoring/grafana-dashboard-overview.yaml @@ -1979,7 +1979,7 @@ data: }, "targets": [ { - "expr": "sort_desc(max by (namespace, pvc) (pvc_backup_age_hours))", + "expr": "sort_desc(max by (namespace, pvc) (pvc_backup_age_hours or on(namespace, pvc) ((1 - pvc_backup_health) * 999)))", "refId": "A", "legendFormat": "{{namespace}}/{{pvc}}", "instant": true @@ -2043,7 +2043,7 @@ data: "targetBlank": true } ], - "description": "Oldest backup age in hours by PVC. This panel is reserved for the upcoming PVC backup health feed and will show no data until those metrics are published." + "description": "Oldest successful backup age in hours by PVC. PVCs with missing or unhealthy backup state are forced to 999h so the red bars stay visible in the overview." }, { "id": 30,