monitoring(overview): fix pvc backup health/age panel query

This commit is contained in:
Brad Stein 2026-04-13 05:33:28 -03:00
parent db701b89c2
commit a2172f56ec
3 changed files with 11 additions and 6 deletions

View File

@ -581,7 +581,11 @@ QUALITY_GATE_SMELL_INFRACTIONS_BY_SUITE = (
# Per-suite smell infractions, padded with a placeholder for suites that have
# no infraction series: `or on(suite)` adds, for every `suite` label found in
# QUALITY_GATE_SUITE_INDEX_30D but absent from the left-hand side, a sample
# whose value is `0 * x - 1`, i.e. -1 (PromQL evaluates `*` before `-`).
# NOTE(review): presumably -1 is rendered as "missing" by the panel — confirm
# against the panel's value mappings/thresholds.
QUALITY_GATE_SMELL_INFRACTIONS_BY_SUITE_WITH_MISSING = (
    f"({QUALITY_GATE_SMELL_INFRACTIONS_BY_SUITE})"
    f" or on(suite) (0 * ({QUALITY_GATE_SUITE_INDEX_30D}) - 1)"
)
PVC_BACKUP_AGE_HOURS_BY_PVC = "sort_desc(max by (namespace, pvc) (pvc_backup_age_hours or on(namespace, pvc) ((1 - pvc_backup_health) * 999)))"
# PromQL for the "backup age by PVC" panel, restricted to driver="restic".
#
# Structure of the expression (outermost first):
#   sort_desc(max by (namespace, pvc) (<age> or on(...) <fallback>))
#
# <age>      hours elapsed since the last successful backup, derived from
#            pvc_backup_last_success_timestamp_seconds.
# <fallback> used only for (namespace, pvc, volume, driver) label sets that
#            have no <age> sample. It is the product of
#              - pvc_backup_health_reason > 0 for the listed failure reasons,
#              - pvc_backup_count > bool 0 (evaluates to 0 or 1),
#              - the constant 999,
#            so it yields 999 when the PVC has a non-zero backup count and 0
#            when the count is zero.
#
# NOTE(review): the inner `*` has no on()/ignoring() modifier, so PromQL pairs
# samples only when both sides carry identical label sets. The left operand is
# selected by a `reason` label; confirm pvc_backup_count exposes the same
# labels, otherwise the product is empty and the 999h fallback never appears.
PVC_BACKUP_AGE_HOURS_BY_PVC = (
    "sort_desc(max by (namespace, pvc) ("
    # <age>: seconds since last success converted to hours.
    '((time() - pvc_backup_last_success_timestamp_seconds{driver="restic"}) / 3600)'
    # Fill in only the label sets that <age> did not produce.
    " or on(namespace,pvc,volume,driver) "
    # <fallback>: unhealthy reason flag * has-history flag * 999.
    "((("
    '(pvc_backup_health_reason{driver="restic",reason=~"missing|no_completed|lookup_failed|unknown_timestamp"} > 0)'
    ' * (pvc_backup_count{driver="restic"} > bool 0)'
    ") * 999))"
    # Close `max by (...)` and `sort_desc(...)`.
    "))"
)
ANANKE_SELECTOR = 'job="ananke-power"'
ANANKE_UPS_DB_NAME = "Pyrphoros"
ANANKE_UPS_DB_NODE = "titan-db"
@ -2168,7 +2172,8 @@ def build_overview():
)
panels[-1]["links"] = link_to("atlas-storage")
panels[-1]["description"] = (
"Oldest successful backup age in hours by PVC. PVCs with missing or unhealthy backup state are forced to 999h so the red bars stay visible in the overview."
"Backup age in hours computed from last-success timestamps for restic-managed PVCs. "
"PVCs that have backup history but currently no successful backup (missing/no_completed/error) are pinned to 999h for visibility."
)
panels.append(
jenkins_weather_bargauge_panel(

View File

@ -2642,7 +2642,7 @@
},
"targets": [
{
"expr": "sort_desc(max by (namespace, pvc) (pvc_backup_age_hours or on(namespace, pvc) ((1 - pvc_backup_health) * 999)))",
"expr": "sort_desc(max by (namespace, pvc) (((time() - pvc_backup_last_success_timestamp_seconds{driver=\"restic\"}) / 3600) or on(namespace,pvc,volume,driver) ((((pvc_backup_health_reason{driver=\"restic\",reason=~\"missing|no_completed|lookup_failed|unknown_timestamp\"} > 0) * (pvc_backup_count{driver=\"restic\"} > bool 0)) * 999))))",
"refId": "A",
"legendFormat": "{{namespace}}/{{pvc}}",
"instant": true
@ -2706,7 +2706,7 @@
"targetBlank": true
}
],
"description": "Oldest successful backup age in hours by PVC. PVCs with missing or unhealthy backup state are forced to 999h so the red bars stay visible in the overview."
"description": "Backup age in hours computed from last-success timestamps for restic-managed PVCs. PVCs that have backup history but currently no successful backup (missing/no_completed/error) are pinned to 999h for visibility."
},
{
"id": 142,

View File

@ -2651,7 +2651,7 @@ data:
},
"targets": [
{
"expr": "sort_desc(max by (namespace, pvc) (pvc_backup_age_hours or on(namespace, pvc) ((1 - pvc_backup_health) * 999)))",
"expr": "sort_desc(max by (namespace, pvc) (((time() - pvc_backup_last_success_timestamp_seconds{driver=\"restic\"}) / 3600) or on(namespace,pvc,volume,driver) ((((pvc_backup_health_reason{driver=\"restic\",reason=~\"missing|no_completed|lookup_failed|unknown_timestamp\"} > 0) * (pvc_backup_count{driver=\"restic\"} > bool 0)) * 999))))",
"refId": "A",
"legendFormat": "{{namespace}}/{{pvc}}",
"instant": true
@ -2715,7 +2715,7 @@ data:
"targetBlank": true
}
],
"description": "Oldest successful backup age in hours by PVC. PVCs with missing or unhealthy backup state are forced to 999h so the red bars stay visible in the overview."
"description": "Backup age in hours computed from last-success timestamps for restic-managed PVCs. PVCs that have backup history but currently no successful backup (missing/no_completed/error) are pinned to 999h for visibility."
},
{
"id": 142,