From a2172f56ec4c3b625334aac8a446d99b8ca56abb Mon Sep 17 00:00:00 2001 From: Brad Stein Date: Mon, 13 Apr 2026 05:33:28 -0300 Subject: [PATCH] monitoring(overview): fix pvc backup health/age panel query --- scripts/dashboards_render_atlas.py | 9 +++++++-- services/monitoring/dashboards/atlas-overview.json | 4 ++-- services/monitoring/grafana-dashboard-overview.yaml | 4 ++-- 3 files changed, 11 insertions(+), 6 deletions(-) diff --git a/scripts/dashboards_render_atlas.py b/scripts/dashboards_render_atlas.py index 50c8670f..6043a79a 100644 --- a/scripts/dashboards_render_atlas.py +++ b/scripts/dashboards_render_atlas.py @@ -581,7 +581,11 @@ QUALITY_GATE_SMELL_INFRACTIONS_BY_SUITE = ( QUALITY_GATE_SMELL_INFRACTIONS_BY_SUITE_WITH_MISSING = ( f"({QUALITY_GATE_SMELL_INFRACTIONS_BY_SUITE}) or on(suite) (0 * ({QUALITY_GATE_SUITE_INDEX_30D}) - 1)" ) -PVC_BACKUP_AGE_HOURS_BY_PVC = "sort_desc(max by (namespace, pvc) (pvc_backup_age_hours or on(namespace, pvc) ((1 - pvc_backup_health) * 999)))" +PVC_BACKUP_AGE_HOURS_BY_PVC = ( + 'sort_desc(max by (namespace, pvc) (((time() - pvc_backup_last_success_timestamp_seconds{driver="restic"}) / 3600) ' + 'or on(namespace,pvc,volume,driver) ((((pvc_backup_health_reason{driver="restic",reason=~"missing|no_completed|lookup_failed|unknown_timestamp"} > 0) ' + '* (pvc_backup_count{driver="restic"} > bool 0)) * 999))))' +) ANANKE_SELECTOR = 'job="ananke-power"' ANANKE_UPS_DB_NAME = "Pyrphoros" ANANKE_UPS_DB_NODE = "titan-db" @@ -2168,7 +2172,8 @@ def build_overview(): ) panels[-1]["links"] = link_to("atlas-storage") panels[-1]["description"] = ( - "Oldest successful backup age in hours by PVC. PVCs with missing or unhealthy backup state are forced to 999h so the red bars stay visible in the overview." + "Backup age in hours computed from last-success timestamps for restic-managed PVCs. " + "PVCs that have backup history but currently no successful backup (missing/no_completed/error) are pinned to 999h for visibility." ) panels.append( jenkins_weather_bargauge_panel( diff --git a/services/monitoring/dashboards/atlas-overview.json b/services/monitoring/dashboards/atlas-overview.json index 23a6b6d7..d7f6703a 100644 --- a/services/monitoring/dashboards/atlas-overview.json +++ b/services/monitoring/dashboards/atlas-overview.json @@ -2642,7 +2642,7 @@ }, "targets": [ { - "expr": "sort_desc(max by (namespace, pvc) (pvc_backup_age_hours or on(namespace, pvc) ((1 - pvc_backup_health) * 999)))", + "expr": "sort_desc(max by (namespace, pvc) (((time() - pvc_backup_last_success_timestamp_seconds{driver=\"restic\"}) / 3600) or on(namespace,pvc,volume,driver) ((((pvc_backup_health_reason{driver=\"restic\",reason=~\"missing|no_completed|lookup_failed|unknown_timestamp\"} > 0) * (pvc_backup_count{driver=\"restic\"} > bool 0)) * 999))))", "refId": "A", "legendFormat": "{{namespace}}/{{pvc}}", "instant": true @@ -2706,7 +2706,7 @@ "targetBlank": true } ], - "description": "Oldest successful backup age in hours by PVC. PVCs with missing or unhealthy backup state are forced to 999h so the red bars stay visible in the overview." + "description": "Backup age in hours computed from last-success timestamps for restic-managed PVCs. PVCs that have backup history but currently no successful backup (missing/no_completed/error) are pinned to 999h for visibility." }, { "id": 142, diff --git a/services/monitoring/grafana-dashboard-overview.yaml b/services/monitoring/grafana-dashboard-overview.yaml index 9336299c..c82cda24 100644 --- a/services/monitoring/grafana-dashboard-overview.yaml +++ b/services/monitoring/grafana-dashboard-overview.yaml @@ -2651,7 +2651,7 @@ data: }, "targets": [ { - "expr": "sort_desc(max by (namespace, pvc) (pvc_backup_age_hours or on(namespace, pvc) ((1 - pvc_backup_health) * 999)))", + "expr": "sort_desc(max by (namespace, pvc) (((time() - pvc_backup_last_success_timestamp_seconds{driver=\"restic\"}) / 3600) or on(namespace,pvc,volume,driver) ((((pvc_backup_health_reason{driver=\"restic\",reason=~\"missing|no_completed|lookup_failed|unknown_timestamp\"} > 0) * (pvc_backup_count{driver=\"restic\"} > bool 0)) * 999))))", "refId": "A", "legendFormat": "{{namespace}}/{{pvc}}", "instant": true @@ -2715,7 +2715,7 @@ data: "targetBlank": true } ], - "description": "Oldest successful backup age in hours by PVC. PVCs with missing or unhealthy backup state are forced to 999h so the red bars stay visible in the overview." + "description": "Backup age in hours computed from last-success timestamps for restic-managed PVCs. PVCs that have backup history but currently no successful backup (missing/no_completed/error) are pinned to 999h for visibility." }, { "id": 142,