From e7213d9d1c829dd4219c28ed8bc6e2ef196352a8 Mon Sep 17 00:00:00 2001 From: jenkins Date: Sun, 10 May 2026 16:13:13 -0300 Subject: [PATCH] monitoring: fill pre-telemetry availability --- scripts/dashboards_render_atlas.py | 5 +-- scripts/tests/test_dashboards_render_atlas.py | 3 +- .../monitoring/dashboards/atlas-overview.json | 4 +-- .../grafana-dashboard-overview.yaml | 4 +-- .../vmalert-atlas-availability.yaml | 31 +++++++++++-------- 5 files changed, 27 insertions(+), 20 deletions(-) diff --git a/scripts/dashboards_render_atlas.py b/scripts/dashboards_render_atlas.py index 45bc9dad..c55ba96c 100644 --- a/scripts/dashboards_render_atlas.py +++ b/scripts/dashboards_render_atlas.py @@ -314,7 +314,8 @@ STUCK_TERMINATING_EXPR = ( ) UPTIME_WINDOW = "365d" # vmalert precomputes the expensive long-window rollup so Grafana only reads one compact series. -UPTIME_RECORDING_EXPR = f'atlas:availability:ratio_{UPTIME_WINDOW}{{scope="atlas"}}' +UPTIME_RECORDING_METRIC = f'atlas:availability:ratio_{UPTIME_WINDOW}{{scope="atlas"}}' +UPTIME_RECORDING_EXPR = f"last_over_time({UPTIME_RECORDING_METRIC}[30m])" TRAEFIK_READY_EXPR = ( "(" 'sum(kube_deployment_status_replicas_available{namespace=~"traefik|kube-system",deployment="traefik"})' @@ -1441,7 +1442,7 @@ def build_overview(): "decimals": 4, "text_mode": "value", "instant": True, - "description": "Rolling 365-day availability from vmalert's precomputed atlas:availability:ratio_365d series. vmalert evaluates the larger rollup in the background so Grafana does not recompute a year of raw samples on every dashboard load.", + "description": "Rolling 365-day availability from vmalert's precomputed atlas:availability:ratio_365d series. Missing pre-telemetry slots are filled as 100% up, and Grafana reads the latest recorded value instead of recomputing a year of raw samples.", }, { "id": 4, diff --git a/scripts/tests/test_dashboards_render_atlas.py b/scripts/tests/test_dashboards_render_atlas.py index 2c806623..162827aa 100644 --- a/scripts/tests/test_dashboards_render_atlas.py +++ b/scripts/tests/test_dashboards_render_atlas.py @@ -56,9 +56,10 @@ def test_overview_availability_panel_uses_recorded_365d_rollup(): panel = next(panel for panel in flatten_panels(dashboard["panels"]) if panel["id"] == 27) assert panel["title"] == "Atlas Availability (365d)" - assert panel["targets"][0]["expr"] == 'atlas:availability:ratio_365d{scope="atlas"}' + assert panel["targets"][0]["expr"] == 'last_over_time(atlas:availability:ratio_365d{scope="atlas"}[30m])' assert panel["targets"][0]["instant"] is True assert "precomputed" in panel["description"] + assert "pre-telemetry" in panel["description"] def test_render_configmap_writes(tmp_path): diff --git a/services/monitoring/dashboards/atlas-overview.json b/services/monitoring/dashboards/atlas-overview.json index e9226de3..373f456e 100644 --- a/services/monitoring/dashboards/atlas-overview.json +++ b/services/monitoring/dashboards/atlas-overview.json @@ -226,7 +226,7 @@ }, "targets": [ { - "expr": "atlas:availability:ratio_365d{scope=\"atlas\"}", + "expr": "last_over_time(atlas:availability:ratio_365d{scope=\"atlas\"}[30m])", "refId": "A", "instant": true } @@ -283,7 +283,7 @@ }, "textMode": "value" }, - "description": "Rolling 365-day availability from vmalert's precomputed atlas:availability:ratio_365d series. vmalert evaluates the larger rollup in the background so Grafana does not recompute a year of raw samples on every dashboard load." + "description": "Rolling 365-day availability from vmalert's precomputed atlas:availability:ratio_365d series. Missing pre-telemetry slots are filled as 100% up, and Grafana reads the latest recorded value instead of recomputing a year of raw samples." }, { "id": 4, diff --git a/services/monitoring/grafana-dashboard-overview.yaml b/services/monitoring/grafana-dashboard-overview.yaml index c28addae..64d3bd88 100644 --- a/services/monitoring/grafana-dashboard-overview.yaml +++ b/services/monitoring/grafana-dashboard-overview.yaml @@ -235,7 +235,7 @@ data: }, "targets": [ { - "expr": "atlas:availability:ratio_365d{scope=\"atlas\"}", + "expr": "last_over_time(atlas:availability:ratio_365d{scope=\"atlas\"}[30m])", "refId": "A", "instant": true } @@ -292,7 +292,7 @@ data: }, "textMode": "value" }, - "description": "Rolling 365-day availability from vmalert's precomputed atlas:availability:ratio_365d series. vmalert evaluates the larger rollup in the background so Grafana does not recompute a year of raw samples on every dashboard load." + "description": "Rolling 365-day availability from vmalert's precomputed atlas:availability:ratio_365d series. Missing pre-telemetry slots are filled as 100% up, and Grafana reads the latest recorded value instead of recomputing a year of raw samples." }, { "id": 4, diff --git a/services/monitoring/vmalert-atlas-availability.yaml b/services/monitoring/vmalert-atlas-availability.yaml index 5f986265..272fe314 100644 --- a/services/monitoring/vmalert-atlas-availability.yaml +++ b/services/monitoring/vmalert-atlas-availability.yaml @@ -9,7 +9,7 @@ data: groups: - name: atlas.availability interval: 15m - eval_offset: 7m + eval_offset: 14m rules: - record: atlas:availability:ratio_1h expr: | @@ -30,18 +30,23 @@ data: rollup: hourly - record: atlas:availability:ratio_365d expr: | - avg_over_time(( - min( - ( - sum(kube_node_status_condition{condition="Ready",status="true",node=~"titan-0a|titan-0b|titan-0c"}) - / 3 - ), - ( - sum(kube_deployment_status_replicas_available{namespace=~"traefik|kube-system",deployment="traefik"}) - / clamp_min(sum(kube_deployment_spec_replicas{namespace=~"traefik|kube-system",deployment="traefik"}), 1) - ) + ( + ( + sum(sum_over_time(atlas:availability:ratio_1h{scope="atlas",rollup="hourly"}[365d])) + or on() vector(0) ) - )[365d:6h]) + + + clamp_min( + 35040 + - + ( + sum(count_over_time(atlas:availability:ratio_1h{scope="atlas",rollup="hourly"}[365d])) + or on() vector(0) + ), + 0 + ) + ) + / 35040 labels: scope: atlas rollup: yearly @@ -74,7 +79,7 @@ spec: labels: app: vmalert-atlas-availability annotations: - bstein.dev/rules-revision: "2026-05-10-availability-rollup-v2" + bstein.dev/rules-revision: "2026-05-10-availability-rollup-v4" spec: serviceAccountName: vmalert-atlas-availability affinity: