diff --git a/scripts/dashboards_render_atlas.py b/scripts/dashboards_render_atlas.py index c55ba96c..bd40f6e4 100644 --- a/scripts/dashboards_render_atlas.py +++ b/scripts/dashboards_render_atlas.py @@ -1442,7 +1442,7 @@ def build_overview(): "decimals": 4, "text_mode": "value", "instant": True, - "description": "Rolling 365-day availability from vmalert's precomputed atlas:availability:ratio_365d series. Missing pre-telemetry slots are filled as 100% up, and Grafana reads the latest recorded value instead of recomputing a year of raw samples.", + "description": "Rolling 365-day availability from vmalert's precomputed atlas:availability:ratio_365d series. Missing slots before the first raw availability sample are filled as 100% up; missing slots after that first sample count as down.", }, { "id": 4, diff --git a/scripts/tests/test_dashboards_render_atlas.py b/scripts/tests/test_dashboards_render_atlas.py index 162827aa..9a9903a3 100644 --- a/scripts/tests/test_dashboards_render_atlas.py +++ b/scripts/tests/test_dashboards_render_atlas.py @@ -59,7 +59,7 @@ def test_overview_availability_panel_uses_recorded_365d_rollup(): assert panel["targets"][0]["expr"] == 'last_over_time(atlas:availability:ratio_365d{scope="atlas"}[30m])' assert panel["targets"][0]["instant"] is True assert "precomputed" in panel["description"] - assert "pre-telemetry" in panel["description"] + assert "after that first sample count as down" in panel["description"] def test_render_configmap_writes(tmp_path): diff --git a/services/monitoring/dashboards/atlas-overview.json b/services/monitoring/dashboards/atlas-overview.json index 373f456e..641dad19 100644 --- a/services/monitoring/dashboards/atlas-overview.json +++ b/services/monitoring/dashboards/atlas-overview.json @@ -283,7 +283,7 @@ }, "textMode": "value" }, - "description": "Rolling 365-day availability from vmalert's precomputed atlas:availability:ratio_365d series. Missing pre-telemetry slots are filled as 100% up, and Grafana reads the latest recorded value instead of recomputing a year of raw samples." + "description": "Rolling 365-day availability from vmalert's precomputed atlas:availability:ratio_365d series. Missing slots before the first raw availability sample are filled as 100% up; missing slots after that first sample count as down." }, { "id": 4, diff --git a/services/monitoring/grafana-dashboard-overview.yaml b/services/monitoring/grafana-dashboard-overview.yaml index 64d3bd88..4a5b9606 100644 --- a/services/monitoring/grafana-dashboard-overview.yaml +++ b/services/monitoring/grafana-dashboard-overview.yaml @@ -292,7 +292,7 @@ data: }, "textMode": "value" }, - "description": "Rolling 365-day availability from vmalert's precomputed atlas:availability:ratio_365d series. Missing pre-telemetry slots are filled as 100% up, and Grafana reads the latest recorded value instead of recomputing a year of raw samples." + "description": "Rolling 365-day availability from vmalert's precomputed atlas:availability:ratio_365d series. Missing slots before the first raw availability sample are filled as 100% up; missing slots after that first sample count as down." }, { "id": 4, diff --git a/services/monitoring/vmalert-atlas-availability.yaml b/services/monitoring/vmalert-atlas-availability.yaml index 272fe314..549ffcb1 100644 --- a/services/monitoring/vmalert-atlas-availability.yaml +++ b/services/monitoring/vmalert-atlas-availability.yaml @@ -30,23 +30,58 @@ data: rollup: hourly - record: atlas:availability:ratio_365d expr: | - ( + clamp_max(( ( - sum(sum_over_time(atlas:availability:ratio_1h{scope="atlas",rollup="hourly"}[365d])) + sum(sum_over_time(( + min( + ( + sum(kube_node_status_condition{condition="Ready",status="true",node=~"titan-0a|titan-0b|titan-0c"}) + / 3 + ), + ( + sum(kube_deployment_status_replicas_available{namespace=~"traefik|kube-system",deployment="traefik"}) + / clamp_min(sum(kube_deployment_spec_replicas{namespace=~"traefik|kube-system",deployment="traefik"}), 1) + ) + ) + )[365d:1h])) or on() vector(0) ) + clamp_min( - 35040 + 8761 - ( - sum(count_over_time(atlas:availability:ratio_1h{scope="atlas",rollup="hourly"}[365d])) - or on() vector(0) + clamp_min( + floor( + ( + time() + - + ( + min(min_over_time(timestamp( + min( + ( + sum(kube_node_status_condition{condition="Ready",status="true",node=~"titan-0a|titan-0b|titan-0c"}) + / 3 + ), + ( + sum(kube_deployment_status_replicas_available{namespace=~"traefik|kube-system",deployment="traefik"}) + / clamp_min(sum(kube_deployment_spec_replicas{namespace=~"traefik|kube-system",deployment="traefik"}), 1) + ) + ) + )[365d:1h])) + or on() vector(time() + 3600) + ) + ) + / 3600 + ) + + 1, + 0 + ) ), 0 ) ) - / 35040 + / 8761, 1) labels: scope: atlas rollup: yearly @@ -79,7 +114,7 @@ spec: labels: app: vmalert-atlas-availability annotations: - bstein.dev/rules-revision: "2026-05-10-availability-rollup-v4" + bstein.dev/rules-revision: "2026-05-10-availability-rollup-v5" spec: serviceAccountName: vmalert-atlas-availability affinity: