diff --git a/scripts/dashboards_render_atlas.py b/scripts/dashboards_render_atlas.py index bd40f6e4..a2c61604 100644 --- a/scripts/dashboards_render_atlas.py +++ b/scripts/dashboards_render_atlas.py @@ -1442,7 +1442,7 @@ def build_overview(): "decimals": 4, "text_mode": "value", "instant": True, - "description": "Rolling 365-day availability from vmalert's precomputed atlas:availability:ratio_365d series. Missing slots before the first raw availability sample are filled as 100% up; missing slots after that first sample count as down.", + "description": "Rolling 365-day availability from vmalert's precomputed atlas:availability:ratio_365d series. Missing slots before the first raw availability sample are filled as 100% up; observed down samples count as down, while scrape gaps are ignored.", }, { "id": 4, diff --git a/scripts/tests/test_dashboards_render_atlas.py b/scripts/tests/test_dashboards_render_atlas.py index 9a9903a3..8e78bfbb 100644 --- a/scripts/tests/test_dashboards_render_atlas.py +++ b/scripts/tests/test_dashboards_render_atlas.py @@ -59,7 +59,7 @@ def test_overview_availability_panel_uses_recorded_365d_rollup(): assert panel["targets"][0]["expr"] == 'last_over_time(atlas:availability:ratio_365d{scope="atlas"}[30m])' assert panel["targets"][0]["instant"] is True assert "precomputed" in panel["description"] - assert "after that first sample count as down" in panel["description"] + assert "scrape gaps are ignored" in panel["description"] def test_render_configmap_writes(tmp_path): diff --git a/services/monitoring/dashboards/atlas-overview.json b/services/monitoring/dashboards/atlas-overview.json index 641dad19..9df0eb83 100644 --- a/services/monitoring/dashboards/atlas-overview.json +++ b/services/monitoring/dashboards/atlas-overview.json @@ -283,7 +283,7 @@ }, "textMode": "value" }, - "description": "Rolling 365-day availability from vmalert's precomputed atlas:availability:ratio_365d series. Missing slots before the first raw availability sample are filled as 100% up; missing slots after that first sample count as down." + "description": "Rolling 365-day availability from vmalert's precomputed atlas:availability:ratio_365d series. Missing slots before the first raw availability sample are filled as 100% up; observed down samples count as down, while scrape gaps are ignored." }, { "id": 4, diff --git a/services/monitoring/grafana-dashboard-overview.yaml b/services/monitoring/grafana-dashboard-overview.yaml index 4a5b9606..3013cd35 100644 --- a/services/monitoring/grafana-dashboard-overview.yaml +++ b/services/monitoring/grafana-dashboard-overview.yaml @@ -292,7 +292,7 @@ data: }, "textMode": "value" }, - "description": "Rolling 365-day availability from vmalert's precomputed atlas:availability:ratio_365d series. Missing slots before the first raw availability sample are filled as 100% up; missing slots after that first sample count as down." + "description": "Rolling 365-day availability from vmalert's precomputed atlas:availability:ratio_365d series. Missing slots before the first raw availability sample are filled as 100% up; observed down samples count as down, while scrape gaps are ignored." }, { "id": 4, diff --git a/services/monitoring/vmalert-atlas-availability.yaml b/services/monitoring/vmalert-atlas-availability.yaml index 549ffcb1..09a5cf29 100644 --- a/services/monitoring/vmalert-atlas-availability.yaml +++ b/services/monitoring/vmalert-atlas-availability.yaml @@ -81,7 +81,61 @@ data: 0 ) ) - / 8761, 1) + / + clamp_min( + ( + ( + sum(count_over_time(( + min( + ( + sum(kube_node_status_condition{condition="Ready",status="true",node=~"titan-0a|titan-0b|titan-0c"}) + / 3 + ), + ( + sum(kube_deployment_status_replicas_available{namespace=~"traefik|kube-system",deployment="traefik"}) + / clamp_min(sum(kube_deployment_spec_replicas{namespace=~"traefik|kube-system",deployment="traefik"}), 1) + ) + ) + )[365d:1h])) + or on() vector(0) + ) + + + clamp_min( + 8761 + - + ( + clamp_min( + floor( + ( + time() + - + ( + min(min_over_time(timestamp( + min( + ( + sum(kube_node_status_condition{condition="Ready",status="true",node=~"titan-0a|titan-0b|titan-0c"}) + / 3 + ), + ( + sum(kube_deployment_status_replicas_available{namespace=~"traefik|kube-system",deployment="traefik"}) + / clamp_min(sum(kube_deployment_spec_replicas{namespace=~"traefik|kube-system",deployment="traefik"}), 1) + ) + ) + )[365d:1h])) + or on() vector(time() + 3600) + ) + ) + / 3600 + ) + + 1, + 0 + ) + ), + 0 + ) + ), + 1 + ), 1) labels: scope: atlas rollup: yearly @@ -114,7 +168,7 @@ spec: labels: app: vmalert-atlas-availability annotations: - bstein.dev/rules-revision: "2026-05-10-availability-rollup-v5" + bstein.dev/rules-revision: "2026-05-10-availability-rollup-v6" spec: serviceAccountName: vmalert-atlas-availability affinity: