monitoring: count post-start availability gaps

This commit is contained in:
jenkins 2026-05-10 16:21:47 -03:00
parent e7213d9d1c
commit eb57c1fe0f
5 changed files with 46 additions and 11 deletions

View File

@@ -1442,7 +1442,7 @@ def build_overview():
"decimals": 4,
"text_mode": "value",
"instant": True,
"description": "Rolling 365-day availability from vmalert's precomputed atlas:availability:ratio_365d series. Missing pre-telemetry slots are filled as 100% up, and Grafana reads the latest recorded value instead of recomputing a year of raw samples.",
"description": "Rolling 365-day availability from vmalert's precomputed atlas:availability:ratio_365d series. Missing slots before the first raw availability sample are filled as 100% up; missing slots after that first sample count as down.",
},
{
"id": 4,

View File

@@ -59,7 +59,7 @@ def test_overview_availability_panel_uses_recorded_365d_rollup():
assert panel["targets"][0]["expr"] == 'last_over_time(atlas:availability:ratio_365d{scope="atlas"}[30m])'
assert panel["targets"][0]["instant"] is True
assert "precomputed" in panel["description"]
assert "pre-telemetry" in panel["description"]
assert "after that first sample count as down" in panel["description"]
def test_render_configmap_writes(tmp_path):

View File

@@ -283,7 +283,7 @@
},
"textMode": "value"
},
"description": "Rolling 365-day availability from vmalert's precomputed atlas:availability:ratio_365d series. Missing pre-telemetry slots are filled as 100% up, and Grafana reads the latest recorded value instead of recomputing a year of raw samples."
"description": "Rolling 365-day availability from vmalert's precomputed atlas:availability:ratio_365d series. Missing slots before the first raw availability sample are filled as 100% up; missing slots after that first sample count as down."
},
{
"id": 4,

View File

@@ -292,7 +292,7 @@ data:
},
"textMode": "value"
},
"description": "Rolling 365-day availability from vmalert's precomputed atlas:availability:ratio_365d series. Missing pre-telemetry slots are filled as 100% up, and Grafana reads the latest recorded value instead of recomputing a year of raw samples."
"description": "Rolling 365-day availability from vmalert's precomputed atlas:availability:ratio_365d series. Missing slots before the first raw availability sample are filled as 100% up; missing slots after that first sample count as down."
},
{
"id": 4,

View File

@@ -30,23 +30,58 @@ data:
rollup: hourly
- record: atlas:availability:ratio_365d
expr: |
(
clamp_max((
(
sum(sum_over_time(atlas:availability:ratio_1h{scope="atlas",rollup="hourly"}[365d]))
sum(sum_over_time((
min(
(
sum(kube_node_status_condition{condition="Ready",status="true",node=~"titan-0a|titan-0b|titan-0c"})
/ 3
),
(
sum(kube_deployment_status_replicas_available{namespace=~"traefik|kube-system",deployment="traefik"})
/ clamp_min(sum(kube_deployment_spec_replicas{namespace=~"traefik|kube-system",deployment="traefik"}), 1)
)
)
)[365d:1h]))
or on() vector(0)
)
+
clamp_min(
35040
8761
-
(
sum(count_over_time(atlas:availability:ratio_1h{scope="atlas",rollup="hourly"}[365d]))
or on() vector(0)
clamp_min(
floor(
(
time()
-
(
min(min_over_time(timestamp(
min(
(
sum(kube_node_status_condition{condition="Ready",status="true",node=~"titan-0a|titan-0b|titan-0c"})
/ 3
),
(
sum(kube_deployment_status_replicas_available{namespace=~"traefik|kube-system",deployment="traefik"})
/ clamp_min(sum(kube_deployment_spec_replicas{namespace=~"traefik|kube-system",deployment="traefik"}), 1)
)
)
)[365d:1h]))
or on() vector(time() + 3600)
)
)
/ 3600
)
+ 1,
0
)
),
0
)
)
/ 35040
/ 8761, 1)
labels:
scope: atlas
rollup: yearly
@@ -79,7 +114,7 @@ spec:
labels:
app: vmalert-atlas-availability
annotations:
bstein.dev/rules-revision: "2026-05-10-availability-rollup-v4"
bstein.dev/rules-revision: "2026-05-10-availability-rollup-v5"
spec:
serviceAccountName: vmalert-atlas-availability
affinity: