monitoring: count post-start availability gaps

This commit is contained in:
jenkins 2026-05-10 16:21:47 -03:00
parent e7213d9d1c
commit eb57c1fe0f
5 changed files with 46 additions and 11 deletions

View File

@@ -1442,7 +1442,7 @@ def build_overview():
"decimals": 4, "decimals": 4,
"text_mode": "value", "text_mode": "value",
"instant": True, "instant": True,
"description": "Rolling 365-day availability from vmalert's precomputed atlas:availability:ratio_365d series. Missing pre-telemetry slots are filled as 100% up, and Grafana reads the latest recorded value instead of recomputing a year of raw samples.", "description": "Rolling 365-day availability from vmalert's precomputed atlas:availability:ratio_365d series. Missing slots before the first raw availability sample are filled as 100% up; missing slots after that first sample count as down.",
}, },
{ {
"id": 4, "id": 4,

View File

@@ -59,7 +59,7 @@ def test_overview_availability_panel_uses_recorded_365d_rollup():
assert panel["targets"][0]["expr"] == 'last_over_time(atlas:availability:ratio_365d{scope="atlas"}[30m])' assert panel["targets"][0]["expr"] == 'last_over_time(atlas:availability:ratio_365d{scope="atlas"}[30m])'
assert panel["targets"][0]["instant"] is True assert panel["targets"][0]["instant"] is True
assert "precomputed" in panel["description"] assert "precomputed" in panel["description"]
assert "pre-telemetry" in panel["description"] assert "after that first sample count as down" in panel["description"]
def test_render_configmap_writes(tmp_path): def test_render_configmap_writes(tmp_path):

View File

@@ -283,7 +283,7 @@
}, },
"textMode": "value" "textMode": "value"
}, },
"description": "Rolling 365-day availability from vmalert's precomputed atlas:availability:ratio_365d series. Missing pre-telemetry slots are filled as 100% up, and Grafana reads the latest recorded value instead of recomputing a year of raw samples." "description": "Rolling 365-day availability from vmalert's precomputed atlas:availability:ratio_365d series. Missing slots before the first raw availability sample are filled as 100% up; missing slots after that first sample count as down."
}, },
{ {
"id": 4, "id": 4,

View File

@@ -292,7 +292,7 @@ data:
}, },
"textMode": "value" "textMode": "value"
}, },
"description": "Rolling 365-day availability from vmalert's precomputed atlas:availability:ratio_365d series. Missing pre-telemetry slots are filled as 100% up, and Grafana reads the latest recorded value instead of recomputing a year of raw samples." "description": "Rolling 365-day availability from vmalert's precomputed atlas:availability:ratio_365d series. Missing slots before the first raw availability sample are filled as 100% up; missing slots after that first sample count as down."
}, },
{ {
"id": 4, "id": 4,

View File

@@ -30,23 +30,58 @@ data:
rollup: hourly rollup: hourly
- record: atlas:availability:ratio_365d - record: atlas:availability:ratio_365d
expr: | expr: |
( clamp_max((
( (
sum(sum_over_time(atlas:availability:ratio_1h{scope="atlas",rollup="hourly"}[365d])) sum(sum_over_time((
min(
(
sum(kube_node_status_condition{condition="Ready",status="true",node=~"titan-0a|titan-0b|titan-0c"})
/ 3
),
(
sum(kube_deployment_status_replicas_available{namespace=~"traefik|kube-system",deployment="traefik"})
/ clamp_min(sum(kube_deployment_spec_replicas{namespace=~"traefik|kube-system",deployment="traefik"}), 1)
)
)
)[365d:1h]))
or on() vector(0) or on() vector(0)
) )
+ +
clamp_min( clamp_min(
35040 8761
- -
( (
sum(count_over_time(atlas:availability:ratio_1h{scope="atlas",rollup="hourly"}[365d])) clamp_min(
or on() vector(0) floor(
(
time()
-
(
min(min_over_time(timestamp(
min(
(
sum(kube_node_status_condition{condition="Ready",status="true",node=~"titan-0a|titan-0b|titan-0c"})
/ 3
),
(
sum(kube_deployment_status_replicas_available{namespace=~"traefik|kube-system",deployment="traefik"})
/ clamp_min(sum(kube_deployment_spec_replicas{namespace=~"traefik|kube-system",deployment="traefik"}), 1)
)
)
)[365d:1h]))
or on() vector(time() + 3600)
)
)
/ 3600
)
+ 1,
0
)
), ),
0 0
) )
) )
/ 35040 / 8761, 1)
labels: labels:
scope: atlas scope: atlas
rollup: yearly rollup: yearly
@@ -79,7 +114,7 @@ spec:
labels: labels:
app: vmalert-atlas-availability app: vmalert-atlas-availability
annotations: annotations:
bstein.dev/rules-revision: "2026-05-10-availability-rollup-v4" bstein.dev/rules-revision: "2026-05-10-availability-rollup-v5"
spec: spec:
serviceAccountName: vmalert-atlas-availability serviceAccountName: vmalert-atlas-availability
affinity: affinity: