monitoring: fill pre-telemetry availability

This commit is contained in:
jenkins 2026-05-10 16:13:13 -03:00
parent 7b656dbaeb
commit e7213d9d1c
5 changed files with 27 additions and 20 deletions

View File

@ -314,7 +314,8 @@ STUCK_TERMINATING_EXPR = (
) )
UPTIME_WINDOW = "365d" UPTIME_WINDOW = "365d"
# vmalert precomputes the expensive long-window rollup so Grafana only reads one compact series. # vmalert precomputes the expensive long-window rollup so Grafana only reads one compact series.
UPTIME_RECORDING_EXPR = f'atlas:availability:ratio_{UPTIME_WINDOW}{{scope="atlas"}}' UPTIME_RECORDING_METRIC = f'atlas:availability:ratio_{UPTIME_WINDOW}{{scope="atlas"}}'
UPTIME_RECORDING_EXPR = f"last_over_time({UPTIME_RECORDING_METRIC}[30m])"
TRAEFIK_READY_EXPR = ( TRAEFIK_READY_EXPR = (
"(" "("
'sum(kube_deployment_status_replicas_available{namespace=~"traefik|kube-system",deployment="traefik"})' 'sum(kube_deployment_status_replicas_available{namespace=~"traefik|kube-system",deployment="traefik"})'
@ -1441,7 +1442,7 @@ def build_overview():
"decimals": 4, "decimals": 4,
"text_mode": "value", "text_mode": "value",
"instant": True, "instant": True,
"description": "Rolling 365-day availability from vmalert's precomputed atlas:availability:ratio_365d series. vmalert evaluates the larger rollup in the background so Grafana does not recompute a year of raw samples on every dashboard load.", "description": "Rolling 365-day availability from vmalert's precomputed atlas:availability:ratio_365d series. Missing pre-telemetry slots are filled as 100% up, and Grafana reads the latest recorded value instead of recomputing a year of raw samples.",
}, },
{ {
"id": 4, "id": 4,

View File

@ -56,9 +56,10 @@ def test_overview_availability_panel_uses_recorded_365d_rollup():
panel = next(panel for panel in flatten_panels(dashboard["panels"]) if panel["id"] == 27) panel = next(panel for panel in flatten_panels(dashboard["panels"]) if panel["id"] == 27)
assert panel["title"] == "Atlas Availability (365d)" assert panel["title"] == "Atlas Availability (365d)"
assert panel["targets"][0]["expr"] == 'atlas:availability:ratio_365d{scope="atlas"}' assert panel["targets"][0]["expr"] == 'last_over_time(atlas:availability:ratio_365d{scope="atlas"}[30m])'
assert panel["targets"][0]["instant"] is True assert panel["targets"][0]["instant"] is True
assert "precomputed" in panel["description"] assert "precomputed" in panel["description"]
assert "pre-telemetry" in panel["description"]
def test_render_configmap_writes(tmp_path): def test_render_configmap_writes(tmp_path):

View File

@ -226,7 +226,7 @@
}, },
"targets": [ "targets": [
{ {
"expr": "atlas:availability:ratio_365d{scope=\"atlas\"}", "expr": "last_over_time(atlas:availability:ratio_365d{scope=\"atlas\"}[30m])",
"refId": "A", "refId": "A",
"instant": true "instant": true
} }
@ -283,7 +283,7 @@
}, },
"textMode": "value" "textMode": "value"
}, },
"description": "Rolling 365-day availability from vmalert's precomputed atlas:availability:ratio_365d series. vmalert evaluates the larger rollup in the background so Grafana does not recompute a year of raw samples on every dashboard load." "description": "Rolling 365-day availability from vmalert's precomputed atlas:availability:ratio_365d series. Missing pre-telemetry slots are filled as 100% up, and Grafana reads the latest recorded value instead of recomputing a year of raw samples."
}, },
{ {
"id": 4, "id": 4,

View File

@ -235,7 +235,7 @@ data:
}, },
"targets": [ "targets": [
{ {
"expr": "atlas:availability:ratio_365d{scope=\"atlas\"}", "expr": "last_over_time(atlas:availability:ratio_365d{scope=\"atlas\"}[30m])",
"refId": "A", "refId": "A",
"instant": true "instant": true
} }
@ -292,7 +292,7 @@ data:
}, },
"textMode": "value" "textMode": "value"
}, },
"description": "Rolling 365-day availability from vmalert's precomputed atlas:availability:ratio_365d series. vmalert evaluates the larger rollup in the background so Grafana does not recompute a year of raw samples on every dashboard load." "description": "Rolling 365-day availability from vmalert's precomputed atlas:availability:ratio_365d series. Missing pre-telemetry slots are filled as 100% up, and Grafana reads the latest recorded value instead of recomputing a year of raw samples."
}, },
{ {
"id": 4, "id": 4,

View File

@ -9,7 +9,7 @@ data:
groups: groups:
- name: atlas.availability - name: atlas.availability
interval: 15m interval: 15m
eval_offset: 7m eval_offset: 14m
rules: rules:
- record: atlas:availability:ratio_1h - record: atlas:availability:ratio_1h
expr: | expr: |
@ -30,18 +30,23 @@ data:
rollup: hourly rollup: hourly
- record: atlas:availability:ratio_365d - record: atlas:availability:ratio_365d
expr: | expr: |
avg_over_time(( (
min( (
( sum(sum_over_time(atlas:availability:ratio_1h{scope="atlas",rollup="hourly"}[365d]))
sum(kube_node_status_condition{condition="Ready",status="true",node=~"titan-0a|titan-0b|titan-0c"}) or on() vector(0)
/ 3
),
(
sum(kube_deployment_status_replicas_available{namespace=~"traefik|kube-system",deployment="traefik"})
/ clamp_min(sum(kube_deployment_spec_replicas{namespace=~"traefik|kube-system",deployment="traefik"}), 1)
)
) )
)[365d:6h]) +
clamp_min(
35040
-
(
sum(count_over_time(atlas:availability:ratio_1h{scope="atlas",rollup="hourly"}[365d]))
or on() vector(0)
),
0
)
)
/ 35040
labels: labels:
scope: atlas scope: atlas
rollup: yearly rollup: yearly
@ -74,7 +79,7 @@ spec:
labels: labels:
app: vmalert-atlas-availability app: vmalert-atlas-availability
annotations: annotations:
bstein.dev/rules-revision: "2026-05-10-availability-rollup-v2" bstein.dev/rules-revision: "2026-05-10-availability-rollup-v4"
spec: spec:
serviceAccountName: vmalert-atlas-availability serviceAccountName: vmalert-atlas-availability
affinity: affinity: