monitoring: fill pre-telemetry availability
This commit is contained in:
parent
7b656dbaeb
commit
e7213d9d1c
@ -314,7 +314,8 @@ STUCK_TERMINATING_EXPR = (
|
|||||||
)
|
)
|
||||||
UPTIME_WINDOW = "365d"
|
UPTIME_WINDOW = "365d"
|
||||||
# vmalert precomputes the expensive long-window rollup so Grafana only reads one compact series.
|
# vmalert precomputes the expensive long-window rollup so Grafana only reads one compact series.
|
||||||
UPTIME_RECORDING_EXPR = f'atlas:availability:ratio_{UPTIME_WINDOW}{{scope="atlas"}}'
|
UPTIME_RECORDING_METRIC = f'atlas:availability:ratio_{UPTIME_WINDOW}{{scope="atlas"}}'
|
||||||
|
UPTIME_RECORDING_EXPR = f"last_over_time({UPTIME_RECORDING_METRIC}[30m])"
|
||||||
TRAEFIK_READY_EXPR = (
|
TRAEFIK_READY_EXPR = (
|
||||||
"("
|
"("
|
||||||
'sum(kube_deployment_status_replicas_available{namespace=~"traefik|kube-system",deployment="traefik"})'
|
'sum(kube_deployment_status_replicas_available{namespace=~"traefik|kube-system",deployment="traefik"})'
|
||||||
@ -1441,7 +1442,7 @@ def build_overview():
|
|||||||
"decimals": 4,
|
"decimals": 4,
|
||||||
"text_mode": "value",
|
"text_mode": "value",
|
||||||
"instant": True,
|
"instant": True,
|
||||||
"description": "Rolling 365-day availability from vmalert's precomputed atlas:availability:ratio_365d series. vmalert evaluates the larger rollup in the background so Grafana does not recompute a year of raw samples on every dashboard load.",
|
"description": "Rolling 365-day availability from vmalert's precomputed atlas:availability:ratio_365d series. Missing pre-telemetry slots are filled as 100% up, and Grafana reads the latest recorded value instead of recomputing a year of raw samples.",
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"id": 4,
|
"id": 4,
|
||||||
|
|||||||
@ -56,9 +56,10 @@ def test_overview_availability_panel_uses_recorded_365d_rollup():
|
|||||||
panel = next(panel for panel in flatten_panels(dashboard["panels"]) if panel["id"] == 27)
|
panel = next(panel for panel in flatten_panels(dashboard["panels"]) if panel["id"] == 27)
|
||||||
|
|
||||||
assert panel["title"] == "Atlas Availability (365d)"
|
assert panel["title"] == "Atlas Availability (365d)"
|
||||||
assert panel["targets"][0]["expr"] == 'atlas:availability:ratio_365d{scope="atlas"}'
|
assert panel["targets"][0]["expr"] == 'last_over_time(atlas:availability:ratio_365d{scope="atlas"}[30m])'
|
||||||
assert panel["targets"][0]["instant"] is True
|
assert panel["targets"][0]["instant"] is True
|
||||||
assert "precomputed" in panel["description"]
|
assert "precomputed" in panel["description"]
|
||||||
|
assert "pre-telemetry" in panel["description"]
|
||||||
|
|
||||||
|
|
||||||
def test_render_configmap_writes(tmp_path):
|
def test_render_configmap_writes(tmp_path):
|
||||||
|
|||||||
@ -226,7 +226,7 @@
|
|||||||
},
|
},
|
||||||
"targets": [
|
"targets": [
|
||||||
{
|
{
|
||||||
"expr": "atlas:availability:ratio_365d{scope=\"atlas\"}",
|
"expr": "last_over_time(atlas:availability:ratio_365d{scope=\"atlas\"}[30m])",
|
||||||
"refId": "A",
|
"refId": "A",
|
||||||
"instant": true
|
"instant": true
|
||||||
}
|
}
|
||||||
@ -283,7 +283,7 @@
|
|||||||
},
|
},
|
||||||
"textMode": "value"
|
"textMode": "value"
|
||||||
},
|
},
|
||||||
"description": "Rolling 365-day availability from vmalert's precomputed atlas:availability:ratio_365d series. vmalert evaluates the larger rollup in the background so Grafana does not recompute a year of raw samples on every dashboard load."
|
"description": "Rolling 365-day availability from vmalert's precomputed atlas:availability:ratio_365d series. Missing pre-telemetry slots are filled as 100% up, and Grafana reads the latest recorded value instead of recomputing a year of raw samples."
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"id": 4,
|
"id": 4,
|
||||||
|
|||||||
@ -235,7 +235,7 @@ data:
|
|||||||
},
|
},
|
||||||
"targets": [
|
"targets": [
|
||||||
{
|
{
|
||||||
"expr": "atlas:availability:ratio_365d{scope=\"atlas\"}",
|
"expr": "last_over_time(atlas:availability:ratio_365d{scope=\"atlas\"}[30m])",
|
||||||
"refId": "A",
|
"refId": "A",
|
||||||
"instant": true
|
"instant": true
|
||||||
}
|
}
|
||||||
@ -292,7 +292,7 @@ data:
|
|||||||
},
|
},
|
||||||
"textMode": "value"
|
"textMode": "value"
|
||||||
},
|
},
|
||||||
"description": "Rolling 365-day availability from vmalert's precomputed atlas:availability:ratio_365d series. vmalert evaluates the larger rollup in the background so Grafana does not recompute a year of raw samples on every dashboard load."
|
"description": "Rolling 365-day availability from vmalert's precomputed atlas:availability:ratio_365d series. Missing pre-telemetry slots are filled as 100% up, and Grafana reads the latest recorded value instead of recomputing a year of raw samples."
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"id": 4,
|
"id": 4,
|
||||||
|
|||||||
@ -9,7 +9,7 @@ data:
|
|||||||
groups:
|
groups:
|
||||||
- name: atlas.availability
|
- name: atlas.availability
|
||||||
interval: 15m
|
interval: 15m
|
||||||
eval_offset: 7m
|
eval_offset: 14m
|
||||||
rules:
|
rules:
|
||||||
- record: atlas:availability:ratio_1h
|
- record: atlas:availability:ratio_1h
|
||||||
expr: |
|
expr: |
|
||||||
@ -30,18 +30,23 @@ data:
|
|||||||
rollup: hourly
|
rollup: hourly
|
||||||
- record: atlas:availability:ratio_365d
|
- record: atlas:availability:ratio_365d
|
||||||
expr: |
|
expr: |
|
||||||
avg_over_time((
|
(
|
||||||
min(
|
(
|
||||||
(
|
sum(sum_over_time(atlas:availability:ratio_1h{scope="atlas",rollup="hourly"}[365d]))
|
||||||
sum(kube_node_status_condition{condition="Ready",status="true",node=~"titan-0a|titan-0b|titan-0c"})
|
or on() vector(0)
|
||||||
/ 3
|
|
||||||
),
|
|
||||||
(
|
|
||||||
sum(kube_deployment_status_replicas_available{namespace=~"traefik|kube-system",deployment="traefik"})
|
|
||||||
/ clamp_min(sum(kube_deployment_spec_replicas{namespace=~"traefik|kube-system",deployment="traefik"}), 1)
|
|
||||||
)
|
|
||||||
)
|
)
|
||||||
)[365d:6h])
|
+
|
||||||
|
clamp_min(
|
||||||
|
35040
|
||||||
|
-
|
||||||
|
(
|
||||||
|
sum(count_over_time(atlas:availability:ratio_1h{scope="atlas",rollup="hourly"}[365d]))
|
||||||
|
or on() vector(0)
|
||||||
|
),
|
||||||
|
0
|
||||||
|
)
|
||||||
|
)
|
||||||
|
/ 35040
|
||||||
labels:
|
labels:
|
||||||
scope: atlas
|
scope: atlas
|
||||||
rollup: yearly
|
rollup: yearly
|
||||||
@ -74,7 +79,7 @@ spec:
|
|||||||
labels:
|
labels:
|
||||||
app: vmalert-atlas-availability
|
app: vmalert-atlas-availability
|
||||||
annotations:
|
annotations:
|
||||||
bstein.dev/rules-revision: "2026-05-10-availability-rollup-v2"
|
bstein.dev/rules-revision: "2026-05-10-availability-rollup-v4"
|
||||||
spec:
|
spec:
|
||||||
serviceAccountName: vmalert-atlas-availability
|
serviceAccountName: vmalert-atlas-availability
|
||||||
affinity:
|
affinity:
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user