monitoring: precompute atlas availability rollup

This commit is contained in:
jenkins 2026-05-10 15:40:12 -03:00
parent 32ffe30145
commit 7f7dde01de
6 changed files with 158 additions and 17 deletions

View File

@ -312,9 +312,9 @@ STUCK_TERMINATING_EXPR = (
')) '
"or on() vector(0)"
)
UPTIME_WINDOW = "30d"
# Keep the subquery step coarse so we don't request an excessive number of points.
UPTIME_STEP = "1h"
UPTIME_WINDOW = "365d"
# vmalert precomputes the expensive long-window rollup so Grafana only reads one compact series.
UPTIME_RECORDING_EXPR = f'atlas:availability:ratio_{UPTIME_WINDOW}{{scope="atlas"}}'
TRAEFIK_READY_EXPR = (
"("
'sum(kube_deployment_status_replicas_available{namespace=~"traefik|kube-system",deployment="traefik"})'
@ -335,7 +335,7 @@ NODE_TIEBREAKER = " + ".join(
f"({node_filter(node)}) * 1e-6 * {idx}"
for idx, node in enumerate(CONTROL_ALL + WORKER_NODES, start=1)
)
UPTIME_AVG_EXPR = f"avg_over_time(({UPTIME_AVAIL_EXPR})[{UPTIME_WINDOW}:{UPTIME_STEP}])"
UPTIME_AVG_EXPR = UPTIME_RECORDING_EXPR
UPTIME_PERCENT_EXPR = UPTIME_AVG_EXPR
UPTIME_NINES_EXPR = f"-log10(1 - clamp_max({UPTIME_AVG_EXPR}, 0.999999999))"
UPTIME_THRESHOLDS = {
@ -1433,7 +1433,7 @@ def build_overview():
},
{
"id": 27,
"title": "Atlas Availability (30d)",
"title": "Atlas Availability (365d)",
"expr": UPTIME_PERCENT_EXPR,
"kind": "stat",
"thresholds": UPTIME_PERCENT_THRESHOLDS,
@ -1441,7 +1441,7 @@ def build_overview():
"decimals": 4,
"text_mode": "value",
"instant": True,
"description": "Rolling 30-day availability from control-plane readiness and Traefik readiness. Kept to 30d so missing pre-metric history is not counted as downtime.",
"description": "Rolling 365-day availability from vmalert's precomputed atlas:availability:ratio_365d series. vmalert evaluates the larger rollup in the background so Grafana does not recompute a year of raw samples on every dashboard load.",
},
{
"id": 4,

View File

@ -50,16 +50,15 @@ def test_node_filter_and_expr_helpers():
assert "node_memory_MemAvailable_bytes" in mem_expr
def test_overview_availability_panel_is_recent_and_instant():
def test_overview_availability_panel_uses_recorded_365d_rollup():
mod = load_module()
dashboard = mod.build_overview()
panel = next(panel for panel in flatten_panels(dashboard["panels"]) if panel["id"] == 27)
assert panel["title"] == "Atlas Availability (30d)"
assert "[30d:1h]" in panel["targets"][0]["expr"]
assert "365d" not in panel["targets"][0]["expr"]
assert panel["title"] == "Atlas Availability (365d)"
assert panel["targets"][0]["expr"] == 'atlas:availability:ratio_365d{scope="atlas"}'
assert panel["targets"][0]["instant"] is True
assert "pre-metric history" in panel["description"]
assert "precomputed" in panel["description"]
def test_render_configmap_writes(tmp_path):

View File

@ -213,7 +213,7 @@
{
"id": 27,
"type": "stat",
"title": "Atlas Availability (30d)",
"title": "Atlas Availability (365d)",
"datasource": {
"type": "prometheus",
"uid": "atlas-vm"
@ -226,7 +226,7 @@
},
"targets": [
{
"expr": "avg_over_time((min(((sum(kube_node_status_condition{condition=\"Ready\",status=\"true\",node=~\"titan-0a|titan-0b|titan-0c\"}) / 3)), ((sum(kube_deployment_status_replicas_available{namespace=~\"traefik|kube-system\",deployment=\"traefik\"}) / clamp_min(sum(kube_deployment_spec_replicas{namespace=~\"traefik|kube-system\",deployment=\"traefik\"}), 1)))))[30d:1h])",
"expr": "atlas:availability:ratio_365d{scope=\"atlas\"}",
"refId": "A",
"instant": true
}
@ -283,7 +283,7 @@
},
"textMode": "value"
},
"description": "Rolling 30-day availability from control-plane readiness and Traefik readiness. Kept to 30d so missing pre-metric history is not counted as downtime."
"description": "Rolling 365-day availability from vmalert's precomputed atlas:availability:ratio_365d series. vmalert evaluates the larger rollup in the background so Grafana does not recompute a year of raw samples on every dashboard load."
},
{
"id": 4,

View File

@ -222,7 +222,7 @@ data:
{
"id": 27,
"type": "stat",
"title": "Atlas Availability (30d)",
"title": "Atlas Availability (365d)",
"datasource": {
"type": "prometheus",
"uid": "atlas-vm"
@ -235,7 +235,7 @@ data:
},
"targets": [
{
"expr": "avg_over_time((min(((sum(kube_node_status_condition{condition=\"Ready\",status=\"true\",node=~\"titan-0a|titan-0b|titan-0c\"}) / 3)), ((sum(kube_deployment_status_replicas_available{namespace=~\"traefik|kube-system\",deployment=\"traefik\"}) / clamp_min(sum(kube_deployment_spec_replicas{namespace=~\"traefik|kube-system\",deployment=\"traefik\"}), 1)))))[30d:1h])",
"expr": "atlas:availability:ratio_365d{scope=\"atlas\"}",
"refId": "A",
"instant": true
}
@ -292,7 +292,7 @@ data:
},
"textMode": "value"
},
"description": "Rolling 30-day availability from control-plane readiness and Traefik readiness. Kept to 30d so missing pre-metric history is not counted as downtime."
"description": "Rolling 365-day availability from vmalert's precomputed atlas:availability:ratio_365d series. vmalert evaluates the larger rollup in the background so Grafana does not recompute a year of raw samples on every dashboard load."
},
{
"id": 4,

View File

@ -17,6 +17,7 @@ resources:
- grafana-dashboard-mail.yaml
- grafana-dashboard-jobs.yaml
- grafana-dashboard-testing.yaml
- vmalert-atlas-availability.yaml
- dcgm-exporter.yaml
- jetson-tegrastats-exporter.yaml
- postmark-exporter-service.yaml

View File

@ -0,0 +1,141 @@
# services/monitoring/vmalert-atlas-availability.yaml
#
# Rule file mounted into the vmalert-atlas-availability pod. Both rules record
# the same availability signal: the lower of (ready control-plane nodes / 3)
# and (available / desired Traefik replicas), averaged over a trailing window.
kind: ConfigMap
apiVersion: v1
metadata:
  namespace: monitoring
  name: vmalert-atlas-availability-rules
data:
  atlas-availability.yaml: |
    groups:
    - name: atlas.availability
      interval: 15m
      rules:
      - record: atlas:availability:ratio_1h
        expr: |
          avg_over_time((
          min(
          (
          sum(kube_node_status_condition{condition="Ready",status="true",node=~"titan-0a|titan-0b|titan-0c"})
          / 3
          ),
          (
          sum(kube_deployment_status_replicas_available{namespace=~"traefik|kube-system",deployment="traefik"})
          / clamp_min(sum(kube_deployment_spec_replicas{namespace=~"traefik|kube-system",deployment="traefik"}), 1)
          )
          )
          )[1h:5m])
        labels:
          scope: atlas
          rollup: hourly
      - record: atlas:availability:ratio_365d
        expr: |
          avg_over_time((
          min(
          (
          sum(kube_node_status_condition{condition="Ready",status="true",node=~"titan-0a|titan-0b|titan-0c"})
          / 3
          ),
          (
          sum(kube_deployment_status_replicas_available{namespace=~"traefik|kube-system",deployment="traefik"})
          / clamp_min(sum(kube_deployment_spec_replicas{namespace=~"traefik|kube-system",deployment="traefik"}), 1)
          )
          )
          )[365d:6h])
        labels:
          scope: atlas
          rollup: yearly
---
# Identity for the vmalert-atlas-availability pod. vmalert only talks HTTP to
# VictoriaMetrics and never calls the Kubernetes API, so no API token is
# mounted into the pod.
apiVersion: v1
kind: ServiceAccount
metadata:
  name: vmalert-atlas-availability
  namespace: monitoring
automountServiceAccountToken: false
---
# Single-replica vmalert that evaluates the atlas availability recording rules
# and writes the resulting series back to VictoriaMetrics.
apiVersion: apps/v1
kind: Deployment
metadata:
  name: vmalert-atlas-availability
  namespace: monitoring
  labels:
    app: vmalert-atlas-availability
spec:
  replicas: 1
  revisionHistoryLimit: 3
  selector:
    matchLabels:
      app: vmalert-atlas-availability
  template:
    metadata:
      labels:
        app: vmalert-atlas-availability
    spec:
      serviceAccountName: vmalert-atlas-availability
      affinity:
        nodeAffinity:
          # Hard requirement: never schedule onto titan-22 or titan-24.
          requiredDuringSchedulingIgnoredDuringExecution:
            nodeSelectorTerms:
              - matchExpressions:
                  - key: kubernetes.io/hostname
                    operator: NotIn
                    values:
                      - titan-22
                      - titan-24
      containers:
        - name: vmalert
          image: victoriametrics/vmalert:v1.113.0
          args:
            - -datasource.url=http://victoria-metrics-single-server:8428
            # Base URL only: vmalert appends /api/v1/write itself (unless
            # -remoteWrite.disablePathAppend is set), so spelling the path out
            # here would send writes to /api/v1/write/api/v1/write.
            - -remoteWrite.url=http://victoria-metrics-single-server:8428
            - -rule=/etc/vmalert/rules/*.yaml
            # Rule files are not re-read by default; poll them so edits to the
            # mounted ConfigMap take effect without restarting the pod.
            - -configCheckInterval=5m
            - -evaluationInterval=15m
            - -httpListenAddr=:8880
          ports:
            - name: http
              containerPort: 8880
          readinessProbe:
            tcpSocket:
              port: http
            initialDelaySeconds: 5
            periodSeconds: 10
          livenessProbe:
            tcpSocket:
              port: http
            initialDelaySeconds: 20
            periodSeconds: 30
          resources:
            requests:
              cpu: 25m
              memory: 64Mi
            limits:
              cpu: 500m
              memory: 256Mi
          volumeMounts:
            - name: rules
              mountPath: /etc/vmalert/rules
              readOnly: true
      volumes:
        # Rule files come from the vmalert-atlas-availability-rules ConfigMap.
        - name: rules
          configMap:
            name: vmalert-atlas-availability-rules
---
# In-cluster endpoint for vmalert's HTTP listener (port 8880). The
# prometheus.io/* annotations mark the port for annotation-based scrape
# discovery, assuming the cluster's scraper honours them.
kind: Service
apiVersion: v1
metadata:
  namespace: monitoring
  name: vmalert-atlas-availability
  annotations:
    prometheus.io/port: "8880"
    prometheus.io/scrape: "true"
spec:
  ports:
    - name: http
      targetPort: http
      port: 8880
  selector:
    app: vmalert-atlas-availability