monitoring: precompute atlas availability rollup
This commit is contained in:
parent
32ffe30145
commit
7f7dde01de
@ -312,9 +312,9 @@ STUCK_TERMINATING_EXPR = (
|
||||
')) '
|
||||
"or on() vector(0)"
|
||||
)
|
||||
UPTIME_WINDOW = "30d"
|
||||
# Keep the subquery step coarse so we don't request an excessive number of points.
|
||||
UPTIME_STEP = "1h"
|
||||
UPTIME_WINDOW = "365d"
|
||||
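# vmalert precomputes the expensive long-window rollup as the atlas:availability:ratio_365d recording rule (services/monitoring/vmalert-atlas-availability.yaml), so Grafana only reads one compact series.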
|
||||
UPTIME_RECORDING_EXPR = f'atlas:availability:ratio_{UPTIME_WINDOW}{{scope="atlas"}}'
|
||||
TRAEFIK_READY_EXPR = (
|
||||
"("
|
||||
'sum(kube_deployment_status_replicas_available{namespace=~"traefik|kube-system",deployment="traefik"})'
|
||||
@ -335,7 +335,7 @@ NODE_TIEBREAKER = " + ".join(
|
||||
f"({node_filter(node)}) * 1e-6 * {idx}"
|
||||
for idx, node in enumerate(CONTROL_ALL + WORKER_NODES, start=1)
|
||||
)
|
||||
UPTIME_AVG_EXPR = f"avg_over_time(({UPTIME_AVAIL_EXPR})[{UPTIME_WINDOW}:{UPTIME_STEP}])"
|
||||
UPTIME_AVG_EXPR = UPTIME_RECORDING_EXPR
|
||||
UPTIME_PERCENT_EXPR = UPTIME_AVG_EXPR
|
||||
UPTIME_NINES_EXPR = f"-log10(1 - clamp_max({UPTIME_AVG_EXPR}, 0.999999999))"
|
||||
UPTIME_THRESHOLDS = {
|
||||
@ -1433,7 +1433,7 @@ def build_overview():
|
||||
},
|
||||
{
|
||||
"id": 27,
|
||||
"title": "Atlas Availability (30d)",
|
||||
"title": "Atlas Availability (365d)",
|
||||
"expr": UPTIME_PERCENT_EXPR,
|
||||
"kind": "stat",
|
||||
"thresholds": UPTIME_PERCENT_THRESHOLDS,
|
||||
@ -1441,7 +1441,7 @@ def build_overview():
|
||||
"decimals": 4,
|
||||
"text_mode": "value",
|
||||
"instant": True,
|
||||
"description": "Rolling 30-day availability from control-plane readiness and Traefik readiness. Kept to 30d so missing pre-metric history is not counted as downtime.",
|
||||
"description": "Rolling 365-day availability from vmalert's precomputed atlas:availability:ratio_365d series. vmalert evaluates the larger rollup in the background so Grafana does not recompute a year of raw samples on every dashboard load.",
|
||||
},
|
||||
{
|
||||
"id": 4,
|
||||
|
||||
@ -50,16 +50,15 @@ def test_node_filter_and_expr_helpers():
|
||||
assert "node_memory_MemAvailable_bytes" in mem_expr
|
||||
|
||||
|
||||
def test_overview_availability_panel_is_recent_and_instant():
|
||||
def test_overview_availability_panel_uses_recorded_365d_rollup():
|
||||
mod = load_module()
|
||||
dashboard = mod.build_overview()
|
||||
panel = next(panel for panel in flatten_panels(dashboard["panels"]) if panel["id"] == 27)
|
||||
|
||||
assert panel["title"] == "Atlas Availability (30d)"
|
||||
assert "[30d:1h]" in panel["targets"][0]["expr"]
|
||||
assert "365d" not in panel["targets"][0]["expr"]
|
||||
assert panel["title"] == "Atlas Availability (365d)"
|
||||
assert panel["targets"][0]["expr"] == 'atlas:availability:ratio_365d{scope="atlas"}'
|
||||
assert panel["targets"][0]["instant"] is True
|
||||
assert "pre-metric history" in panel["description"]
|
||||
assert "precomputed" in panel["description"]
|
||||
|
||||
|
||||
def test_render_configmap_writes(tmp_path):
|
||||
|
||||
@ -213,7 +213,7 @@
|
||||
{
|
||||
"id": 27,
|
||||
"type": "stat",
|
||||
"title": "Atlas Availability (30d)",
|
||||
"title": "Atlas Availability (365d)",
|
||||
"datasource": {
|
||||
"type": "prometheus",
|
||||
"uid": "atlas-vm"
|
||||
@ -226,7 +226,7 @@
|
||||
},
|
||||
"targets": [
|
||||
{
|
||||
"expr": "avg_over_time((min(((sum(kube_node_status_condition{condition=\"Ready\",status=\"true\",node=~\"titan-0a|titan-0b|titan-0c\"}) / 3)), ((sum(kube_deployment_status_replicas_available{namespace=~\"traefik|kube-system\",deployment=\"traefik\"}) / clamp_min(sum(kube_deployment_spec_replicas{namespace=~\"traefik|kube-system\",deployment=\"traefik\"}), 1)))))[30d:1h])",
|
||||
"expr": "atlas:availability:ratio_365d{scope=\"atlas\"}",
|
||||
"refId": "A",
|
||||
"instant": true
|
||||
}
|
||||
@ -283,7 +283,7 @@
|
||||
},
|
||||
"textMode": "value"
|
||||
},
|
||||
"description": "Rolling 30-day availability from control-plane readiness and Traefik readiness. Kept to 30d so missing pre-metric history is not counted as downtime."
|
||||
"description": "Rolling 365-day availability from vmalert's precomputed atlas:availability:ratio_365d series. vmalert evaluates the larger rollup in the background so Grafana does not recompute a year of raw samples on every dashboard load."
|
||||
},
|
||||
{
|
||||
"id": 4,
|
||||
|
||||
@ -222,7 +222,7 @@ data:
|
||||
{
|
||||
"id": 27,
|
||||
"type": "stat",
|
||||
"title": "Atlas Availability (30d)",
|
||||
"title": "Atlas Availability (365d)",
|
||||
"datasource": {
|
||||
"type": "prometheus",
|
||||
"uid": "atlas-vm"
|
||||
@ -235,7 +235,7 @@ data:
|
||||
},
|
||||
"targets": [
|
||||
{
|
||||
"expr": "avg_over_time((min(((sum(kube_node_status_condition{condition=\"Ready\",status=\"true\",node=~\"titan-0a|titan-0b|titan-0c\"}) / 3)), ((sum(kube_deployment_status_replicas_available{namespace=~\"traefik|kube-system\",deployment=\"traefik\"}) / clamp_min(sum(kube_deployment_spec_replicas{namespace=~\"traefik|kube-system\",deployment=\"traefik\"}), 1)))))[30d:1h])",
|
||||
"expr": "atlas:availability:ratio_365d{scope=\"atlas\"}",
|
||||
"refId": "A",
|
||||
"instant": true
|
||||
}
|
||||
@ -292,7 +292,7 @@ data:
|
||||
},
|
||||
"textMode": "value"
|
||||
},
|
||||
"description": "Rolling 30-day availability from control-plane readiness and Traefik readiness. Kept to 30d so missing pre-metric history is not counted as downtime."
|
||||
"description": "Rolling 365-day availability from vmalert's precomputed atlas:availability:ratio_365d series. vmalert evaluates the larger rollup in the background so Grafana does not recompute a year of raw samples on every dashboard load."
|
||||
},
|
||||
{
|
||||
"id": 4,
|
||||
|
||||
@ -17,6 +17,7 @@ resources:
|
||||
- grafana-dashboard-mail.yaml
|
||||
- grafana-dashboard-jobs.yaml
|
||||
- grafana-dashboard-testing.yaml
|
||||
- vmalert-atlas-availability.yaml
|
||||
- dcgm-exporter.yaml
|
||||
- jetson-tegrastats-exporter.yaml
|
||||
- postmark-exporter-service.yaml
|
||||
|
||||
141
services/monitoring/vmalert-atlas-availability.yaml
Normal file
141
services/monitoring/vmalert-atlas-availability.yaml
Normal file
@ -0,0 +1,141 @@
|
||||
# services/monitoring/vmalert-atlas-availability.yaml
|
||||
apiVersion: v1
|
||||
kind: ConfigMap
|
||||
metadata:
|
||||
name: vmalert-atlas-availability-rules
|
||||
namespace: monitoring
|
||||
data:
|
||||
atlas-availability.yaml: |
|
||||
groups:
|
||||
- name: atlas.availability
|
||||
interval: 15m
|
||||
rules:
|
||||
- record: atlas:availability:ratio_1h
|
||||
expr: |
|
||||
avg_over_time((
|
||||
min(
|
||||
(
|
||||
sum(kube_node_status_condition{condition="Ready",status="true",node=~"titan-0a|titan-0b|titan-0c"})
|
||||
/ 3
|
||||
),
|
||||
(
|
||||
sum(kube_deployment_status_replicas_available{namespace=~"traefik|kube-system",deployment="traefik"})
|
||||
/ clamp_min(sum(kube_deployment_spec_replicas{namespace=~"traefik|kube-system",deployment="traefik"}), 1)
|
||||
)
|
||||
)
|
||||
)[1h:5m])
|
||||
labels:
|
||||
scope: atlas
|
||||
rollup: hourly
|
||||
- record: atlas:availability:ratio_365d
|
||||
expr: |
|
||||
avg_over_time((
|
||||
min(
|
||||
(
|
||||
sum(kube_node_status_condition{condition="Ready",status="true",node=~"titan-0a|titan-0b|titan-0c"})
|
||||
/ 3
|
||||
),
|
||||
(
|
||||
sum(kube_deployment_status_replicas_available{namespace=~"traefik|kube-system",deployment="traefik"})
|
||||
/ clamp_min(sum(kube_deployment_spec_replicas{namespace=~"traefik|kube-system",deployment="traefik"}), 1)
|
||||
)
|
||||
)
|
||||
)[365d:6h])
|
||||
labels:
|
||||
scope: atlas
|
||||
rollup: yearly
|
||||
|
||||
---
|
||||
|
||||
apiVersion: v1
|
||||
kind: ServiceAccount
|
||||
metadata:
|
||||
name: vmalert-atlas-availability
|
||||
namespace: monitoring
|
||||
|
||||
---
|
||||
|
||||
apiVersion: apps/v1
|
||||
kind: Deployment
|
||||
metadata:
|
||||
name: vmalert-atlas-availability
|
||||
namespace: monitoring
|
||||
labels:
|
||||
app: vmalert-atlas-availability
|
||||
spec:
|
||||
replicas: 1
|
||||
revisionHistoryLimit: 3
|
||||
selector:
|
||||
matchLabels:
|
||||
app: vmalert-atlas-availability
|
||||
template:
|
||||
metadata:
|
||||
labels:
|
||||
app: vmalert-atlas-availability
|
||||
spec:
|
||||
serviceAccountName: vmalert-atlas-availability
|
||||
affinity:
|
||||
nodeAffinity:
|
||||
requiredDuringSchedulingIgnoredDuringExecution:
|
||||
nodeSelectorTerms:
|
||||
- matchExpressions:
|
||||
- key: kubernetes.io/hostname
|
||||
operator: NotIn
|
||||
values:
|
||||
- titan-22
|
||||
- titan-24
|
||||
containers:
|
||||
- name: vmalert
|
||||
image: victoriametrics/vmalert:v1.113.0
|
||||
args:
|
||||
- -datasource.url=http://victoria-metrics-single-server:8428
|
||||
- -remoteWrite.url=http://victoria-metrics-single-server:8428/api/v1/write
|
||||
- -rule=/etc/vmalert/rules/*.yaml
|
||||
- -evaluationInterval=15m
|
||||
- -httpListenAddr=:8880
|
||||
ports:
|
||||
- name: http
|
||||
containerPort: 8880
|
||||
readinessProbe:
|
||||
tcpSocket:
|
||||
port: http
|
||||
initialDelaySeconds: 5
|
||||
periodSeconds: 10
|
||||
livenessProbe:
|
||||
tcpSocket:
|
||||
port: http
|
||||
initialDelaySeconds: 20
|
||||
periodSeconds: 30
|
||||
resources:
|
||||
requests:
|
||||
cpu: 25m
|
||||
memory: 64Mi
|
||||
limits:
|
||||
cpu: 500m
|
||||
memory: 256Mi
|
||||
volumeMounts:
|
||||
- name: rules
|
||||
mountPath: /etc/vmalert/rules
|
||||
readOnly: true
|
||||
volumes:
|
||||
- name: rules
|
||||
configMap:
|
||||
name: vmalert-atlas-availability-rules
|
||||
|
||||
---
|
||||
|
||||
apiVersion: v1
|
||||
kind: Service
|
||||
metadata:
|
||||
name: vmalert-atlas-availability
|
||||
namespace: monitoring
|
||||
annotations:
|
||||
prometheus.io/scrape: "true"
|
||||
prometheus.io/port: "8880"
|
||||
spec:
|
||||
selector:
|
||||
app: vmalert-atlas-availability
|
||||
ports:
|
||||
- name: http
|
||||
port: 8880
|
||||
targetPort: http
|
||||
Loading…
x
Reference in New Issue
Block a user