monitoring: precompute atlas availability rollup
This commit is contained in:
parent
32ffe30145
commit
7f7dde01de
@ -312,9 +312,9 @@ STUCK_TERMINATING_EXPR = (
|
|||||||
')) '
|
')) '
|
||||||
"or on() vector(0)"
|
"or on() vector(0)"
|
||||||
)
|
)
|
||||||
UPTIME_WINDOW = "30d"
|
UPTIME_WINDOW = "365d"
|
||||||
# Keep the subquery step coarse so we don't request an excessive number of points.
|
# vmalert precomputes the expensive long-window rollup so Grafana only reads one compact series.
|
||||||
UPTIME_STEP = "1h"
|
UPTIME_RECORDING_EXPR = f'atlas:availability:ratio_{UPTIME_WINDOW}{{scope="atlas"}}'
|
||||||
TRAEFIK_READY_EXPR = (
|
TRAEFIK_READY_EXPR = (
|
||||||
"("
|
"("
|
||||||
'sum(kube_deployment_status_replicas_available{namespace=~"traefik|kube-system",deployment="traefik"})'
|
'sum(kube_deployment_status_replicas_available{namespace=~"traefik|kube-system",deployment="traefik"})'
|
||||||
@ -335,7 +335,7 @@ NODE_TIEBREAKER = " + ".join(
|
|||||||
f"({node_filter(node)}) * 1e-6 * {idx}"
|
f"({node_filter(node)}) * 1e-6 * {idx}"
|
||||||
for idx, node in enumerate(CONTROL_ALL + WORKER_NODES, start=1)
|
for idx, node in enumerate(CONTROL_ALL + WORKER_NODES, start=1)
|
||||||
)
|
)
|
||||||
UPTIME_AVG_EXPR = f"avg_over_time(({UPTIME_AVAIL_EXPR})[{UPTIME_WINDOW}:{UPTIME_STEP}])"
|
UPTIME_AVG_EXPR = UPTIME_RECORDING_EXPR
|
||||||
UPTIME_PERCENT_EXPR = UPTIME_AVG_EXPR
|
UPTIME_PERCENT_EXPR = UPTIME_AVG_EXPR
|
||||||
UPTIME_NINES_EXPR = f"-log10(1 - clamp_max({UPTIME_AVG_EXPR}, 0.999999999))"
|
UPTIME_NINES_EXPR = f"-log10(1 - clamp_max({UPTIME_AVG_EXPR}, 0.999999999))"
|
||||||
UPTIME_THRESHOLDS = {
|
UPTIME_THRESHOLDS = {
|
||||||
@ -1433,7 +1433,7 @@ def build_overview():
|
|||||||
},
|
},
|
||||||
{
|
{
|
||||||
"id": 27,
|
"id": 27,
|
||||||
"title": "Atlas Availability (30d)",
|
"title": "Atlas Availability (365d)",
|
||||||
"expr": UPTIME_PERCENT_EXPR,
|
"expr": UPTIME_PERCENT_EXPR,
|
||||||
"kind": "stat",
|
"kind": "stat",
|
||||||
"thresholds": UPTIME_PERCENT_THRESHOLDS,
|
"thresholds": UPTIME_PERCENT_THRESHOLDS,
|
||||||
@ -1441,7 +1441,7 @@ def build_overview():
|
|||||||
"decimals": 4,
|
"decimals": 4,
|
||||||
"text_mode": "value",
|
"text_mode": "value",
|
||||||
"instant": True,
|
"instant": True,
|
||||||
"description": "Rolling 30-day availability from control-plane readiness and Traefik readiness. Kept to 30d so missing pre-metric history is not counted as downtime.",
|
"description": "Rolling 365-day availability from vmalert's precomputed atlas:availability:ratio_365d series. vmalert evaluates the larger rollup in the background so Grafana does not recompute a year of raw samples on every dashboard load.",
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"id": 4,
|
"id": 4,
|
||||||
|
|||||||
@ -50,16 +50,15 @@ def test_node_filter_and_expr_helpers():
|
|||||||
assert "node_memory_MemAvailable_bytes" in mem_expr
|
assert "node_memory_MemAvailable_bytes" in mem_expr
|
||||||
|
|
||||||
|
|
||||||
def test_overview_availability_panel_is_recent_and_instant():
|
def test_overview_availability_panel_uses_recorded_365d_rollup():
|
||||||
mod = load_module()
|
mod = load_module()
|
||||||
dashboard = mod.build_overview()
|
dashboard = mod.build_overview()
|
||||||
panel = next(panel for panel in flatten_panels(dashboard["panels"]) if panel["id"] == 27)
|
panel = next(panel for panel in flatten_panels(dashboard["panels"]) if panel["id"] == 27)
|
||||||
|
|
||||||
assert panel["title"] == "Atlas Availability (30d)"
|
assert panel["title"] == "Atlas Availability (365d)"
|
||||||
assert "[30d:1h]" in panel["targets"][0]["expr"]
|
assert panel["targets"][0]["expr"] == 'atlas:availability:ratio_365d{scope="atlas"}'
|
||||||
assert "365d" not in panel["targets"][0]["expr"]
|
|
||||||
assert panel["targets"][0]["instant"] is True
|
assert panel["targets"][0]["instant"] is True
|
||||||
assert "pre-metric history" in panel["description"]
|
assert "precomputed" in panel["description"]
|
||||||
|
|
||||||
|
|
||||||
def test_render_configmap_writes(tmp_path):
|
def test_render_configmap_writes(tmp_path):
|
||||||
|
|||||||
@ -213,7 +213,7 @@
|
|||||||
{
|
{
|
||||||
"id": 27,
|
"id": 27,
|
||||||
"type": "stat",
|
"type": "stat",
|
||||||
"title": "Atlas Availability (30d)",
|
"title": "Atlas Availability (365d)",
|
||||||
"datasource": {
|
"datasource": {
|
||||||
"type": "prometheus",
|
"type": "prometheus",
|
||||||
"uid": "atlas-vm"
|
"uid": "atlas-vm"
|
||||||
@ -226,7 +226,7 @@
|
|||||||
},
|
},
|
||||||
"targets": [
|
"targets": [
|
||||||
{
|
{
|
||||||
"expr": "avg_over_time((min(((sum(kube_node_status_condition{condition=\"Ready\",status=\"true\",node=~\"titan-0a|titan-0b|titan-0c\"}) / 3)), ((sum(kube_deployment_status_replicas_available{namespace=~\"traefik|kube-system\",deployment=\"traefik\"}) / clamp_min(sum(kube_deployment_spec_replicas{namespace=~\"traefik|kube-system\",deployment=\"traefik\"}), 1)))))[30d:1h])",
|
"expr": "atlas:availability:ratio_365d{scope=\"atlas\"}",
|
||||||
"refId": "A",
|
"refId": "A",
|
||||||
"instant": true
|
"instant": true
|
||||||
}
|
}
|
||||||
@ -283,7 +283,7 @@
|
|||||||
},
|
},
|
||||||
"textMode": "value"
|
"textMode": "value"
|
||||||
},
|
},
|
||||||
"description": "Rolling 30-day availability from control-plane readiness and Traefik readiness. Kept to 30d so missing pre-metric history is not counted as downtime."
|
"description": "Rolling 365-day availability from vmalert's precomputed atlas:availability:ratio_365d series. vmalert evaluates the larger rollup in the background so Grafana does not recompute a year of raw samples on every dashboard load."
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"id": 4,
|
"id": 4,
|
||||||
|
|||||||
@ -222,7 +222,7 @@ data:
|
|||||||
{
|
{
|
||||||
"id": 27,
|
"id": 27,
|
||||||
"type": "stat",
|
"type": "stat",
|
||||||
"title": "Atlas Availability (30d)",
|
"title": "Atlas Availability (365d)",
|
||||||
"datasource": {
|
"datasource": {
|
||||||
"type": "prometheus",
|
"type": "prometheus",
|
||||||
"uid": "atlas-vm"
|
"uid": "atlas-vm"
|
||||||
@ -235,7 +235,7 @@ data:
|
|||||||
},
|
},
|
||||||
"targets": [
|
"targets": [
|
||||||
{
|
{
|
||||||
"expr": "avg_over_time((min(((sum(kube_node_status_condition{condition=\"Ready\",status=\"true\",node=~\"titan-0a|titan-0b|titan-0c\"}) / 3)), ((sum(kube_deployment_status_replicas_available{namespace=~\"traefik|kube-system\",deployment=\"traefik\"}) / clamp_min(sum(kube_deployment_spec_replicas{namespace=~\"traefik|kube-system\",deployment=\"traefik\"}), 1)))))[30d:1h])",
|
"expr": "atlas:availability:ratio_365d{scope=\"atlas\"}",
|
||||||
"refId": "A",
|
"refId": "A",
|
||||||
"instant": true
|
"instant": true
|
||||||
}
|
}
|
||||||
@ -292,7 +292,7 @@ data:
|
|||||||
},
|
},
|
||||||
"textMode": "value"
|
"textMode": "value"
|
||||||
},
|
},
|
||||||
"description": "Rolling 30-day availability from control-plane readiness and Traefik readiness. Kept to 30d so missing pre-metric history is not counted as downtime."
|
"description": "Rolling 365-day availability from vmalert's precomputed atlas:availability:ratio_365d series. vmalert evaluates the larger rollup in the background so Grafana does not recompute a year of raw samples on every dashboard load."
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"id": 4,
|
"id": 4,
|
||||||
|
|||||||
@ -17,6 +17,7 @@ resources:
|
|||||||
- grafana-dashboard-mail.yaml
|
- grafana-dashboard-mail.yaml
|
||||||
- grafana-dashboard-jobs.yaml
|
- grafana-dashboard-jobs.yaml
|
||||||
- grafana-dashboard-testing.yaml
|
- grafana-dashboard-testing.yaml
|
||||||
|
- vmalert-atlas-availability.yaml
|
||||||
- dcgm-exporter.yaml
|
- dcgm-exporter.yaml
|
||||||
- jetson-tegrastats-exporter.yaml
|
- jetson-tegrastats-exporter.yaml
|
||||||
- postmark-exporter-service.yaml
|
- postmark-exporter-service.yaml
|
||||||
|
|||||||
141
services/monitoring/vmalert-atlas-availability.yaml
Normal file
141
services/monitoring/vmalert-atlas-availability.yaml
Normal file
@ -0,0 +1,141 @@
|
|||||||
|
# services/monitoring/vmalert-atlas-availability.yaml
#
# Recording rules loaded by vmalert from /etc/vmalert/rules/*.yaml.
#
# Both rules average the same instantaneous availability signal:
#   min( Ready control-plane nodes / 3,
#        available Traefik replicas / max(desired Traefik replicas, 1) )
# The divisor 3 matches the three nodes named in the node=~ matcher; clamp_min
# keeps the Traefik ratio defined when the desired replica count is 0.
#
#   atlas:availability:ratio_1h    - last hour, sampled every 5m
#   atlas:availability:ratio_365d  - last 365 days, sampled every 6h; this is the
#                                    series the overview "Atlas Availability (365d)"
#                                    panel reads with {scope="atlas"}
apiVersion: v1
kind: ConfigMap
metadata:
  name: vmalert-atlas-availability-rules
  namespace: monitoring
data:
  atlas-availability.yaml: |
    groups:
      - name: atlas.availability
        interval: 15m
        rules:
          - record: atlas:availability:ratio_1h
            expr: |
              avg_over_time((
                min(
                  (
                    sum(kube_node_status_condition{condition="Ready",status="true",node=~"titan-0a|titan-0b|titan-0c"})
                    / 3
                  ),
                  (
                    sum(kube_deployment_status_replicas_available{namespace=~"traefik|kube-system",deployment="traefik"})
                    / clamp_min(sum(kube_deployment_spec_replicas{namespace=~"traefik|kube-system",deployment="traefik"}), 1)
                  )
                )
              )[1h:5m])
            labels:
              scope: atlas
              rollup: hourly
          - record: atlas:availability:ratio_365d
            expr: |
              avg_over_time((
                min(
                  (
                    sum(kube_node_status_condition{condition="Ready",status="true",node=~"titan-0a|titan-0b|titan-0c"})
                    / 3
                  ),
                  (
                    sum(kube_deployment_status_replicas_available{namespace=~"traefik|kube-system",deployment="traefik"})
                    / clamp_min(sum(kube_deployment_spec_replicas{namespace=~"traefik|kube-system",deployment="traefik"}), 1)
                  )
                )
              )[365d:6h])
            labels:
              scope: atlas
              rollup: yearly
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
# Identity referenced by serviceAccountName in the vmalert-atlas-availability Deployment.
apiVersion: v1
kind: ServiceAccount
metadata:
  name: vmalert-atlas-availability
  namespace: monitoring
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
# Single-replica vmalert that evaluates the rules mounted from the
# vmalert-atlas-availability-rules ConfigMap. It queries
# victoria-metrics-single-server:8428 and writes the recorded series back to
# the same server through /api/v1/write.
apiVersion: apps/v1
kind: Deployment
metadata:
  name: vmalert-atlas-availability
  namespace: monitoring
  labels:
    app: vmalert-atlas-availability
spec:
  replicas: 1
  revisionHistoryLimit: 3
  selector:
    matchLabels:
      app: vmalert-atlas-availability
  template:
    metadata:
      labels:
        app: vmalert-atlas-availability
    spec:
      serviceAccountName: vmalert-atlas-availability
      affinity:
        nodeAffinity:
          # Hard requirement: never schedule onto titan-22 or titan-24.
          # NOTE(review): the reason for excluding these hosts is not recorded here - confirm.
          requiredDuringSchedulingIgnoredDuringExecution:
            nodeSelectorTerms:
              - matchExpressions:
                  - key: kubernetes.io/hostname
                    operator: NotIn
                    values:
                      - titan-22
                      - titan-24
      containers:
        - name: vmalert
          image: victoriametrics/vmalert:v1.113.0
          args:
            - -datasource.url=http://victoria-metrics-single-server:8428
            - -remoteWrite.url=http://victoria-metrics-single-server:8428/api/v1/write
            - -rule=/etc/vmalert/rules/*.yaml
            # Same 15m cadence as the group-level interval in the rules file.
            - -evaluationInterval=15m
            - -httpListenAddr=:8880
          ports:
            - name: http
              containerPort: 8880
          # Both probes only check that the HTTP listener accepts TCP connections.
          readinessProbe:
            tcpSocket:
              port: http
            initialDelaySeconds: 5
            periodSeconds: 10
          livenessProbe:
            tcpSocket:
              port: http
            initialDelaySeconds: 20
            periodSeconds: 30
          resources:
            requests:
              cpu: 25m
              memory: 64Mi
            limits:
              cpu: 500m
              memory: 256Mi
          volumeMounts:
            - name: rules
              mountPath: /etc/vmalert/rules
              readOnly: true
      volumes:
        - name: rules
          configMap:
            name: vmalert-atlas-availability-rules
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
# Exposes the vmalert HTTP listener (port 8880) inside the cluster.
# NOTE(review): the prometheus.io/* annotations presumably opt this Service into
# annotation-based scrape discovery - confirm against the scrape configuration.
apiVersion: v1
kind: Service
metadata:
  name: vmalert-atlas-availability
  namespace: monitoring
  annotations:
    prometheus.io/scrape: "true"
    prometheus.io/port: "8880"
spec:
  selector:
    app: vmalert-atlas-availability
  ports:
    - name: http
      port: 8880
      targetPort: http
|
||||||
Loading…
x
Reference in New Issue
Block a user