diff --git a/scripts/dashboards_render_atlas.py b/scripts/dashboards_render_atlas.py
index e97851a7..45bc9dad 100644
--- a/scripts/dashboards_render_atlas.py
+++ b/scripts/dashboards_render_atlas.py
@@ -312,9 +312,9 @@ STUCK_TERMINATING_EXPR = (
     ')) '
     "or on() vector(0)"
 )
-UPTIME_WINDOW = "30d"
-# Keep the subquery step coarse so we don't request an excessive number of points.
-UPTIME_STEP = "1h"
+UPTIME_WINDOW = "365d"
+# vmalert precomputes the expensive long-window rollup so Grafana only reads one compact series.
+UPTIME_RECORDING_EXPR = f'atlas:availability:ratio_{UPTIME_WINDOW}{{scope="atlas"}}'
 TRAEFIK_READY_EXPR = (
     "("
     'sum(kube_deployment_status_replicas_available{namespace=~"traefik|kube-system",deployment="traefik"})'
@@ -335,7 +335,7 @@ NODE_TIEBREAKER = " + ".join(
     f"({node_filter(node)}) * 1e-6 * {idx}"
     for idx, node in enumerate(CONTROL_ALL + WORKER_NODES, start=1)
 )
-UPTIME_AVG_EXPR = f"avg_over_time(({UPTIME_AVAIL_EXPR})[{UPTIME_WINDOW}:{UPTIME_STEP}])"
+UPTIME_AVG_EXPR = UPTIME_RECORDING_EXPR
 UPTIME_PERCENT_EXPR = UPTIME_AVG_EXPR
 UPTIME_NINES_EXPR = f"-log10(1 - clamp_max({UPTIME_AVG_EXPR}, 0.999999999))"
 UPTIME_THRESHOLDS = {
@@ -1433,7 +1433,7 @@ def build_overview():
         },
         {
             "id": 27,
-            "title": "Atlas Availability (30d)",
+            "title": "Atlas Availability (365d)",
             "expr": UPTIME_PERCENT_EXPR,
             "kind": "stat",
             "thresholds": UPTIME_PERCENT_THRESHOLDS,
@@ -1441,7 +1441,7 @@ def build_overview():
             "decimals": 4,
             "text_mode": "value",
             "instant": True,
-            "description": "Rolling 30-day availability from control-plane readiness and Traefik readiness. Kept to 30d so missing pre-metric history is not counted as downtime.",
+            "description": "Rolling 365-day availability from vmalert's precomputed atlas:availability:ratio_365d series. vmalert evaluates the larger rollup in the background so Grafana does not recompute a year of raw samples on every dashboard load.",
         },
         {
             "id": 4,
diff --git a/scripts/tests/test_dashboards_render_atlas.py b/scripts/tests/test_dashboards_render_atlas.py
index 4cc71938..2c806623 100644
--- a/scripts/tests/test_dashboards_render_atlas.py
+++ b/scripts/tests/test_dashboards_render_atlas.py
@@ -50,16 +50,15 @@ def test_node_filter_and_expr_helpers():
     assert "node_memory_MemAvailable_bytes" in mem_expr
 
 
-def test_overview_availability_panel_is_recent_and_instant():
+def test_overview_availability_panel_uses_recorded_365d_rollup():
     mod = load_module()
     dashboard = mod.build_overview()
     panel = next(panel for panel in flatten_panels(dashboard["panels"]) if panel["id"] == 27)
-    assert panel["title"] == "Atlas Availability (30d)"
-    assert "[30d:1h]" in panel["targets"][0]["expr"]
-    assert "365d" not in panel["targets"][0]["expr"]
+    assert panel["title"] == "Atlas Availability (365d)"
+    assert panel["targets"][0]["expr"] == 'atlas:availability:ratio_365d{scope="atlas"}'
     assert panel["targets"][0]["instant"] is True
-    assert "pre-metric history" in panel["description"]
+    assert "precomputed" in panel["description"]
 
 
 def test_render_configmap_writes(tmp_path):
diff --git a/services/monitoring/dashboards/atlas-overview.json b/services/monitoring/dashboards/atlas-overview.json
index 933bc621..e9226de3 100644
--- a/services/monitoring/dashboards/atlas-overview.json
+++ b/services/monitoring/dashboards/atlas-overview.json
@@ -213,7 +213,7 @@
     {
       "id": 27,
       "type": "stat",
-      "title": "Atlas Availability (30d)",
+      "title": "Atlas Availability (365d)",
       "datasource": {
         "type": "prometheus",
         "uid": "atlas-vm"
@@ -226,7 +226,7 @@
       },
       "targets": [
        {
-          "expr": "avg_over_time((min(((sum(kube_node_status_condition{condition=\"Ready\",status=\"true\",node=~\"titan-0a|titan-0b|titan-0c\"}) / 3)), ((sum(kube_deployment_status_replicas_available{namespace=~\"traefik|kube-system\",deployment=\"traefik\"}) / clamp_min(sum(kube_deployment_spec_replicas{namespace=~\"traefik|kube-system\",deployment=\"traefik\"}), 1)))))[30d:1h])",
+          "expr": "atlas:availability:ratio_365d{scope=\"atlas\"}",
          "refId": "A",
          "instant": true
        }
@@ -283,7 +283,7 @@
        },
        "textMode": "value"
      },
-      "description": "Rolling 30-day availability from control-plane readiness and Traefik readiness. Kept to 30d so missing pre-metric history is not counted as downtime."
+      "description": "Rolling 365-day availability from vmalert's precomputed atlas:availability:ratio_365d series. vmalert evaluates the larger rollup in the background so Grafana does not recompute a year of raw samples on every dashboard load."
     },
     {
       "id": 4,
diff --git a/services/monitoring/grafana-dashboard-overview.yaml b/services/monitoring/grafana-dashboard-overview.yaml
index c7aab46e..c28addae 100644
--- a/services/monitoring/grafana-dashboard-overview.yaml
+++ b/services/monitoring/grafana-dashboard-overview.yaml
@@ -222,7 +222,7 @@ data:
         {
           "id": 27,
           "type": "stat",
-          "title": "Atlas Availability (30d)",
+          "title": "Atlas Availability (365d)",
           "datasource": {
             "type": "prometheus",
             "uid": "atlas-vm"
@@ -235,7 +235,7 @@ data:
           },
           "targets": [
            {
-              "expr": "avg_over_time((min(((sum(kube_node_status_condition{condition=\"Ready\",status=\"true\",node=~\"titan-0a|titan-0b|titan-0c\"}) / 3)), ((sum(kube_deployment_status_replicas_available{namespace=~\"traefik|kube-system\",deployment=\"traefik\"}) / clamp_min(sum(kube_deployment_spec_replicas{namespace=~\"traefik|kube-system\",deployment=\"traefik\"}), 1)))))[30d:1h])",
+              "expr": "atlas:availability:ratio_365d{scope=\"atlas\"}",
              "refId": "A",
              "instant": true
            }
@@ -292,7 +292,7 @@ data:
            },
            "textMode": "value"
          },
-          "description": "Rolling 30-day availability from control-plane readiness and Traefik readiness. Kept to 30d so missing pre-metric history is not counted as downtime."
+          "description": "Rolling 365-day availability from vmalert's precomputed atlas:availability:ratio_365d series. vmalert evaluates the larger rollup in the background so Grafana does not recompute a year of raw samples on every dashboard load."
         },
         {
           "id": 4,
diff --git a/services/monitoring/kustomization.yaml b/services/monitoring/kustomization.yaml
index f6c7ab9d..9b8ec822 100644
--- a/services/monitoring/kustomization.yaml
+++ b/services/monitoring/kustomization.yaml
@@ -17,6 +17,7 @@ resources:
 - grafana-dashboard-mail.yaml
 - grafana-dashboard-jobs.yaml
 - grafana-dashboard-testing.yaml
+- vmalert-atlas-availability.yaml
 - dcgm-exporter.yaml
 - jetson-tegrastats-exporter.yaml
 - postmark-exporter-service.yaml
diff --git a/services/monitoring/vmalert-atlas-availability.yaml b/services/monitoring/vmalert-atlas-availability.yaml
new file mode 100644
index 00000000..18925d7f
--- /dev/null
+++ b/services/monitoring/vmalert-atlas-availability.yaml
@@ -0,0 +1,130 @@
+# services/monitoring/vmalert-atlas-availability.yaml
+# Dedicated vmalert instance that records Atlas availability rollups so the
+# Grafana overview dashboard reads one compact series instead of recomputing
+# a year of raw readiness samples on every load.
+apiVersion: v1
+kind: ConfigMap
+metadata:
+  name: vmalert-atlas-availability-rules
+  namespace: monitoring
+data:
+  atlas-availability.yaml: |
+    groups:
+      - name: atlas.availability
+        interval: 15m
+        rules:
+          # Hourly availability: min of control-plane readiness (3 expected
+          # ready titan-0* nodes) and Traefik replica readiness, averaged
+          # over the last hour at a 5m subquery step.
+          - record: atlas:availability:ratio_1h
+            expr: |
+              avg_over_time((
+                min(
+                  (
+                    sum(kube_node_status_condition{condition="Ready",status="true",node=~"titan-0a|titan-0b|titan-0c"})
+                    / 3
+                  ),
+                  (
+                    sum(kube_deployment_status_replicas_available{namespace=~"traefik|kube-system",deployment="traefik"})
+                    / clamp_min(sum(kube_deployment_spec_replicas{namespace=~"traefik|kube-system",deployment="traefik"}), 1)
+                  )
+                )
+              )[1h:5m])
+            labels:
+              scope: atlas
+              rollup: hourly
+          # Yearly availability: average the already-recorded hourly rollup
+          # instead of re-scanning 365d of raw samples every evaluation —
+          # that cheap rollup-of-rollup is the whole point of this vmalert.
+          - record: atlas:availability:ratio_365d
+            expr: avg_over_time(atlas:availability:ratio_1h{scope="atlas"}[365d])
+            labels:
+              scope: atlas
+              rollup: yearly
+
+---
+
+apiVersion: v1
+kind: ServiceAccount
+metadata:
+  name: vmalert-atlas-availability
+  namespace: monitoring
+
+---
+
+apiVersion: apps/v1
+kind: Deployment
+metadata:
+  name: vmalert-atlas-availability
+  namespace: monitoring
+  labels:
+    app: vmalert-atlas-availability
+spec:
+  replicas: 1
+  revisionHistoryLimit: 3
+  selector:
+    matchLabels:
+      app: vmalert-atlas-availability
+  template:
+    metadata:
+      labels:
+        app: vmalert-atlas-availability
+    spec:
+      serviceAccountName: vmalert-atlas-availability
+      affinity:
+        nodeAffinity:
+          requiredDuringSchedulingIgnoredDuringExecution:
+            nodeSelectorTerms:
+              - matchExpressions:
+                  - key: kubernetes.io/hostname
+                    operator: NotIn
+                    values:
+                      - titan-22
+                      - titan-24
+      containers:
+        - name: vmalert
+          image: victoriametrics/vmalert:v1.113.0
+          args:
+            - -datasource.url=http://victoria-metrics-single-server:8428
+            - -remoteWrite.url=http://victoria-metrics-single-server:8428/api/v1/write
+            - -rule=/etc/vmalert/rules/*.yaml
+            - -evaluationInterval=15m
+            - -httpListenAddr=:8880
+          ports:
+            - name: http
+              containerPort: 8880
+          readinessProbe:
+            tcpSocket:
+              port: http
+            initialDelaySeconds: 5
+            periodSeconds: 10
+          livenessProbe:
+            tcpSocket:
+              port: http
+            initialDelaySeconds: 20
+            periodSeconds: 30
+          resources:
+            requests:
+              cpu: 25m
+              memory: 64Mi
+            limits:
+              cpu: 500m
+              memory: 256Mi
+          volumeMounts:
+            - name: rules
+              mountPath: /etc/vmalert/rules
+              readOnly: true
+      volumes:
+        - name: rules
+          configMap:
+            name: vmalert-atlas-availability-rules
+
+---
+
+apiVersion: v1
+kind: Service
+metadata:
+  name: vmalert-atlas-availability
+  namespace: monitoring
+  annotations:
+    prometheus.io/scrape: "true"
+    prometheus.io/port: "8880"
+spec:
+  selector:
+    app: vmalert-atlas-availability
+  ports:
+    - name: http
+      port: 8880
+      targetPort: http