monitoring: precompute atlas availability rollup

2026-05-10 15:40:12 -03:00 · 2026-05-10 15:40:12 -03:00 · 7f7dde01de
commit 7f7dde01de
parent 32ffe30145
6 changed files with 158 additions and 17 deletions
--- a/scripts/dashboards_render_atlas.py
+++ b/scripts/dashboards_render_atlas.py
@@ -312,9 +312,9 @@ STUCK_TERMINATING_EXPR = (
    ')) '
    "or on() vector(0)"
 )
-UPTIME_WINDOW = "30d"
-# Keep the subquery step coarse so we don't request an excessive number of points.
-UPTIME_STEP = "1h"
+UPTIME_WINDOW = "365d"
+# vmalert precomputes the expensive long-window rollup so Grafana only reads one compact series.
+UPTIME_RECORDING_EXPR = f'atlas:availability:ratio_{UPTIME_WINDOW}{{scope="atlas"}}'
 TRAEFIK_READY_EXPR = (
    "("
    'sum(kube_deployment_status_replicas_available{namespace=~"traefik|kube-system",deployment="traefik"})'
@@ -335,7 +335,7 @@ NODE_TIEBREAKER = " + ".join(
    f"({node_filter(node)}) * 1e-6 * {idx}"
    for idx, node in enumerate(CONTROL_ALL + WORKER_NODES, start=1)
 )
-UPTIME_AVG_EXPR = f"avg_over_time(({UPTIME_AVAIL_EXPR})[{UPTIME_WINDOW}:{UPTIME_STEP}])"
+UPTIME_AVG_EXPR = UPTIME_RECORDING_EXPR
 UPTIME_PERCENT_EXPR = UPTIME_AVG_EXPR
 UPTIME_NINES_EXPR = f"-log10(1 - clamp_max({UPTIME_AVG_EXPR}, 0.999999999))"
 UPTIME_THRESHOLDS = {
@@ -1433,7 +1433,7 @@ def build_overview():
        },
        {
            "id": 27,
-            "title": "Atlas Availability (30d)",
+            "title": "Atlas Availability (365d)",
            "expr": UPTIME_PERCENT_EXPR,
            "kind": "stat",
            "thresholds": UPTIME_PERCENT_THRESHOLDS,
@@ -1441,7 +1441,7 @@ def build_overview():
            "decimals": 4,
            "text_mode": "value",
            "instant": True,
-            "description": "Rolling 30-day availability from control-plane readiness and Traefik readiness. Kept to 30d so missing pre-metric history is not counted as downtime.",
+            "description": "Rolling 365-day availability from vmalert's precomputed atlas:availability:ratio_365d series. vmalert evaluates the larger rollup in the background so Grafana does not recompute a year of raw samples on every dashboard load.",
        },
        {
            "id": 4,
--- a/scripts/tests/test_dashboards_render_atlas.py
+++ b/scripts/tests/test_dashboards_render_atlas.py
@@ -50,16 +50,15 @@ def test_node_filter_and_expr_helpers():
    assert "node_memory_MemAvailable_bytes" in mem_expr


-def test_overview_availability_panel_is_recent_and_instant():
+def test_overview_availability_panel_uses_recorded_365d_rollup():
    mod = load_module()
    dashboard = mod.build_overview()
    panel = next(panel for panel in flatten_panels(dashboard["panels"]) if panel["id"] == 27)

-    assert panel["title"] == "Atlas Availability (30d)"
-    assert "[30d:1h]" in panel["targets"][0]["expr"]
-    assert "365d" not in panel["targets"][0]["expr"]
+    assert panel["title"] == "Atlas Availability (365d)"
+    assert panel["targets"][0]["expr"] == 'atlas:availability:ratio_365d{scope="atlas"}'
    assert panel["targets"][0]["instant"] is True
-    assert "pre-metric history" in panel["description"]
+    assert "precomputed" in panel["description"]


 def test_render_configmap_writes(tmp_path):
--- a/services/monitoring/dashboards/atlas-overview.json
+++ b/services/monitoring/dashboards/atlas-overview.json
@@ -213,7 +213,7 @@
    {
      "id": 27,
      "type": "stat",
-      "title": "Atlas Availability (30d)",
+      "title": "Atlas Availability (365d)",
      "datasource": {
        "type": "prometheus",
        "uid": "atlas-vm"
@@ -226,7 +226,7 @@
      },
      "targets": [
        {
-          "expr": "avg_over_time((min(((sum(kube_node_status_condition{condition=\"Ready\",status=\"true\",node=~\"titan-0a|titan-0b|titan-0c\"}) / 3)), ((sum(kube_deployment_status_replicas_available{namespace=~\"traefik|kube-system\",deployment=\"traefik\"}) / clamp_min(sum(kube_deployment_spec_replicas{namespace=~\"traefik|kube-system\",deployment=\"traefik\"}), 1)))))[30d:1h])",
+          "expr": "atlas:availability:ratio_365d{scope=\"atlas\"}",
          "refId": "A",
          "instant": true
        }
@@ -283,7 +283,7 @@
        },
        "textMode": "value"
      },
-      "description": "Rolling 30-day availability from control-plane readiness and Traefik readiness. Kept to 30d so missing pre-metric history is not counted as downtime."
+      "description": "Rolling 365-day availability from vmalert's precomputed atlas:availability:ratio_365d series. vmalert evaluates the larger rollup in the background so Grafana does not recompute a year of raw samples on every dashboard load."
    },
    {
      "id": 4,
--- a/services/monitoring/grafana-dashboard-overview.yaml
+++ b/services/monitoring/grafana-dashboard-overview.yaml
@@ -222,7 +222,7 @@ data:
        {
          "id": 27,
          "type": "stat",
-          "title": "Atlas Availability (30d)",
+          "title": "Atlas Availability (365d)",
          "datasource": {
            "type": "prometheus",
            "uid": "atlas-vm"
@@ -235,7 +235,7 @@ data:
          },
          "targets": [
            {
-              "expr": "avg_over_time((min(((sum(kube_node_status_condition{condition=\"Ready\",status=\"true\",node=~\"titan-0a|titan-0b|titan-0c\"}) / 3)), ((sum(kube_deployment_status_replicas_available{namespace=~\"traefik|kube-system\",deployment=\"traefik\"}) / clamp_min(sum(kube_deployment_spec_replicas{namespace=~\"traefik|kube-system\",deployment=\"traefik\"}), 1)))))[30d:1h])",
+              "expr": "atlas:availability:ratio_365d{scope=\"atlas\"}",
              "refId": "A",
              "instant": true
            }
@@ -292,7 +292,7 @@ data:
            },
            "textMode": "value"
          },
-          "description": "Rolling 30-day availability from control-plane readiness and Traefik readiness. Kept to 30d so missing pre-metric history is not counted as downtime."
+          "description": "Rolling 365-day availability from vmalert's precomputed atlas:availability:ratio_365d series. vmalert evaluates the larger rollup in the background so Grafana does not recompute a year of raw samples on every dashboard load."
        },
        {
          "id": 4,
--- a/services/monitoring/kustomization.yaml
+++ b/services/monitoring/kustomization.yaml
@@ -17,6 +17,7 @@ resources:
  - grafana-dashboard-mail.yaml
  - grafana-dashboard-jobs.yaml
  - grafana-dashboard-testing.yaml
+  - vmalert-atlas-availability.yaml
  - dcgm-exporter.yaml
  - jetson-tegrastats-exporter.yaml
  - postmark-exporter-service.yaml
--- a/services/monitoring/vmalert-atlas-availability.yaml
+++ b/services/monitoring/vmalert-atlas-availability.yaml
@@ -0,0 +1,141 @@
+# services/monitoring/vmalert-atlas-availability.yaml
+apiVersion: v1
+kind: ConfigMap
+metadata:
+  name: vmalert-atlas-availability-rules
+  namespace: monitoring
+data:
+  atlas-availability.yaml: |
+    groups:
+      - name: atlas.availability
+        interval: 15m
+        rules:
+          - record: atlas:availability:ratio_1h
+            expr: |
+              avg_over_time((
+                min(
+                  (
+                    sum(kube_node_status_condition{condition="Ready",status="true",node=~"titan-0a|titan-0b|titan-0c"})
+                    / 3
+                  ),
+                  (
+                    sum(kube_deployment_status_replicas_available{namespace=~"traefik|kube-system",deployment="traefik"})
+                    / clamp_min(sum(kube_deployment_spec_replicas{namespace=~"traefik|kube-system",deployment="traefik"}), 1)
+                  )
+                )
+              )[1h:5m])
+            labels:
+              scope: atlas
+              rollup: hourly
+          - record: atlas:availability:ratio_365d
+            expr: |
+              avg_over_time((
+                min(
+                  (
+                    sum(kube_node_status_condition{condition="Ready",status="true",node=~"titan-0a|titan-0b|titan-0c"})
+                    / 3
+                  ),
+                  (
+                    sum(kube_deployment_status_replicas_available{namespace=~"traefik|kube-system",deployment="traefik"})
+                    / clamp_min(sum(kube_deployment_spec_replicas{namespace=~"traefik|kube-system",deployment="traefik"}), 1)
+                  )
+                )
+              )[365d:6h])
+            labels:
+              scope: atlas
+              rollup: yearly
+
+---
+
+apiVersion: v1
+kind: ServiceAccount
+metadata:
+  name: vmalert-atlas-availability
+  namespace: monitoring
+
+---
+
+apiVersion: apps/v1
+kind: Deployment
+metadata:
+  name: vmalert-atlas-availability
+  namespace: monitoring
+  labels:
+    app: vmalert-atlas-availability
+spec:
+  replicas: 1
+  revisionHistoryLimit: 3
+  selector:
+    matchLabels:
+      app: vmalert-atlas-availability
+  template:
+    metadata:
+      labels:
+        app: vmalert-atlas-availability
+    spec:
+      serviceAccountName: vmalert-atlas-availability
+      affinity:
+        nodeAffinity:
+          requiredDuringSchedulingIgnoredDuringExecution:
+            nodeSelectorTerms:
+              - matchExpressions:
+                  - key: kubernetes.io/hostname
+                    operator: NotIn
+                    values:
+                      - titan-22
+                      - titan-24
+      containers:
+        - name: vmalert
+          image: victoriametrics/vmalert:v1.113.0
+          args:
+            - -datasource.url=http://victoria-metrics-single-server:8428
+            - -remoteWrite.url=http://victoria-metrics-single-server:8428/api/v1/write
+            - -rule=/etc/vmalert/rules/*.yaml
+            - -evaluationInterval=15m
+            - -httpListenAddr=:8880
+          ports:
+            - name: http
+              containerPort: 8880
+          readinessProbe:
+            tcpSocket:
+              port: http
+            initialDelaySeconds: 5
+            periodSeconds: 10
+          livenessProbe:
+            tcpSocket:
+              port: http
+            initialDelaySeconds: 20
+            periodSeconds: 30
+          resources:
+            requests:
+              cpu: 25m
+              memory: 64Mi
+            limits:
+              cpu: 500m
+              memory: 256Mi
+          volumeMounts:
+            - name: rules
+              mountPath: /etc/vmalert/rules
+              readOnly: true
+      volumes:
+        - name: rules
+          configMap:
+            name: vmalert-atlas-availability-rules
+
+---
+
+apiVersion: v1
+kind: Service
+metadata:
+  name: vmalert-atlas-availability
+  namespace: monitoring
+  annotations:
+    prometheus.io/scrape: "true"
+    prometheus.io/port: "8880"
+spec:
+  selector:
+    app: vmalert-atlas-availability
+  ports:
+    - name: http
+      port: 8880
+      targetPort: http