# services/monitoring/vmalert-atlas-availability.yaml
#
# A vmalert instance that evaluates recording rules only: it reads from and
# writes back to victoria-metrics-single-server:8428. The manifest contains,
# in order: the rules ConfigMap, a ServiceAccount, the Deployment and a Service.
---
apiVersion: v1
kind: ConfigMap
metadata:
  name: vmalert-atlas-availability-rules
  namespace: monitoring
data:
  # Both keys below are mounted under /etc/vmalert/rules and are picked up by
  # the `-rule=/etc/vmalert/rules/*.yaml` glob in the Deployment args.
  #
  # atlas-availability.yaml
  #   The "availability" sample used by both rules is the minimum of:
  #     * the Ready count of nodes titan-0a/0b/0c divided by 3, and
  #     * available / desired replicas of the `traefik` deployment (looked up
  #       in namespaces `traefik` or `kube-system`; the divisor is clamped to
  #       at least 1 so that zero desired replicas cannot divide by zero).
  #   `min(a, b)` is called with two arguments, which relies on MetricsQL's
  #   multi-argument aggregate functions; it is not stock PromQL.
  #
  #   atlas:availability:ratio_1h   - average of that sample over [1h:5m].
  #   atlas:availability:ratio_365d - (sum of hourly samples + padding) /
  #     (count of hourly samples + padding), capped at 1. "padding" is
  #     8761 minus the number of hours elapsed since the earliest sample in the
  #     window (never negative), so hours before the first observation count as
  #     fully available. With no samples at all the first-timestamp fallback
  #     `time() + 3600` yields padding = 8761 and therefore a ratio of 1.
  #     NOTE(review): 8761 is presumably 365 * 24 + 1, the number of points in
  #     a [365d:1h] subquery window - confirm before changing the window.
  #
  #   The group evaluates every 15m, offset by 14m within each interval.
  atlas-availability.yaml: |
    groups:
      - name: atlas.availability
        interval: 15m
        eval_offset: 14m
        rules:
          - record: atlas:availability:ratio_1h
            expr: |
              avg_over_time((
                min(
                  (
                    sum(kube_node_status_condition{condition="Ready",status="true",node=~"titan-0a|titan-0b|titan-0c"}) / 3
                  ),
                  (
                    sum(kube_deployment_status_replicas_available{namespace=~"traefik|kube-system",deployment="traefik"})
                    /
                    clamp_min(sum(kube_deployment_spec_replicas{namespace=~"traefik|kube-system",deployment="traefik"}), 1)
                  )
                )
              )[1h:5m])
            labels:
              scope: atlas
              rollup: hourly
          - record: atlas:availability:ratio_365d
            expr: |
              clamp_max((
                (
                  sum(sum_over_time((
                    min(
                      (
                        sum(kube_node_status_condition{condition="Ready",status="true",node=~"titan-0a|titan-0b|titan-0c"}) / 3
                      ),
                      (
                        sum(kube_deployment_status_replicas_available{namespace=~"traefik|kube-system",deployment="traefik"})
                        /
                        clamp_min(sum(kube_deployment_spec_replicas{namespace=~"traefik|kube-system",deployment="traefik"}), 1)
                      )
                    )
                  )[365d:1h]))
                  or on() vector(0)
                )
                +
                clamp_min(
                  8761 - (
                    clamp_min(
                      floor(
                        (
                          time() - (
                            min(min_over_time(timestamp(
                              min(
                                (
                                  sum(kube_node_status_condition{condition="Ready",status="true",node=~"titan-0a|titan-0b|titan-0c"}) / 3
                                ),
                                (
                                  sum(kube_deployment_status_replicas_available{namespace=~"traefik|kube-system",deployment="traefik"})
                                  /
                                  clamp_min(sum(kube_deployment_spec_replicas{namespace=~"traefik|kube-system",deployment="traefik"}), 1)
                                )
                              )
                            )[365d:1h]))
                            or on() vector(time() + 3600)
                          )
                        ) / 3600
                      ) + 1,
                      0
                    )
                  ),
                  0
                )
              )
              /
              clamp_min(
                (
                  (
                    sum(count_over_time((
                      min(
                        (
                          sum(kube_node_status_condition{condition="Ready",status="true",node=~"titan-0a|titan-0b|titan-0c"}) / 3
                        ),
                        (
                          sum(kube_deployment_status_replicas_available{namespace=~"traefik|kube-system",deployment="traefik"})
                          /
                          clamp_min(sum(kube_deployment_spec_replicas{namespace=~"traefik|kube-system",deployment="traefik"}), 1)
                        )
                      )
                    )[365d:1h]))
                    or on() vector(0)
                  )
                  +
                  clamp_min(
                    8761 - (
                      clamp_min(
                        floor(
                          (
                            time() - (
                              min(min_over_time(timestamp(
                                min(
                                  (
                                    sum(kube_node_status_condition{condition="Ready",status="true",node=~"titan-0a|titan-0b|titan-0c"}) / 3
                                  ),
                                  (
                                    sum(kube_deployment_status_replicas_available{namespace=~"traefik|kube-system",deployment="traefik"})
                                    /
                                    clamp_min(sum(kube_deployment_spec_replicas{namespace=~"traefik|kube-system",deployment="traefik"}), 1)
                                  )
                                )
                              )[365d:1h]))
                              or on() vector(time() + 3600)
                            )
                          ) / 3600
                        ) + 1,
                        0
                      )
                    ),
                    0
                  )
                ),
                1
              ), 1)
            labels:
              scope: atlas
              rollup: yearly
  # platform-quality.yaml
  #   Hourly roll-ups of platform_quality_gate_test_case_result for
  #   exported_job="platform-quality-ci". Series with an empty branch or test
  #   label, or with test="__no_test_cases__", are excluded.
  #
  #   platform_quality:test_case_status:count_1h
  #     sum of the 1h max of each series, per (suite, branch, test, status).
  #   platform_quality:test_case_pass_rate:percent_1h
  #     100 * passed / max(passed + failed + error + skipped, 1),
  #     per (suite, branch, test).
  #     NOTE(review): if the exporter emits no status="passed" series for a
  #     test that only failed, the numerator has no matching series and this
  #     rule records nothing (rather than 0) for that test - confirm against
  #     the exporter before relying on 0% values.
  platform-quality.yaml: |
    groups:
      - name: platform.quality
        interval: 15m
        rules:
          - record: platform_quality:test_case_status:count_1h
            expr: |
              sum by (suite, branch, test, status) (
                max_over_time(platform_quality_gate_test_case_result{exported_job="platform-quality-ci",branch!="",test!="",test!="__no_test_cases__"}[1h])
              )
            labels:
              rollup: hourly
          - record: platform_quality:test_case_pass_rate:percent_1h
            expr: |
              100 * (
                sum by (suite, branch, test) (
                  max_over_time(platform_quality_gate_test_case_result{exported_job="platform-quality-ci",branch!="",test!="",test!="__no_test_cases__",status="passed"}[1h])
                )
              ) / clamp_min(
                sum by (suite, branch, test) (
                  max_over_time(platform_quality_gate_test_case_result{exported_job="platform-quality-ci",branch!="",test!="",test!="__no_test_cases__",status=~"passed|failed|error|skipped"}[1h])
                ),
                1
              )
            labels:
              rollup: hourly
---
apiVersion: v1
kind: ServiceAccount
metadata:
  name: vmalert-atlas-availability
  namespace: monitoring
---
apiVersion: apps/v1
kind: Deployment
metadata:
  name: vmalert-atlas-availability
  namespace: monitoring
  labels:
    app: vmalert-atlas-availability
spec:
  replicas: 1
  revisionHistoryLimit: 3
  selector:
    matchLabels:
      app: vmalert-atlas-availability
  template:
    metadata:
      labels:
        app: vmalert-atlas-availability
      annotations:
        # Pod-template annotation; changing its value alters the template and
        # so rolls the pod.
        # NOTE(review): presumably bumped by hand when the rules ConfigMap
        # changes, since nothing here hashes the ConfigMap - confirm.
        bstein.dev/rules-revision: "2026-05-15-availability-query-step"
    spec:
      serviceAccountName: vmalert-atlas-availability
      # None of the container args below reference the Kubernetes API, so the
      # ServiceAccount token is not mounted into the pod.
      automountServiceAccountToken: false
      affinity:
        nodeAffinity:
          # Hard scheduling constraint: never place the pod on titan-22/24.
          requiredDuringSchedulingIgnoredDuringExecution:
            nodeSelectorTerms:
              - matchExpressions:
                  - key: kubernetes.io/hostname
                    operator: NotIn
                    values:
                      - titan-22
                      - titan-24
      containers:
        - name: vmalert
          image: victoriametrics/vmalert:v1.113.0
          args:
            # Queries are read from, and recorded series written back to, the
            # same VictoriaMetrics single-node server.
            - -datasource.url=http://victoria-metrics-single-server:8428
            - -datasource.queryStep=1h
            - -remoteWrite.url=http://victoria-metrics-single-server:8428
            - -rule=/etc/vmalert/rules/*.yaml
            # Default interval; both rule groups also set `interval: 15m`.
            - -evaluationInterval=15m
            - -httpListenAddr=:8880
          ports:
            - name: http
              containerPort: 8880
          # Readiness checks the HTTP health endpoint rather than only that
          # the port is open, so the Service routes to the pod only once the
          # HTTP server actually answers.
          readinessProbe:
            httpGet:
              path: /health
              port: http
            initialDelaySeconds: 5
            periodSeconds: 10
          # Liveness stays a plain TCP check so that a slow HTTP response does
          # not trigger a container restart.
          livenessProbe:
            tcpSocket:
              port: http
            initialDelaySeconds: 20
            periodSeconds: 30
          resources:
            requests:
              cpu: 25m
              memory: 64Mi
            limits:
              cpu: 500m
              memory: 256Mi
          # The process listens on an unprivileged port (8880) and its only
          # mount is a read-only ConfigMap, so no capability or privilege
          # escalation is granted.
          securityContext:
            allowPrivilegeEscalation: false
            capabilities:
              drop:
                - ALL
          volumeMounts:
            - name: rules
              mountPath: /etc/vmalert/rules
              readOnly: true
      volumes:
        - name: rules
          configMap:
            name: vmalert-atlas-availability-rules
---
apiVersion: v1
kind: Service
metadata:
  name: vmalert-atlas-availability
  namespace: monitoring
  annotations:
    # Annotation values must be strings, hence the quotes.
    prometheus.io/scrape: "true"
    prometheus.io/port: "8880"
spec:
  selector:
    app: vmalert-atlas-availability
  ports:
    - name: http
      port: 8880
      targetPort: http