monitoring: alert on VM outage

2026-01-23 11:50:55 -03:00 · 2026-01-23 11:50:55 -03:00 · a988af3262
commit a988af3262
parent ef42dac97b
3 changed files with 56 additions and 2 deletions
--- a/infrastructure/vault-csi/secrets-store-csi-driver.yaml
+++ b/infrastructure/vault-csi/secrets-store-csi-driver.yaml
@@ -17,4 +17,5 @@ spec:
  values:
    syncSecret:
      enabled: true
-    enableSecretRotation: false
+    enableSecretRotation: true
+    rotationPollInterval: 2m
--- a/services/monitoring/grafana-alerting-config.yaml
+++ b/services/monitoring/grafana-alerting-config.yaml
@@ -180,6 +180,59 @@ data:
              summary: "{{ $labels.instance }} CPU >90% for 10m"
            labels:
              severity: warning
+      - orgId: 1
+        name: atlas-metrics
+        folder: Alerts
+        interval: 1m
+        rules:
+          - uid: victoria-metrics-down
+            title: "VictoriaMetrics unavailable (>30m)"
+            condition: C
+            for: "30m"
+            data:
+              - refId: A
+                relativeTimeRange:
+                  from: 600
+                  to: 0
+                datasourceUid: atlas-vm
+                model:
+                  intervalMs: 60000
+                  maxDataPoints: 43200
+                  expr: sum(up{job="victoriametrics"})
+                  legendFormat: victoriametrics
+                  datasource:
+                    type: prometheus
+                    uid: atlas-vm
+              - refId: B
+                datasourceUid: __expr__
+                model:
+                  expression: A
+                  intervalMs: 60000
+                  maxDataPoints: 43200
+                  reducer: last
+                  type: reduce
+              - refId: C
+                datasourceUid: __expr__
+                model:
+                  expression: B
+                  intervalMs: 60000
+                  maxDataPoints: 43200
+                  type: threshold
+                  conditions:
+                    - evaluator:
+                        params: [1]
+                        type: lt
+                      operator:
+                        type: and
+                      reducer:
+                        type: last
+                      type: query
+            noDataState: Alerting
+            execErrState: Alerting
+            annotations:
+              summary: "VictoriaMetrics is unavailable for >30m"
+            labels:
+              severity: critical
      - orgId: 1
        name: maintenance
        folder: Alerts
--- a/services/monitoring/helmrelease.yaml
+++ b/services/monitoring/helmrelease.yaml
@@ -342,7 +342,7 @@ spec:
      GF_SMTP_HOST: "mail.bstein.dev:587"
      GF_SMTP_FROM: "no-reply-grafana@bstein.dev"
      GF_SMTP_FROM_NAME: "Atlas Grafana"
-      GRAFANA_ALERT_EMAILS: "alerts@bstein.dev"
+      GRAFANA_ALERT_EMAILS: "brad@bstein.dev"
      GF_SECURITY_ALLOW_EMBEDDING: "true"
      GF_AUTH_GENERIC_OAUTH_ENABLED: "true"
      GF_AUTH_GENERIC_OAUTH_NAME: "Keycloak"