From a988af3262325b58ee0e4cf14a24a526274fb030 Mon Sep 17 00:00:00 2001
From: Brad Stein
Date: Fri, 23 Jan 2026 11:50:55 -0300
Subject: [PATCH] monitoring: alert on VM outage

---
Review notes (free-form area after the `---` separator; not part of the
commit message):

  * "VM" in the subject presumably means VictoriaMetrics (the added rule is
    titled "VictoriaMetrics unavailable (>30m)"), not a virtual machine.
  * New Grafana alert group `atlas-metrics`, rule `victoria-metrics-down`:
    query A is sum(up{job="victoriametrics"}) on datasource `atlas-vm`,
    B reduces A to its last value, C is the condition (B lt 1), held for
    30m. Both noDataState and execErrState are set to Alerting.
  * Beyond what the subject describes, this patch also
      - turns on enableSecretRotation with rotationPollInterval 2m in the
        secrets-store-csi-driver values, and
      - changes GRAFANA_ALERT_EMAILS from alerts@bstein.dev to
        brad@bstein.dev.
  * NOTE(review): this copy of the patch had lost its line breaks; the
    leading whitespace inside the hunks below was reconstructed from the
    YAML nesting and must be checked against the target files before
    applying.

 .../vault-csi/secrets-store-csi-driver.yaml   |  3 +-
 .../monitoring/grafana-alerting-config.yaml   | 53 +++++++++++++++++++
 services/monitoring/helmrelease.yaml          |  2 +-
 3 files changed, 56 insertions(+), 2 deletions(-)

diff --git a/infrastructure/vault-csi/secrets-store-csi-driver.yaml b/infrastructure/vault-csi/secrets-store-csi-driver.yaml
index 0b249fc..0004c0d 100644
--- a/infrastructure/vault-csi/secrets-store-csi-driver.yaml
+++ b/infrastructure/vault-csi/secrets-store-csi-driver.yaml
@@ -17,4 +17,5 @@ spec:
   values:
     syncSecret:
       enabled: true
-    enableSecretRotation: false
+    enableSecretRotation: true
+    rotationPollInterval: 2m
diff --git a/services/monitoring/grafana-alerting-config.yaml b/services/monitoring/grafana-alerting-config.yaml
index daa1e29..8713d3d 100644
--- a/services/monitoring/grafana-alerting-config.yaml
+++ b/services/monitoring/grafana-alerting-config.yaml
@@ -180,6 +180,59 @@ data:
               summary: "{{ $labels.instance }} CPU >90% for 10m"
             labels:
               severity: warning
+      - orgId: 1
+        name: atlas-metrics
+        folder: Alerts
+        interval: 1m
+        rules:
+          - uid: victoria-metrics-down
+            title: "VictoriaMetrics unavailable (>30m)"
+            condition: C
+            for: "30m"
+            data:
+              - refId: A
+                relativeTimeRange:
+                  from: 600
+                  to: 0
+                datasourceUid: atlas-vm
+                model:
+                  intervalMs: 60000
+                  maxDataPoints: 43200
+                  expr: sum(up{job="victoriametrics"})
+                  legendFormat: victoriametrics
+                  datasource:
+                    type: prometheus
+                    uid: atlas-vm
+              - refId: B
+                datasourceUid: __expr__
+                model:
+                  expression: A
+                  intervalMs: 60000
+                  maxDataPoints: 43200
+                  reducer: last
+                  type: reduce
+              - refId: C
+                datasourceUid: __expr__
+                model:
+                  expression: B
+                  intervalMs: 60000
+                  maxDataPoints: 43200
+                  type: threshold
+                  conditions:
+                    - evaluator:
+                        params: [1]
+                        type: lt
+                      operator:
+                        type: and
+                      reducer:
+                        type: last
+                      type: query
+            noDataState: Alerting
+            execErrState: Alerting
+            annotations:
+              summary: "VictoriaMetrics is unavailable for >30m"
+            labels:
+              severity: critical
       - orgId: 1
         name: maintenance
         folder: Alerts
diff --git a/services/monitoring/helmrelease.yaml b/services/monitoring/helmrelease.yaml
index ac24f8a..8e225d4 100644
--- a/services/monitoring/helmrelease.yaml
+++ b/services/monitoring/helmrelease.yaml
@@ -342,7 +342,7 @@ spec:
       GF_SMTP_HOST: "mail.bstein.dev:587"
       GF_SMTP_FROM: "no-reply-grafana@bstein.dev"
       GF_SMTP_FROM_NAME: "Atlas Grafana"
-      GRAFANA_ALERT_EMAILS: "alerts@bstein.dev"
+      GRAFANA_ALERT_EMAILS: "brad@bstein.dev"
       GF_SECURITY_ALLOW_EMBEDDING: "true"
       GF_AUTH_GENERIC_OAUTH_ENABLED: "true"
       GF_AUTH_GENERIC_OAUTH_NAME: "Keycloak"