monitoring: alert on VictoriaMetrics (VM) outage

This commit is contained in:
Brad Stein 2026-01-23 11:50:55 -03:00
parent ef42dac97b
commit a988af3262
3 changed files with 56 additions and 2 deletions

View File

@ -17,4 +17,5 @@ spec:
values:
syncSecret:
enabled: true
-enableSecretRotation: false
+enableSecretRotation: true
+rotationPollInterval: 2m

View File

@ -180,6 +180,59 @@ data:
summary: "{{ $labels.instance }} CPU >90% for 10m"
labels:
severity: warning
# Rule group "atlas-metrics" for org 1, filed under the "Alerts" folder and
# evaluated on a 1m interval.
- orgId: 1
  folder: Alerts
  name: atlas-metrics
  interval: 1m
  rules:
    # Fires when sum(up{job="victoriametrics"}) has been below 1 for 30m.
    - uid: victoria-metrics-down
      title: 'VictoriaMetrics unavailable (>30m)'
      labels:
        severity: critical
      annotations:
        summary: 'VictoriaMetrics is unavailable for >30m'
      for: '30m'
      # Both "no data" and "query failed" are mapped to Alerting.
      # NOTE(review): presumably because query A runs against the very
      # datasource being monitored (atlas-vm), so an outage may surface as an
      # error or an empty result rather than as a 0 sample - confirm.
      noDataState: Alerting
      execErrState: Alerting
      # The alert condition is the output of step C below (A -> B -> C).
      condition: C
      data:
        # A: count of "up" targets for the victoriametrics job, queried over
        #    the last 600 seconds from datasource atlas-vm.
        - refId: A
          datasourceUid: atlas-vm
          relativeTimeRange:
            from: 600
            to: 0
          model:
            datasource:
              type: prometheus
              uid: atlas-vm
            expr: 'sum(up{job="victoriametrics"})'
            legendFormat: victoriametrics
            intervalMs: 60000
            maxDataPoints: 43200
        # B: reduce series A to its last value.
        - refId: B
          datasourceUid: __expr__
          model:
            type: reduce
            reducer: last
            expression: A
            intervalMs: 60000
            maxDataPoints: 43200
        # C: threshold on B - true when the reduced value is less than 1.
        - refId: C
          datasourceUid: __expr__
          model:
            type: threshold
            expression: B
            intervalMs: 60000
            maxDataPoints: 43200
            conditions:
              - type: query
                evaluator:
                  type: lt
                  params:
                    - 1
                operator:
                  type: and
                reducer:
                  type: last
- orgId: 1
name: maintenance
folder: Alerts

View File

@ -342,7 +342,7 @@ spec:
GF_SMTP_HOST: "mail.bstein.dev:587"
GF_SMTP_FROM: "no-reply-grafana@bstein.dev"
GF_SMTP_FROM_NAME: "Atlas Grafana"
-GRAFANA_ALERT_EMAILS: "alerts@bstein.dev"
+GRAFANA_ALERT_EMAILS: "brad@bstein.dev"
GF_SECURITY_ALLOW_EMBEDDING: "true"
GF_AUTH_GENERIC_OAUTH_ENABLED: "true"
GF_AUTH_GENERIC_OAUTH_NAME: "Keycloak"