monitoring: alert on VM outage
This commit is contained in:
parent
ef42dac97b
commit
a988af3262
@ -17,4 +17,5 @@ spec:
|
|||||||
values:
|
values:
|
||||||
syncSecret:
|
syncSecret:
|
||||||
enabled: true
|
enabled: true
|
||||||
enableSecretRotation: false
|
enableSecretRotation: true
|
||||||
|
rotationPollInterval: 2m
|
||||||
|
|||||||
@ -180,6 +180,59 @@ data:
|
|||||||
summary: "{{ $labels.instance }} CPU >90% for 10m"
|
summary: "{{ $labels.instance }} CPU >90% for 10m"
|
||||||
labels:
|
labels:
|
||||||
severity: warning
|
severity: warning
|
||||||
|
- orgId: 1
|
||||||
|
name: atlas-metrics
|
||||||
|
folder: Alerts
|
||||||
|
interval: 1m
|
||||||
|
rules:
|
||||||
|
- uid: victoria-metrics-down
|
||||||
|
title: "VictoriaMetrics unavailable (>30m)"
|
||||||
|
condition: C
|
||||||
|
for: "30m"
|
||||||
|
data:
|
||||||
|
- refId: A
|
||||||
|
relativeTimeRange:
|
||||||
|
from: 600
|
||||||
|
to: 0
|
||||||
|
datasourceUid: atlas-vm
|
||||||
|
model:
|
||||||
|
intervalMs: 60000
|
||||||
|
maxDataPoints: 43200
|
||||||
|
expr: sum(up{job="victoriametrics"})
|
||||||
|
legendFormat: victoriametrics
|
||||||
|
datasource:
|
||||||
|
type: prometheus
|
||||||
|
uid: atlas-vm
|
||||||
|
- refId: B
|
||||||
|
datasourceUid: __expr__
|
||||||
|
model:
|
||||||
|
expression: A
|
||||||
|
intervalMs: 60000
|
||||||
|
maxDataPoints: 43200
|
||||||
|
reducer: last
|
||||||
|
type: reduce
|
||||||
|
- refId: C
|
||||||
|
datasourceUid: __expr__
|
||||||
|
model:
|
||||||
|
expression: B
|
||||||
|
intervalMs: 60000
|
||||||
|
maxDataPoints: 43200
|
||||||
|
type: threshold
|
||||||
|
conditions:
|
||||||
|
- evaluator:
|
||||||
|
params: [1]
|
||||||
|
type: lt
|
||||||
|
operator:
|
||||||
|
type: and
|
||||||
|
reducer:
|
||||||
|
type: last
|
||||||
|
type: query
|
||||||
|
noDataState: Alerting
|
||||||
|
execErrState: Alerting
|
||||||
|
annotations:
|
||||||
|
summary: "VictoriaMetrics is unavailable for >30m"
|
||||||
|
labels:
|
||||||
|
severity: critical
|
||||||
- orgId: 1
|
- orgId: 1
|
||||||
name: maintenance
|
name: maintenance
|
||||||
folder: Alerts
|
folder: Alerts
|
||||||
|
|||||||
@ -342,7 +342,7 @@ spec:
|
|||||||
GF_SMTP_HOST: "mail.bstein.dev:587"
|
GF_SMTP_HOST: "mail.bstein.dev:587"
|
||||||
GF_SMTP_FROM: "no-reply-grafana@bstein.dev"
|
GF_SMTP_FROM: "no-reply-grafana@bstein.dev"
|
||||||
GF_SMTP_FROM_NAME: "Atlas Grafana"
|
GF_SMTP_FROM_NAME: "Atlas Grafana"
|
||||||
GRAFANA_ALERT_EMAILS: "alerts@bstein.dev"
|
GRAFANA_ALERT_EMAILS: "brad@bstein.dev"
|
||||||
GF_SECURITY_ALLOW_EMBEDDING: "true"
|
GF_SECURITY_ALLOW_EMBEDDING: "true"
|
||||||
GF_AUTH_GENERIC_OAUTH_ENABLED: "true"
|
GF_AUTH_GENERIC_OAUTH_ENABLED: "true"
|
||||||
GF_AUTH_GENERIC_OAUTH_NAME: "Keycloak"
|
GF_AUTH_GENERIC_OAUTH_NAME: "Keycloak"
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user