monitoring: tame email noise and harden postmark alerts

This commit is contained in:
Brad Stein 2026-03-30 18:32:22 -03:00
parent e93aa6e33b
commit ca273c7337
2 changed files with 22 additions and 5 deletions

View File

@@ -22,7 +22,24 @@ data:
- orgId: 1
receiver: email-admins
group_by:
- grafana_folder
- alertname
group_wait: 1m
group_interval: 30m
repeat_interval: 12h
routes:
- receiver: email-admins
object_matchers:
- [severity, "=", "critical"]
group_wait: 30s
group_interval: 5m
repeat_interval: 2h
- receiver: email-admins
object_matchers:
- [severity, "=", "warning"]
group_wait: 5m
group_interval: 2h
repeat_interval: 24h
rules.yaml: |
apiVersion: 1
groups:
@@ -501,7 +518,7 @@ data:
model:
intervalMs: 60000
maxDataPoints: 43200
-expr: postmark_outbound_bounce_rate{window="1d"}
+expr: max(postmark_outbound_bounce_rate{window="1d"}) or on() vector(0)
legendFormat: bounce 1d
datasource:
type: prometheus
@@ -530,7 +547,7 @@ data:
reducer:
type: last
type: query
-noDataState: NoData
+noDataState: OK
execErrState: Error
annotations:
summary: "Postmark 1d bounce rate >5%"
@@ -549,7 +566,7 @@ data:
model:
intervalMs: 60000
maxDataPoints: 43200
-expr: min_over_time(max by (instance) (postmark_api_up)[5m])
+expr: max(postmark_api_up) or on() vector(0)
legendFormat: api up
datasource:
type: prometheus
@@ -578,7 +595,7 @@ data:
reducer:
type: last
type: query
-noDataState: NoData
+noDataState: OK
execErrState: Error
annotations:
summary: "Postmark exporter reports API down"

View File

@@ -286,7 +286,7 @@ spec:
podAnnotations:
vault.hashicorp.com/agent-inject: "true"
vault.hashicorp.com/role: "monitoring"
-monitoring.bstein.dev/restart-rev: "4"
+monitoring.bstein.dev/restart-rev: "5"
vault.hashicorp.com/agent-inject-secret-grafana-env.sh: "kv/data/atlas/monitoring/grafana-admin"
vault.hashicorp.com/agent-inject-template-grafana-env.sh: |
{{ with secret "kv/data/atlas/monitoring/grafana-admin" }}