monitoring: tame email noise and harden postmark alerts

This commit is contained in:
Brad Stein 2026-03-30 18:32:22 -03:00
parent f5dcea860e
commit 5bcff5f405
2 changed files with 22 additions and 5 deletions

View File

@@ -22,7 +22,24 @@ data:
- orgId: 1 - orgId: 1
receiver: email-admins receiver: email-admins
group_by: group_by:
- grafana_folder
- alertname - alertname
group_wait: 1m
group_interval: 30m
repeat_interval: 12h
routes:
- receiver: email-admins
object_matchers:
- [severity, "=", "critical"]
group_wait: 30s
group_interval: 5m
repeat_interval: 2h
- receiver: email-admins
object_matchers:
- [severity, "=", "warning"]
group_wait: 5m
group_interval: 2h
repeat_interval: 24h
rules.yaml: | rules.yaml: |
apiVersion: 1 apiVersion: 1
groups: groups:
@@ -501,7 +518,7 @@ data:
model: model:
intervalMs: 60000 intervalMs: 60000
maxDataPoints: 43200 maxDataPoints: 43200
expr: postmark_outbound_bounce_rate{window="1d"} expr: max(postmark_outbound_bounce_rate{window="1d"}) or on() vector(0)
legendFormat: bounce 1d legendFormat: bounce 1d
datasource: datasource:
type: prometheus type: prometheus
@@ -530,7 +547,7 @@ data:
reducer: reducer:
type: last type: last
type: query type: query
noDataState: NoData noDataState: OK
execErrState: Error execErrState: Error
annotations: annotations:
summary: "Postmark 1d bounce rate >5%" summary: "Postmark 1d bounce rate >5%"
@@ -549,7 +566,7 @@ data:
model: model:
intervalMs: 60000 intervalMs: 60000
maxDataPoints: 43200 maxDataPoints: 43200
expr: min_over_time(max by (instance) (postmark_api_up)[5m]) expr: max(postmark_api_up) or on() vector(0)
legendFormat: api up legendFormat: api up
datasource: datasource:
type: prometheus type: prometheus
@@ -578,7 +595,7 @@ data:
reducer: reducer:
type: last type: last
type: query type: query
noDataState: NoData noDataState: OK
execErrState: Error execErrState: Error
annotations: annotations:
summary: "Postmark exporter reports API down" summary: "Postmark exporter reports API down"

View File

@@ -286,7 +286,7 @@ spec:
podAnnotations: podAnnotations:
vault.hashicorp.com/agent-inject: "true" vault.hashicorp.com/agent-inject: "true"
vault.hashicorp.com/role: "monitoring" vault.hashicorp.com/role: "monitoring"
monitoring.bstein.dev/restart-rev: "4" monitoring.bstein.dev/restart-rev: "5"
vault.hashicorp.com/agent-inject-secret-grafana-env.sh: "kv/data/atlas/monitoring/grafana-admin" vault.hashicorp.com/agent-inject-secret-grafana-env.sh: "kv/data/atlas/monitoring/grafana-admin"
vault.hashicorp.com/agent-inject-template-grafana-env.sh: | vault.hashicorp.com/agent-inject-template-grafana-env.sh: |
{{ with secret "kv/data/atlas/monitoring/grafana-admin" }} {{ with secret "kv/data/atlas/monitoring/grafana-admin" }}