monitoring: fix noisy grafana email alerts and reload rules

This commit is contained in:
Brad Stein 2026-03-30 18:33:02 -03:00
parent 25f096924a
commit 0aeb08d375
2 changed files with 179 additions and 13 deletions

View File

@@ -22,7 +22,24 @@ data:
- orgId: 1
receiver: email-admins
group_by:
- grafana_folder
- alertname
group_wait: 1m
group_interval: 30m
repeat_interval: 12h
routes:
- receiver: email-admins
object_matchers:
- [severity, "=", "critical"]
group_wait: 30s
group_interval: 5m
repeat_interval: 2h
- receiver: email-admins
object_matchers:
- [severity, "=", "warning"]
group_wait: 5m
group_interval: 2h
repeat_interval: 24h
rules.yaml: |
apiVersion: 1
groups:
@@ -145,7 +162,7 @@ data:
model:
intervalMs: 60000
maxDataPoints: 43200
expr: avg_over_time((1 - avg by (instance) (rate(node_cpu_seconds_total{mode=\"idle\"}[5m]))) * 100)[10m:1m] * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\")
expr: avg_over_time((1 - avg by (instance) (rate(node_cpu_seconds_total{mode="idle"}[5m]))) * 100)[10m:1m] * on(instance) group_left(node) label_replace(node_uname_info{nodename!=""}, "node", "$1", "nodename", "(.*)")
legendFormat: '{{instance}}'
datasource:
type: prometheus
@@ -286,8 +303,8 @@ data:
summary: "node-image-sweeper not fully ready"
labels:
severity: warning
- uid: maint-cron-stale
title: "Maintenance CronJobs stale (>3h since success)"
- uid: maint-ariadne-image-sweeper-stale
title: "Ariadne image sweeper stale (schedule >8d)"
condition: C
for: "5m"
data:
@@ -297,10 +314,10 @@ data:
to: 0
datasourceUid: atlas-vm
model:
expr: time() - max by (cronjob) (kube_cronjob_status_last_successful_time{namespace="maintenance",cronjob="image-sweeper"}) and on(cronjob) (kube_cronjob_spec_suspend{namespace="maintenance",cronjob="image-sweeper"} == 0)
expr: time() - ariadne_schedule_last_success_timestamp_seconds{task="schedule.image_sweeper"}
intervalMs: 60000
maxDataPoints: 43200
legendFormat: '{{cronjob}}'
legendFormat: '{{task}}'
datasource:
type: prometheus
uid: atlas-vm
@@ -321,17 +338,166 @@ data:
type: threshold
conditions:
- evaluator:
params: [10800]
params: [691200]
type: gt
operator:
type: and
reducer:
type: last
type: query
noDataState: NoData
noDataState: OK
execErrState: Error
annotations:
summary: "Maintenance cronjob stale >3h since last success"
summary: "Ariadne image sweeper stale >8d since last success"
labels:
severity: warning
- uid: maint-cron-stale
title: "Maintenance CronJobs stale (legacy disabled)"
condition: C
for: "5m"
data:
- refId: A
relativeTimeRange:
from: 300
to: 0
datasourceUid: atlas-vm
model:
expr: vector(0)
intervalMs: 60000
maxDataPoints: 43200
legendFormat: legacy
datasource:
type: prometheus
uid: atlas-vm
- refId: B
datasourceUid: __expr__
model:
expression: A
intervalMs: 60000
maxDataPoints: 43200
reducer: last
type: reduce
- refId: C
datasourceUid: __expr__
model:
expression: B
intervalMs: 60000
maxDataPoints: 43200
type: threshold
conditions:
- evaluator:
params: [1]
type: gt
operator:
type: and
reducer:
type: last
type: query
noDataState: OK
execErrState: OK
annotations:
summary: "Legacy cronjob alert disabled"
labels:
severity: info
- orgId: 1
name: ariadne
folder: Alerts
interval: 1m
rules:
- uid: ariadne-schedule-error
title: "Ariadne schedule task failed"
condition: C
for: "10m"
data:
- refId: A
relativeTimeRange:
from: 300
to: 0
datasourceUid: atlas-vm
model:
expr: max by (task) (ariadne_schedule_last_status{task=~"schedule\\..+"})
intervalMs: 60000
maxDataPoints: 43200
legendFormat: '{{task}}'
datasource:
type: prometheus
uid: atlas-vm
- refId: B
datasourceUid: __expr__
model:
expression: A
intervalMs: 60000
maxDataPoints: 43200
reducer: last
type: reduce
- refId: C
datasourceUid: __expr__
model:
expression: B
intervalMs: 60000
maxDataPoints: 43200
type: threshold
conditions:
- evaluator:
params: [1]
type: lt
operator:
type: and
reducer:
type: last
type: query
noDataState: OK
execErrState: Error
annotations:
summary: "Ariadne schedule failed ({{ $labels.task }})"
labels:
severity: warning
- uid: ariadne-scheduler-stalled
title: "Ariadne scheduler behind (>15m)"
condition: C
for: "10m"
data:
- refId: A
relativeTimeRange:
from: 300
to: 0
datasourceUid: atlas-vm
model:
expr: time() - ariadne_schedule_next_run_timestamp_seconds{task=~"schedule\\..+"}
intervalMs: 60000
maxDataPoints: 43200
legendFormat: '{{task}}'
datasource:
type: prometheus
uid: atlas-vm
- refId: B
datasourceUid: __expr__
model:
expression: A
intervalMs: 60000
maxDataPoints: 43200
reducer: last
type: reduce
- refId: C
datasourceUid: __expr__
model:
expression: B
intervalMs: 60000
maxDataPoints: 43200
type: threshold
conditions:
- evaluator:
params: [900]
type: gt
operator:
type: and
reducer:
type: last
type: query
noDataState: OK
execErrState: Error
annotations:
summary: "Ariadne scheduler behind for {{ $labels.task }}"
labels:
severity: warning
- orgId: 1
@@ -352,7 +518,7 @@ data:
model:
intervalMs: 60000
maxDataPoints: 43200
expr: POSTMARK_OUTBOUND_BOUNCE_RATE{window="1d"}
expr: max(postmark_outbound_bounce_rate{window="1d"}) or on() vector(0)
legendFormat: bounce 1d
datasource:
type: prometheus
@@ -381,7 +547,7 @@ data:
reducer:
type: last
type: query
noDataState: NoData
noDataState: OK
execErrState: Error
annotations:
summary: "Postmark 1d bounce rate >5%"
@@ -400,7 +566,7 @@ data:
model:
intervalMs: 60000
maxDataPoints: 43200
expr: POSTMARK_API_UP
expr: max(postmark_api_up) or on() vector(0)
legendFormat: api up
datasource:
type: prometheus
@@ -429,7 +595,7 @@ data:
reducer:
type: last
type: query
noDataState: NoData
noDataState: OK
execErrState: Error
annotations:
summary: "Postmark exporter reports API down"

View File

@@ -286,7 +286,7 @@ spec:
podAnnotations:
vault.hashicorp.com/agent-inject: "true"
vault.hashicorp.com/role: "monitoring"
monitoring.bstein.dev/restart-rev: "1"
monitoring.bstein.dev/restart-rev: "5"
vault.hashicorp.com/agent-inject-secret-grafana-env.sh: "kv/data/atlas/monitoring/grafana-admin"
vault.hashicorp.com/agent-inject-template-grafana-env.sh: |
{{ with secret "kv/data/atlas/monitoring/grafana-admin" }}