---
# services/monitoring/grafana-alerting-config.yaml
#
# Grafana alerting provisioning, mounted as a ConfigMap (label
# grafana_alerting: "1" marks it for the Grafana sidecar loader).
#   - alerting.yaml : contact points + notification policies
#   - rules.yaml    : alert rule groups (disk, CPU, metrics stack,
#                     maintenance jobs, Postmark email)
# All queries run against the VictoriaMetrics datasource (uid: atlas-vm).
apiVersion: v1
kind: ConfigMap
metadata:
  name: grafana-alerting
  namespace: monitoring
  labels:
    grafana_alerting: "1"
data:
  alerting.yaml: |
    apiVersion: 1
    contactPoints:
      - orgId: 1
        name: email-admins
        receivers:
          - uid: email-admins
            type: email
            settings:
              # Comma-separated list, substituted from the environment at load time.
              addresses: ${GRAFANA_ALERT_EMAILS}
              singleEmail: true
    policies:
      - orgId: 1
        receiver: email-admins
        group_by:
          - alertname
  rules.yaml: |
    apiVersion: 1
    groups:
      # --- Disk alerts -------------------------------------------------
      - orgId: 1
        name: atlas-disk
        folder: Alerts
        interval: 1m
        rules:
          - uid: disk-pressure-root
            title: "Node rootfs high (>80%)"
            condition: C
            for: "10m"
            data:
              - refId: A
                relativeTimeRange:
                  from: 600
                  to: 0
                datasourceUid: atlas-vm
                model:
                  intervalMs: 60000
                  maxDataPoints: 43200
                  # Rootfs used % per instance, relabeled to the node name
                  # via node_uname_info for readable alert labels.
                  expr: avg by (node) ((avg by (instance) ((1 - (node_filesystem_avail_bytes{mountpoint="/",fstype!~"tmpfs|overlay"} / node_filesystem_size_bytes{mountpoint="/",fstype!~"tmpfs|overlay"})) * 100)) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=""}, "node", "$1", "nodename", "(.*)"))
                  legendFormat: '{{node}}'
                  datasource:
                    type: prometheus
                    uid: atlas-vm
              - refId: B
                datasourceUid: __expr__
                model:
                  expression: A
                  intervalMs: 60000
                  maxDataPoints: 43200
                  reducer: last
                  type: reduce
              - refId: C
                datasourceUid: __expr__
                model:
                  expression: B
                  intervalMs: 60000
                  maxDataPoints: 43200
                  type: threshold
                  conditions:
                    - evaluator:
                        params: [80]
                        type: gt
                      operator:
                        type: and
                      reducer:
                        type: last
                      type: query
            noDataState: NoData
            execErrState: Error
            annotations:
              summary: "{{ $labels.node }} rootfs >80% for 10m"
            labels:
              severity: warning
          - uid: disk-growth-1h
            title: "Node rootfs growing fast (>1Gi in 1h)"
            condition: C
            for: "10m"
            data:
              - refId: A
                relativeTimeRange:
                  from: 3600
                  to: 0
                datasourceUid: atlas-vm
                model:
                  intervalMs: 60000
                  maxDataPoints: 43200
                  # GiB of rootfs growth over the last hour. NOTE: increase()
                  # over a derived expression needs a subquery ([1h:1m]); a
                  # plain range selector ([1h]) on an expression is invalid PromQL.
                  expr: increase((node_filesystem_size_bytes{mountpoint="/",fstype!~"tmpfs|overlay"} - node_filesystem_free_bytes{mountpoint="/",fstype!~"tmpfs|overlay"})[1h:1m]) / 1024 / 1024 / 1024
                  legendFormat: '{{instance}}'
                  datasource:
                    type: prometheus
                    uid: atlas-vm
              - refId: B
                datasourceUid: __expr__
                model:
                  expression: A
                  intervalMs: 60000
                  maxDataPoints: 43200
                  reducer: last
                  type: reduce
              - refId: C
                datasourceUid: __expr__
                model:
                  expression: B
                  intervalMs: 60000
                  maxDataPoints: 43200
                  type: threshold
                  conditions:
                    - evaluator:
                        params: [1]
                        type: gt
                      operator:
                        type: and
                      reducer:
                        type: last
                      type: query
            noDataState: NoData
            execErrState: Error
            annotations:
              summary: "{{ $labels.instance }} rootfs grew >1Gi in the last hour"
            labels:
              severity: warning
      # --- CPU alerts --------------------------------------------------
      - orgId: 1
        name: atlas-cpu
        folder: Alerts
        interval: 1m
        rules:
          - uid: cpu-high-10m
            title: "Node CPU high (>90% for 10m)"
            condition: C
            for: "10m"
            data:
              - refId: A
                relativeTimeRange:
                  from: 600
                  to: 0
                datasourceUid: atlas-vm
                model:
                  intervalMs: 60000
                  maxDataPoints: 43200
                  # 10m-averaged CPU busy % per instance; the subquery range
                  # belongs INSIDE avg_over_time(...), and the quotes must not
                  # be backslash-escaped in a plain YAML scalar.
                  expr: avg_over_time(((1 - avg by (instance) (rate(node_cpu_seconds_total{mode="idle"}[5m]))) * 100)[10m:1m]) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=""}, "node", "$1", "nodename", "(.*)")
                  legendFormat: '{{instance}}'
                  datasource:
                    type: prometheus
                    uid: atlas-vm
              - refId: B
                datasourceUid: __expr__
                model:
                  expression: A
                  intervalMs: 60000
                  maxDataPoints: 43200
                  reducer: last
                  type: reduce
              - refId: C
                datasourceUid: __expr__
                model:
                  expression: B
                  intervalMs: 60000
                  maxDataPoints: 43200
                  type: threshold
                  conditions:
                    - evaluator:
                        params: [90]
                        type: gt
                      operator:
                        type: and
                      reducer:
                        type: last
                      type: query
            noDataState: NoData
            # Transient scrape/query errors should not page for CPU load.
            execErrState: OK
            annotations:
              summary: "{{ $labels.node }} CPU >90% for 10m"
            labels:
              severity: warning
      # --- Metrics-stack self-monitoring -------------------------------
      - orgId: 1
        name: atlas-metrics
        folder: Alerts
        interval: 1m
        rules:
          - uid: victoria-metrics-down
            title: "VictoriaMetrics unavailable (>30m)"
            condition: C
            for: "30m"
            data:
              - refId: A
                relativeTimeRange:
                  from: 600
                  to: 0
                datasourceUid: atlas-vm
                model:
                  intervalMs: 60000
                  maxDataPoints: 43200
                  expr: sum(up{job="victoriametrics"})
                  legendFormat: victoriametrics
                  datasource:
                    type: prometheus
                    uid: atlas-vm
              - refId: B
                datasourceUid: __expr__
                model:
                  expression: A
                  intervalMs: 60000
                  maxDataPoints: 43200
                  reducer: last
                  type: reduce
              - refId: C
                datasourceUid: __expr__
                model:
                  expression: B
                  intervalMs: 60000
                  maxDataPoints: 43200
                  type: threshold
                  conditions:
                    - evaluator:
                        params: [1]
                        type: lt
                      operator:
                        type: and
                      reducer:
                        type: last
                      type: query
            # If the metrics store is down, "no data" IS the failure mode:
            # alert rather than stay silent.
            noDataState: Alerting
            execErrState: Alerting
            annotations:
              summary: "VictoriaMetrics is unavailable for >30m"
            labels:
              severity: critical
      # --- Maintenance workloads ---------------------------------------
      - orgId: 1
        name: maintenance
        folder: Alerts
        interval: 1m
        rules:
          - uid: maint-sweeper
            title: "Maintenance sweeper not ready"
            condition: C
            for: "5m"
            data:
              - refId: A
                relativeTimeRange:
                  from: 300
                  to: 0
                datasourceUid: atlas-vm
                model:
                  intervalMs: 60000
                  maxDataPoints: 43200
                  # Ready/desired ratio of the sweeper DaemonSet; <1 means
                  # at least one node is missing a ready pod.
                  expr: kube_daemonset_status_number_ready{namespace="maintenance",daemonset="node-image-sweeper"} / on(namespace,daemonset) kube_daemonset_status_desired_number_scheduled{namespace="maintenance",daemonset="node-image-sweeper"}
                  legendFormat: '{{daemonset}}'
                  datasource:
                    type: prometheus
                    uid: atlas-vm
              - refId: B
                datasourceUid: __expr__
                model:
                  expression: A
                  intervalMs: 60000
                  maxDataPoints: 43200
                  reducer: last
                  type: reduce
              - refId: C
                datasourceUid: __expr__
                model:
                  expression: B
                  intervalMs: 60000
                  maxDataPoints: 43200
                  type: threshold
                  conditions:
                    - evaluator:
                        params: [1]
                        type: lt
                      operator:
                        type: and
                      reducer:
                        type: last
                      type: query
            noDataState: NoData
            execErrState: Error
            annotations:
              summary: "node-image-sweeper not fully ready"
            labels:
              severity: warning
          - uid: maint-cron-stale
            title: "Maintenance CronJobs stale (>3h since success)"
            condition: C
            for: "5m"
            data:
              - refId: A
                relativeTimeRange:
                  from: 300
                  to: 0
                datasourceUid: atlas-vm
                model:
                  # Seconds since the last successful run; the `and` clause
                  # suppresses the alert while the cronjob is suspended.
                  expr: time() - max by (cronjob) (kube_cronjob_status_last_successful_time{namespace="maintenance",cronjob="image-sweeper"}) and on(cronjob) (kube_cronjob_spec_suspend{namespace="maintenance",cronjob="image-sweeper"} == 0)
                  intervalMs: 60000
                  maxDataPoints: 43200
                  legendFormat: '{{cronjob}}'
                  datasource:
                    type: prometheus
                    uid: atlas-vm
              - refId: B
                datasourceUid: __expr__
                model:
                  expression: A
                  intervalMs: 60000
                  maxDataPoints: 43200
                  reducer: last
                  type: reduce
              - refId: C
                datasourceUid: __expr__
                model:
                  expression: B
                  intervalMs: 60000
                  maxDataPoints: 43200
                  type: threshold
                  conditions:
                    - evaluator:
                        # 3h expressed in seconds.
                        params: [10800]
                        type: gt
                      operator:
                        type: and
                      reducer:
                        type: last
                      type: query
            noDataState: NoData
            execErrState: Error
            annotations:
              summary: "Maintenance cronjob stale >3h since last success"
            labels:
              severity: warning
      # --- Postmark email delivery -------------------------------------
      - orgId: 1
        name: postmark
        folder: Alerts
        interval: 1m
        rules:
          - uid: postmark-bounce
            title: "Postmark bounce rate high"
            condition: C
            for: "10m"
            data:
              - refId: A
                relativeTimeRange:
                  from: 300
                  to: 0
                datasourceUid: atlas-vm
                model:
                  intervalMs: 60000
                  maxDataPoints: 43200
                  expr: POSTMARK_OUTBOUND_BOUNCE_RATE{window="1d"}
                  legendFormat: bounce 1d
                  datasource:
                    type: prometheus
                    uid: atlas-vm
              - refId: B
                datasourceUid: __expr__
                model:
                  expression: A
                  intervalMs: 60000
                  maxDataPoints: 43200
                  reducer: last
                  type: reduce
              - refId: C
                datasourceUid: __expr__
                model:
                  expression: B
                  intervalMs: 60000
                  maxDataPoints: 43200
                  type: threshold
                  conditions:
                    - evaluator:
                        params: [5]
                        type: gt
                      operator:
                        type: and
                      reducer:
                        type: last
                      type: query
            noDataState: NoData
            execErrState: Error
            annotations:
              summary: "Postmark 1d bounce rate >5%"
            labels:
              severity: warning
          - uid: postmark-api-down
            title: "Postmark exporter down"
            condition: C
            for: "5m"
            data:
              - refId: A
                relativeTimeRange:
                  from: 300
                  to: 0
                datasourceUid: atlas-vm
                model:
                  intervalMs: 60000
                  maxDataPoints: 43200
                  expr: POSTMARK_API_UP
                  legendFormat: api up
                  datasource:
                    type: prometheus
                    uid: atlas-vm
              - refId: B
                datasourceUid: __expr__
                model:
                  expression: A
                  intervalMs: 60000
                  maxDataPoints: 43200
                  reducer: last
                  type: reduce
              - refId: C
                datasourceUid: __expr__
                model:
                  expression: B
                  intervalMs: 60000
                  maxDataPoints: 43200
                  type: threshold
                  conditions:
                    - evaluator:
                        params: [1]
                        type: lt
                      operator:
                        type: and
                      reducer:
                        type: last
                      type: query
            noDataState: NoData
            execErrState: Error
            annotations:
              summary: "Postmark exporter reports API down"
            labels:
              severity: critical