# services/monitoring/grafana-alerting-config.yaml
apiVersion: v1
kind: ConfigMap
metadata:
  name: grafana-alerting
  namespace: monitoring
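  # NOTE: the grafana_alerting label below is assumed to be matched by a Grafana
  # provisioning sidecar (e.g. the k8s-sidecar used by the Grafana Helm chart),
  # which copies this ConfigMap's keys into /etc/grafana/provisioning/alerting.
  # Adjust the label if your sidecar watches a different selector.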
  labels:
    grafana_alerting: "1"
data:
  alerting.yaml: |
    apiVersion: 1
    contactPoints:
      - orgId: 1
        name: email-admins
        receivers:
          - uid: email-admins
            type: email
            settings:
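              # ${GRAFANA_ALERT_EMAILS} is expanded by Grafana's provisioning-file
              # environment-variable interpolation at startup; it is expected to hold
              # a comma-separated list of recipient addresses (assumption: the
              # variable is injected into the Grafana container, e.g. from a Secret).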
              addresses: ${GRAFANA_ALERT_EMAILS}
              singleEmail: true
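    # Default notification policy: route every alert to email-admins, grouped by
    # alertname so repeated evaluations of the same rule collapse into a single
    # notification.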
    policies:
      - orgId: 1
        receiver: email-admins
        group_by:
          - alertname
  rules.yaml: |
    apiVersion: 1
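    # Every rule below follows the same three-step query pipeline used by
    # provisioned Grafana alert rules: A runs the PromQL query against the
    # atlas-vm datasource, B reduces the series to its last value, and C applies
    # the threshold that drives the alert condition.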
    groups:
      - orgId: 1
        name: atlas-disk
        folder: Alerts
        interval: 1m
        rules:
          - uid: disk-pressure-root
            title: "Node rootfs high (>80%)"
            condition: C
            data:
              - refId: A
                relativeTimeRange:
                  from: 600
                  to: 0
                datasourceUid: atlas-vm
                model:
                  intervalMs: 60000
                  maxDataPoints: 43200
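                  # Rootfs usage percentage per instance, joined onto node_uname_info
                  # via label_replace so the series is keyed by Kubernetes node name
                  # instead of the scrape instance address.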
                  expr: avg by (node) ((avg by (instance) ((1 - (node_filesystem_avail_bytes{mountpoint="/",fstype!~"tmpfs|overlay"} / node_filesystem_size_bytes{mountpoint="/",fstype!~"tmpfs|overlay"})) * 100)) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=""}, "node", "$1", "nodename", "(.*)"))
                  legendFormat: '{{node}}'
                  datasource:
                    type: prometheus
                    uid: atlas-vm
              - refId: B
                datasourceUid: __expr__
                model:
                  expression: A
                  intervalMs: 60000
                  maxDataPoints: 43200
                  reducer: last
                  type: reduce
              - refId: C
                datasourceUid: __expr__
                model:
                  expression: B
                  intervalMs: 60000
                  maxDataPoints: 43200
                  type: threshold
                  conditions:
                    - evaluator:
                        params: [80]
                        type: gt
                      operator:
                        type: and
                      reducer:
                        type: last
                      type: query
            noDataState: NoData
            execErrState: Error
            annotations:
              summary: "{{ $labels.node }} rootfs >80% for 10m"
            labels:
              severity: warning
          - uid: disk-growth-1h
            title: "Node rootfs growing fast (>1Gi in 1h)"
            condition: C
            data:
              - refId: A
                relativeTimeRange:
                  from: 3600
                  to: 0
                datasourceUid: atlas-vm
                model:
                  intervalMs: 60000
                  maxDataPoints: 43200
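                  # Growth of used rootfs bytes over the last hour, converted to GiB.
                  # The range applied to a full expression ((...)[1h]) relies on
                  # MetricsQL/VictoriaMetrics semantics; plain Prometheus would need
                  # subquery syntax such as [1h:] here.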
                  expr: increase((node_filesystem_size_bytes{mountpoint="/",fstype!~"tmpfs|overlay"} - node_filesystem_free_bytes{mountpoint="/",fstype!~"tmpfs|overlay"})[1h]) / 1024 / 1024 / 1024
                  legendFormat: '{{instance}}'
                  datasource:
                    type: prometheus
                    uid: atlas-vm
              - refId: B
                datasourceUid: __expr__
                model:
                  expression: A
                  intervalMs: 60000
                  maxDataPoints: 43200
                  reducer: last
                  type: reduce
              - refId: C
                datasourceUid: __expr__
                model:
                  expression: B
                  intervalMs: 60000
                  maxDataPoints: 43200
                  type: threshold
                  conditions:
                    - evaluator:
                        params: [1]
                        type: gt
                      operator:
                        type: and
                      reducer:
                        type: last
                      type: query
            noDataState: NoData
            execErrState: Error
            annotations:
              summary: "{{ $labels.instance }} rootfs grew >1Gi in the last hour"
            labels:
              severity: warning
      - orgId: 1
        name: maintenance
        folder: Alerts
        interval: 1m
        rules:
          - uid: maint-sweeper
            title: "Maintenance sweeper not ready"
            condition: C
            data:
              - refId: A
                relativeTimeRange:
                  from: 300
                  to: 0
                datasourceUid: atlas-vm
                model:
                  intervalMs: 60000
                  maxDataPoints: 43200
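                  # Ready/desired ratio for the node-image-sweeper DaemonSet
                  # (kube-state-metrics); anything below 1 means at least one node is
                  # missing a ready sweeper pod.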
                  expr: kube_daemonset_status_number_ready{namespace="maintenance",daemonset="node-image-sweeper"} / on(namespace,daemonset) kube_daemonset_status_desired_number_scheduled{namespace="maintenance",daemonset="node-image-sweeper"}
                  legendFormat: '{{daemonset}}'
                  datasource:
                    type: prometheus
                    uid: atlas-vm
              - refId: B
                datasourceUid: __expr__
                model:
                  expression: A
                  intervalMs: 60000
                  maxDataPoints: 43200
                  reducer: last
                  type: reduce
              - refId: C
                datasourceUid: __expr__
                model:
                  expression: B
                  intervalMs: 60000
                  maxDataPoints: 43200
                  type: threshold
                  conditions:
                    - evaluator:
                        params: [1]
                        type: lt
                      operator:
                        type: and
                      reducer:
                        type: last
                      type: query
            noDataState: NoData
            execErrState: Error
            annotations:
              summary: "node-image-sweeper not fully ready"
            labels:
              severity: warning
          - uid: maint-cron-stale
            title: "Maintenance CronJobs stale (>3h since success)"
            condition: C
            data:
              - refId: A
                relativeTimeRange:
                  from: 0
                  to: 0
                datasourceUid: atlas-vm
                model:
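                  # Seconds since the most recent successful run of either maintenance
                  # CronJob (kube-state-metrics); the threshold of 10800 s corresponds
                  # to the 3 h in the rule title.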
                  expr: time() - max by (cronjob) (kube_cronjob_status_last_successful_time{namespace="maintenance",cronjob=~"image-sweeper|grafana-smtp-sync"})
                  intervalMs: 60000
                  maxDataPoints: 43200
                  legendFormat: '{{cronjob}}'
                  datasource:
                    type: prometheus
                    uid: atlas-vm
              - refId: B
                datasourceUid: __expr__
                model:
                  expression: A
                  intervalMs: 60000
                  maxDataPoints: 43200
                  reducer: last
                  type: reduce
              - refId: C
                datasourceUid: __expr__
                model:
                  expression: B
                  intervalMs: 60000
                  maxDataPoints: 43200
                  type: threshold
                  conditions:
                    - evaluator:
                        params: [10800]
                        type: gt
                      operator:
                        type: and
                      reducer:
                        type: last
                      type: query
            noDataState: NoData
            execErrState: Error
            annotations:
              summary: "Maintenance cronjob stale >3h since last success"
            labels:
              severity: warning
      - orgId: 1
        name: postmark
        folder: Alerts
        interval: 1m
        rules:
          - uid: postmark-bounce
            title: "Postmark bounce rate high"
            condition: C
            data:
              - refId: A
                relativeTimeRange:
                  from: 300
                  to: 0
                datasourceUid: atlas-vm
                model:
                  intervalMs: 60000
                  maxDataPoints: 43200
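                  # 1-day outbound bounce rate as reported by the Postmark exporter;
                  # the gauge is treated as a percentage, so the threshold of 5 below
                  # means 5% (matching the rule's summary annotation).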
                  expr: POSTMARK_OUTBOUND_BOUNCE_RATE{window="1d"}
                  legendFormat: bounce 1d
                  datasource:
                    type: prometheus
                    uid: atlas-vm
              - refId: B
                datasourceUid: __expr__
                model:
                  expression: A
                  intervalMs: 60000
                  maxDataPoints: 43200
                  reducer: last
                  type: reduce
              - refId: C
                datasourceUid: __expr__
                model:
                  expression: B
                  intervalMs: 60000
                  maxDataPoints: 43200
                  type: threshold
                  conditions:
                    - evaluator:
                        params: [5]
                        type: gt
                      operator:
                        type: and
                      reducer:
                        type: last
                      type: query
            noDataState: NoData
            execErrState: Error
            annotations:
              summary: "Postmark 1d bounce rate >5%"
            labels:
              severity: warning
          - uid: postmark-api-down
            title: "Postmark exporter down"
            condition: C
            data:
              - refId: A
                relativeTimeRange:
                  from: 300
                  to: 0
                datasourceUid: atlas-vm
                model:
                  intervalMs: 60000
                  maxDataPoints: 43200
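                  # POSTMARK_API_UP is expected to be a 0/1 gauge from the Postmark
                  # exporter; a value below 1 means the exporter could not reach the
                  # Postmark API on its last poll.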
                  expr: POSTMARK_API_UP
                  legendFormat: api up
                  datasource:
                    type: prometheus
                    uid: atlas-vm
              - refId: B
                datasourceUid: __expr__
                model:
                  expression: A
                  intervalMs: 60000
                  maxDataPoints: 43200
                  reducer: last
                  type: reduce
              - refId: C
                datasourceUid: __expr__
                model:
                  expression: B
                  intervalMs: 60000
                  maxDataPoints: 43200
                  type: threshold
                  conditions:
                    - evaluator:
                        params: [1]
                        type: lt
                      operator:
                        type: and
                      reducer:
                        type: last
                      type: query
            noDataState: NoData
            execErrState: Error
            annotations:
              summary: "Postmark exporter reports API down"
            labels:
              severity: critical