---
# services/monitoring/grafana-alerting-config.yaml
#
# Grafana unified-alerting provisioning, delivered as a ConfigMap in the
# `monitoring` namespace (label `grafana_alerting: "1"` is the sidecar
# selector — NOTE(review): confirm against the Grafana sidecar config).
#
#   data/alerting.yaml — contact points + notification routing policies
#   data/rules.yaml    — alert rule groups, evaluated against the
#                        `atlas-vm` datasource (VictoriaMetrics, presumably —
#                        the `[1h]` range on an instant expression below is a
#                        MetricsQL extension; plain Prometheus needs `[1h:]`)
apiVersion: v1
kind: ConfigMap
metadata:
  name: grafana-alerting
  namespace: monitoring
  labels:
    # Must be a string, hence quoted.
    grafana_alerting: "1"
data:
  alerting.yaml: |
    apiVersion: 1
    contactPoints:
      - orgId: 1
        name: email-admins
        receivers:
          - uid: email-admins
            type: email
            settings:
              # Expanded by Grafana from the environment at provisioning time.
              addresses: ${GRAFANA_ALERT_EMAILS}
              singleEmail: true
    policies:
      - orgId: 1
        receiver: email-admins
        group_by:
          - grafana_folder
          - alertname
        # Defaults for anything not matched by a route below.
        group_wait: 1m
        group_interval: 30m
        repeat_interval: 12h
        routes:
          # Critical alerts: notify fast, re-notify every 4h.
          - receiver: email-admins
            object_matchers:
              - [severity, "=", "critical"]
            group_wait: 30s
            group_interval: 15m
            repeat_interval: 4h
          # Warnings: batched, re-notified at most every 48h.
          - receiver: email-admins
            object_matchers:
              - [severity, "=", "warning"]
            group_wait: 10m
            group_interval: 4h
            repeat_interval: 48h
  rules.yaml: |
    apiVersion: 1
    groups:
      # Every rule follows the same A -> B -> C pattern:
      #   A: datasource query, B: reduce(last) of A, C: threshold on B.
      - orgId: 1
        name: atlas-disk
        folder: Alerts
        interval: 1m
        rules:
          - uid: disk-pressure-root
            title: "Node rootfs high (>85%)"
            condition: C
            for: "10m"
            data:
              - refId: A
                relativeTimeRange:
                  from: 600
                  to: 0
                datasourceUid: atlas-vm
                model:
                  intervalMs: 60000
                  maxDataPoints: 43200
                  # Rootfs used % per instance, relabeled to the node name via
                  # node_uname_info so alerts show hostnames, not scrape targets.
                  expr: avg by (node) ((avg by (instance) ((1 - (node_filesystem_avail_bytes{mountpoint="/",fstype!~"tmpfs|overlay"} / node_filesystem_size_bytes{mountpoint="/",fstype!~"tmpfs|overlay"})) * 100)) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=""}, "node", "$1", "nodename", "(.*)"))
                  legendFormat: '{{node}}'
                  datasource:
                    type: prometheus
                    uid: atlas-vm
              - refId: B
                datasourceUid: __expr__
                model:
                  expression: A
                  intervalMs: 60000
                  maxDataPoints: 43200
                  reducer: last
                  type: reduce
              - refId: C
                datasourceUid: __expr__
                model:
                  expression: B
                  intervalMs: 60000
                  maxDataPoints: 43200
                  type: threshold
                  conditions:
                    - evaluator:
                        params: [85]
                        type: gt
                      operator:
                        type: and
                      reducer:
                        type: last
                      type: query
            noDataState: NoData
            execErrState: Error
            annotations:
              summary: "{{ $labels.node }} rootfs >85% for 10m"
            labels:
              severity: warning
          - uid: disk-growth-1h
            title: "Node rootfs growing fast (>3Gi in 1h)"
            condition: C
            for: "30m"
            data:
              - refId: A
                relativeTimeRange:
                  from: 3600
                  to: 0
                datasourceUid: atlas-vm
                model:
                  intervalMs: 60000
                  maxDataPoints: 43200
                  # GiB growth of used rootfs over the last hour.
                  # NOTE(review): `(expr)[1h]` on an instant expression is
                  # MetricsQL (VictoriaMetrics); Prometheus would need `[1h:]`.
                  expr: max by (instance, node) ((increase((node_filesystem_size_bytes{mountpoint="/",fstype!~"tmpfs|overlay"} - node_filesystem_free_bytes{mountpoint="/",fstype!~"tmpfs|overlay"})[1h]) / 1024 / 1024 / 1024) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=""}, "node", "$1", "nodename", "(.*)"))
                  legendFormat: '{{node}}'
                  datasource:
                    type: prometheus
                    uid: atlas-vm
              - refId: B
                datasourceUid: __expr__
                model:
                  expression: A
                  intervalMs: 60000
                  maxDataPoints: 43200
                  reducer: last
                  type: reduce
              - refId: C
                datasourceUid: __expr__
                model:
                  expression: B
                  intervalMs: 60000
                  maxDataPoints: 43200
                  type: threshold
                  conditions:
                    - evaluator:
                        params: [3]
                        type: gt
                      operator:
                        type: and
                      reducer:
                        type: last
                      type: query
            noDataState: NoData
            execErrState: Error
            annotations:
              summary: "{{ $labels.node }} rootfs grew >3Gi in the last hour"
            labels:
              severity: warning
      - orgId: 1
        name: atlas-cpu
        folder: Alerts
        interval: 1m
        rules:
          # NOTE(review): uid says 10m but title/`for` use 20m; uid kept
          # unchanged so alert state/history is preserved.
          - uid: cpu-high-10m
            title: "Node CPU high (>95% for 20m)"
            condition: C
            for: 20m
            data:
              - refId: A
                relativeTimeRange:
                  from: 600
                  to: 0
                datasourceUid: atlas-vm
                model:
                  intervalMs: 60000
                  maxDataPoints: 43200
                  # Busy CPU % = 100 * (1 - idle fraction), relabeled to node name.
                  expr: ((1 - avg by (instance) (rate(node_cpu_seconds_total{mode="idle"}[5m]))) * 100) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=""}, "node", "$1", "nodename", "(.*)")
                  legendFormat: '{{node}}'
                  datasource:
                    type: prometheus
                    uid: atlas-vm
              - refId: B
                datasourceUid: __expr__
                model:
                  expression: A
                  intervalMs: 60000
                  maxDataPoints: 43200
                  reducer: last
                  type: reduce
              - refId: C
                datasourceUid: __expr__
                model:
                  expression: B
                  intervalMs: 60000
                  maxDataPoints: 43200
                  type: threshold
                  conditions:
                    - evaluator:
                        params: [95]
                        type: gt
                      operator:
                        type: and
                      reducer:
                        type: last
                      type: query
            noDataState: NoData
            # Query errors are tolerated for this rule (no alert on exec error).
            execErrState: OK
            annotations:
              summary: "{{ $labels.node }} CPU >95% for 20m"
            labels:
              severity: warning
      - orgId: 1
        name: atlas-metrics
        folder: Alerts
        interval: 1m
        rules:
          - uid: victoria-metrics-down
            title: "VictoriaMetrics unavailable (>30m)"
            condition: C
            for: "30m"
            data:
              - refId: A
                relativeTimeRange:
                  from: 600
                  to: 0
                datasourceUid: atlas-vm
                model:
                  intervalMs: 60000
                  maxDataPoints: 43200
                  expr: sum(up{job="victoriametrics"})
                  legendFormat: victoriametrics
                  datasource:
                    type: prometheus
                    uid: atlas-vm
              - refId: B
                datasourceUid: __expr__
                model:
                  expression: A
                  intervalMs: 60000
                  maxDataPoints: 43200
                  reducer: last
                  type: reduce
              - refId: C
                datasourceUid: __expr__
                model:
                  expression: B
                  intervalMs: 60000
                  maxDataPoints: 43200
                  type: threshold
                  conditions:
                    - evaluator:
                        params: [1]
                        type: lt
                      operator:
                        type: and
                      reducer:
                        type: last
                      type: query
            # The TSDB itself is the subject: no data / exec error both mean
            # it is unreachable, so alert in either case.
            noDataState: Alerting
            execErrState: Alerting
            annotations:
              summary: "VictoriaMetrics is unavailable for >30m"
            labels:
              severity: critical
      - orgId: 1
        name: maintenance
        folder: Alerts
        interval: 1m
        rules:
          - uid: maint-sweeper
            title: "Maintenance sweeper not ready"
            condition: C
            for: "5m"
            data:
              - refId: A
                relativeTimeRange:
                  from: 300
                  to: 0
                datasourceUid: atlas-vm
                model:
                  intervalMs: 60000
                  maxDataPoints: 43200
                  # Ready/desired ratio for the DaemonSet; <1 means pods missing.
                  expr: kube_daemonset_status_number_ready{namespace="maintenance",daemonset="node-image-sweeper"} / on(namespace,daemonset) kube_daemonset_status_desired_number_scheduled{namespace="maintenance",daemonset="node-image-sweeper"}
                  legendFormat: '{{daemonset}}'
                  datasource:
                    type: prometheus
                    uid: atlas-vm
              - refId: B
                datasourceUid: __expr__
                model:
                  expression: A
                  intervalMs: 60000
                  maxDataPoints: 43200
                  reducer: last
                  type: reduce
              - refId: C
                datasourceUid: __expr__
                model:
                  expression: B
                  intervalMs: 60000
                  maxDataPoints: 43200
                  type: threshold
                  conditions:
                    - evaluator:
                        params: [1]
                        type: lt
                      operator:
                        type: and
                      reducer:
                        type: last
                      type: query
            noDataState: NoData
            execErrState: Error
            annotations:
              summary: "node-image-sweeper not fully ready"
            labels:
              severity: warning
          - uid: logging-node-log-rotation-not-ready
            title: "Node log rotation guardrails not ready"
            condition: C
            for: "10m"
            data:
              - refId: A
                relativeTimeRange:
                  from: 600
                  to: 0
                datasourceUid: atlas-vm
                model:
                  intervalMs: 60000
                  maxDataPoints: 43200
                  expr: kube_daemonset_status_number_ready{namespace="logging",daemonset="node-log-rotation"} / on(namespace,daemonset) kube_daemonset_status_desired_number_scheduled{namespace="logging",daemonset="node-log-rotation"}
                  legendFormat: '{{daemonset}}'
                  datasource:
                    type: prometheus
                    uid: atlas-vm
              - refId: B
                datasourceUid: __expr__
                model:
                  expression: A
                  intervalMs: 60000
                  maxDataPoints: 43200
                  reducer: last
                  type: reduce
              - refId: C
                datasourceUid: __expr__
                model:
                  expression: B
                  intervalMs: 60000
                  maxDataPoints: 43200
                  type: threshold
                  conditions:
                    - evaluator:
                        params: [1]
                        type: lt
                      operator:
                        type: and
                      reducer:
                        type: last
                      type: query
            noDataState: NoData
            execErrState: Error
            annotations:
              summary: "node-log-rotation is not fully ready"
            labels:
              severity: warning
          - uid: maint-ariadne-image-sweeper-stale
            title: "Ariadne image sweeper stale (schedule >24h)"
            condition: C
            for: "5m"
            data:
              - refId: A
                relativeTimeRange:
                  from: 300
                  to: 0
                datasourceUid: atlas-vm
                model:
                  # Seconds since the sweeper schedule last succeeded.
                  expr: time() - ariadne_schedule_last_success_timestamp_seconds{task="schedule.image_sweeper"}
                  intervalMs: 60000
                  maxDataPoints: 43200
                  legendFormat: '{{task}}'
                  datasource:
                    type: prometheus
                    uid: atlas-vm
              - refId: B
                datasourceUid: __expr__
                model:
                  expression: A
                  intervalMs: 60000
                  maxDataPoints: 43200
                  reducer: last
                  type: reduce
              - refId: C
                datasourceUid: __expr__
                model:
                  expression: B
                  intervalMs: 60000
                  maxDataPoints: 43200
                  type: threshold
                  conditions:
                    - evaluator:
                        # 86400s = 24h.
                        params: [86400]
                        type: gt
                      operator:
                        type: and
                      reducer:
                        type: last
                      type: query
            # Metric absent (e.g. before first run) is treated as healthy.
            noDataState: OK
            execErrState: Error
            annotations:
              summary: "Ariadne image sweeper stale >24h since last success"
            labels:
              severity: warning
          # Intentional no-op placeholder: `vector(0)` can never exceed 1,
          # so this rule never fires. Kept so the legacy uid remains provisioned.
          - uid: maint-cron-stale
            title: "Maintenance CronJobs stale (legacy disabled)"
            condition: C
            for: "5m"
            data:
              - refId: A
                relativeTimeRange:
                  from: 300
                  to: 0
                datasourceUid: atlas-vm
                model:
                  expr: vector(0)
                  intervalMs: 60000
                  maxDataPoints: 43200
                  legendFormat: legacy
                  datasource:
                    type: prometheus
                    uid: atlas-vm
              - refId: B
                datasourceUid: __expr__
                model:
                  expression: A
                  intervalMs: 60000
                  maxDataPoints: 43200
                  reducer: last
                  type: reduce
              - refId: C
                datasourceUid: __expr__
                model:
                  expression: B
                  intervalMs: 60000
                  maxDataPoints: 43200
                  type: threshold
                  conditions:
                    - evaluator:
                        params: [1]
                        type: gt
                      operator:
                        type: and
                      reducer:
                        type: last
                      type: query
            noDataState: OK
            execErrState: OK
            annotations:
              summary: "Legacy cronjob alert disabled"
            labels:
              severity: info
      - orgId: 1
        name: ariadne
        folder: Alerts
        interval: 1m
        rules:
          - uid: ariadne-schedule-error
            title: "Ariadne schedule task failed"
            condition: C
            for: "15m"
            data:
              - refId: A
                relativeTimeRange:
                  from: 300
                  to: 0
                datasourceUid: atlas-vm
                model:
                  # Age since last success, zeroed out while last_status == 1,
                  # so only failing tasks accumulate a non-zero age.
                  expr: max by (task) (((time() - ariadne_schedule_last_success_timestamp_seconds{task=~"schedule\\..+"}) * on(task) group_left() (1 - ariadne_schedule_last_status{task=~"schedule\\..+"})))
                  intervalMs: 60000
                  maxDataPoints: 43200
                  legendFormat: '{{task}}'
                  datasource:
                    type: prometheus
                    uid: atlas-vm
              - refId: B
                datasourceUid: __expr__
                model:
                  expression: A
                  intervalMs: 60000
                  maxDataPoints: 43200
                  reducer: last
                  type: reduce
              - refId: C
                datasourceUid: __expr__
                model:
                  expression: B
                  intervalMs: 60000
                  maxDataPoints: 43200
                  type: threshold
                  conditions:
                    - evaluator:
                        # 3600s = 1h of sustained failure.
                        params: [3600]
                        type: gt
                      operator:
                        type: and
                      reducer:
                        type: last
                      type: query
            noDataState: OK
            execErrState: Error
            annotations:
              summary: "Ariadne schedule has failed for >1h ({{ $labels.task }})"
            labels:
              severity: warning
          - uid: ariadne-scheduler-stalled
            title: "Ariadne scheduler behind (>15m)"
            condition: C
            for: "10m"
            data:
              - refId: A
                relativeTimeRange:
                  from: 300
                  to: 0
                datasourceUid: atlas-vm
                model:
                  # Positive when the next scheduled run is overdue.
                  expr: time() - ariadne_schedule_next_run_timestamp_seconds{task=~"schedule\\..+"}
                  intervalMs: 60000
                  maxDataPoints: 43200
                  legendFormat: '{{task}}'
                  datasource:
                    type: prometheus
                    uid: atlas-vm
              - refId: B
                datasourceUid: __expr__
                model:
                  expression: A
                  intervalMs: 60000
                  maxDataPoints: 43200
                  reducer: last
                  type: reduce
              - refId: C
                datasourceUid: __expr__
                model:
                  expression: B
                  intervalMs: 60000
                  maxDataPoints: 43200
                  type: threshold
                  conditions:
                    - evaluator:
                        # 900s = 15m behind schedule.
                        params: [900]
                        type: gt
                      operator:
                        type: and
                      reducer:
                        type: last
                      type: query
            noDataState: OK
            execErrState: Error
            annotations:
              summary: "Ariadne scheduler behind for {{ $labels.task }}"
            labels:
              severity: warning
      - orgId: 1
        name: postmark
        folder: Alerts
        interval: 1m
        rules:
          - uid: postmark-bounce
            title: "Postmark bounce rate high"
            condition: C
            for: "10m"
            data:
              - refId: A
                relativeTimeRange:
                  from: 300
                  to: 0
                datasourceUid: atlas-vm
                model:
                  intervalMs: 60000
                  maxDataPoints: 43200
                  # `or on() vector(0)` keeps the series present when the
                  # exporter emits nothing, so the rule stays in OK, not NoData.
                  expr: max(postmark_outbound_bounce_rate{window="1d"}) or on() vector(0)
                  legendFormat: bounce 1d
                  datasource:
                    type: prometheus
                    uid: atlas-vm
              - refId: B
                datasourceUid: __expr__
                model:
                  expression: A
                  intervalMs: 60000
                  maxDataPoints: 43200
                  reducer: last
                  type: reduce
              - refId: C
                datasourceUid: __expr__
                model:
                  expression: B
                  intervalMs: 60000
                  maxDataPoints: 43200
                  type: threshold
                  conditions:
                    - evaluator:
                        params: [5]
                        type: gt
                      operator:
                        type: and
                      reducer:
                        type: last
                      type: query
            noDataState: OK
            execErrState: Error
            annotations:
              summary: "Postmark 1d bounce rate >5%"
            labels:
              severity: warning
          - uid: postmark-api-down
            title: "Postmark exporter down"
            condition: C
            for: "20m"
            data:
              - refId: A
                relativeTimeRange:
                  from: 300
                  to: 0
                datasourceUid: atlas-vm
                model:
                  intervalMs: 60000
                  maxDataPoints: 43200
                  # 15m availability average; vector(0) fallback means a fully
                  # absent exporter evaluates as 0 (down) rather than NoData.
                  expr: avg_over_time(postmark_api_up[15m]) or on() vector(0)
                  legendFormat: api up
                  datasource:
                    type: prometheus
                    uid: atlas-vm
              - refId: B
                datasourceUid: __expr__
                model:
                  expression: A
                  intervalMs: 60000
                  maxDataPoints: 43200
                  reducer: last
                  type: reduce
              - refId: C
                datasourceUid: __expr__
                model:
                  expression: B
                  intervalMs: 60000
                  maxDataPoints: 43200
                  type: threshold
                  conditions:
                    - evaluator:
                        params: [1]
                        type: lt
                      operator:
                        type: and
                      reducer:
                        type: last
                      type: query
            noDataState: NoData
            execErrState: Error
            annotations:
              summary: "Postmark exporter reports sustained API outage"
            labels:
              severity: warning