# services/monitoring/grafana-alerting-config.yaml
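# Provisions Grafana unified alerting (file-provisioning apiVersion 1):
# alerting.yaml carries contact points and notification policies,
# rules.yaml carries the alert rule groups evaluated against the atlas-vm datasource.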
apiVersion: v1
kind: ConfigMap
metadata:
name: grafana-alerting
namespace: monitoring
labels:
grafana_alerting: "1"
data:
alerting.yaml: |
apiVersion: 1
contactPoints:
- orgId: 1
name: email-admins
receivers:
- uid: email-admins
type: email
settings:
addresses: ${GRAFANA_ALERT_EMAILS}
singleEmail: true
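    # Default route batches by folder + alertname and re-notifies every 12h;
    # the child routes escalate criticals faster (30s wait, 4h repeat) and
    # slow warnings down (10m wait, 48h repeat).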
policies:
- orgId: 1
receiver: email-admins
group_by:
- grafana_folder
- alertname
group_wait: 1m
group_interval: 30m
repeat_interval: 12h
routes:
- receiver: email-admins
object_matchers:
- [severity, "=", "critical"]
group_wait: 30s
group_interval: 15m
repeat_interval: 4h
- receiver: email-admins
object_matchers:
- [severity, "=", "warning"]
group_wait: 10m
group_interval: 4h
repeat_interval: 48h
rules.yaml: |
apiVersion: 1
groups:
- orgId: 1
name: atlas-disk
folder: Alerts
interval: 1m
rules:
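          # Every rule in this file uses the same three-stage pipeline:
          # A runs the datasource query, B reduces the series to its last
          # value, and C applies the threshold named by `condition: C`.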
- uid: disk-pressure-root
title: "Node rootfs high (>85%)"
condition: C
for: "10m"
data:
- refId: A
relativeTimeRange:
from: 600
to: 0
datasourceUid: atlas-vm
model:
intervalMs: 60000
maxDataPoints: 43200
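                  # Rootfs used %, joined to a readable node name: label_replace
                  # copies nodename from node_uname_info into a `node` label.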
expr: avg by (node) ((avg by (instance) ((1 - (node_filesystem_avail_bytes{mountpoint="/",fstype!~"tmpfs|overlay"} / node_filesystem_size_bytes{mountpoint="/",fstype!~"tmpfs|overlay"})) * 100)) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=""}, "node", "$1", "nodename", "(.*)"))
legendFormat: '{{node}}'
datasource:
type: prometheus
uid: atlas-vm
- refId: B
datasourceUid: __expr__
model:
expression: A
intervalMs: 60000
maxDataPoints: 43200
reducer: last
type: reduce
- refId: C
datasourceUid: __expr__
model:
expression: B
intervalMs: 60000
maxDataPoints: 43200
type: threshold
conditions:
- evaluator:
params: [85]
type: gt
operator:
type: and
reducer:
type: last
type: query
noDataState: NoData
execErrState: Error
annotations:
summary: "{{ $labels.node }} rootfs >85% for 10m"
labels:
severity: warning
- uid: disk-growth-1h
title: "Node rootfs growing fast (>3Gi in 1h)"
condition: C
for: "30m"
data:
- refId: A
relativeTimeRange:
from: 3600
to: 0
datasourceUid: atlas-vm
model:
intervalMs: 60000
maxDataPoints: 43200
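                  # Growth of rootfs used bytes over 1h, converted to GiB. Note:
                  # increase() over a parenthesised expression is MetricsQL
                  # (VictoriaMetrics); plain PromQL would need a [1h:] subquery.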
expr: max by (instance, node) ((increase((node_filesystem_size_bytes{mountpoint="/",fstype!~"tmpfs|overlay"} - node_filesystem_free_bytes{mountpoint="/",fstype!~"tmpfs|overlay"})[1h]) / 1024 / 1024 / 1024) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=""}, "node", "$1", "nodename", "(.*)"))
legendFormat: '{{node}}'
datasource:
type: prometheus
uid: atlas-vm
- refId: B
datasourceUid: __expr__
model:
expression: A
intervalMs: 60000
maxDataPoints: 43200
reducer: last
type: reduce
- refId: C
datasourceUid: __expr__
model:
expression: B
intervalMs: 60000
maxDataPoints: 43200
type: threshold
conditions:
- evaluator:
params: [3]
type: gt
operator:
type: and
reducer:
type: last
type: query
noDataState: NoData
execErrState: Error
annotations:
summary: "{{ $labels.node }} rootfs grew >3Gi in the last hour"
labels:
severity: warning
- orgId: 1
name: atlas-cpu
folder: Alerts
interval: 1m
rules:
          - uid: cpu-high-20m
            title: "Node CPU high (>95% for 20m)"
            condition: C
            for: "20m"
data:
- refId: A
relativeTimeRange:
from: 600
to: 0
datasourceUid: atlas-vm
model:
intervalMs: 60000
maxDataPoints: 43200
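                  # CPU busy %: 100% minus the per-instance idle rate, joined
                  # to the node name the same way as the disk rules.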
expr: ((1 - avg by (instance) (rate(node_cpu_seconds_total{mode="idle"}[5m]))) * 100) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=""}, "node", "$1", "nodename", "(.*)")
legendFormat: '{{node}}'
datasource:
type: prometheus
uid: atlas-vm
- refId: B
datasourceUid: __expr__
model:
expression: A
intervalMs: 60000
maxDataPoints: 43200
reducer: last
type: reduce
- refId: C
datasourceUid: __expr__
model:
expression: B
intervalMs: 60000
maxDataPoints: 43200
type: threshold
conditions:
- evaluator:
params: [95]
type: gt
operator:
type: and
reducer:
type: last
type: query
noDataState: NoData
execErrState: OK
annotations:
summary: "{{ $labels.node }} CPU >95% for 20m"
labels:
severity: warning
- orgId: 1
name: atlas-metrics
folder: Alerts
interval: 1m
rules:
- uid: victoria-metrics-down
title: "VictoriaMetrics unavailable (>30m)"
condition: C
for: "30m"
data:
- refId: A
relativeTimeRange:
from: 600
to: 0
datasourceUid: atlas-vm
model:
intervalMs: 60000
maxDataPoints: 43200
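                  # Self-scrape of VictoriaMetrics. If VM itself is down this
                  # query fails or returns nothing, hence NoData and Error both
                  # map to Alerting below.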
expr: sum(up{job="victoriametrics"})
legendFormat: victoriametrics
datasource:
type: prometheus
uid: atlas-vm
- refId: B
datasourceUid: __expr__
model:
expression: A
intervalMs: 60000
maxDataPoints: 43200
reducer: last
type: reduce
- refId: C
datasourceUid: __expr__
model:
expression: B
intervalMs: 60000
maxDataPoints: 43200
type: threshold
conditions:
- evaluator:
params: [1]
type: lt
operator:
type: and
reducer:
type: last
type: query
noDataState: Alerting
execErrState: Alerting
annotations:
summary: "VictoriaMetrics is unavailable for >30m"
labels:
severity: critical
- orgId: 1
name: maintenance
folder: Alerts
interval: 1m
rules:
- uid: maint-sweeper
title: "Maintenance sweeper not ready"
condition: C
for: "5m"
data:
- refId: A
relativeTimeRange:
from: 300
to: 0
datasourceUid: atlas-vm
model:
intervalMs: 60000
maxDataPoints: 43200
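                  # Ready/desired ratio for the sweeper DaemonSet; the
                  # threshold fires when it drops below 1.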
expr: kube_daemonset_status_number_ready{namespace="maintenance",daemonset="node-image-sweeper"} / on(namespace,daemonset) kube_daemonset_status_desired_number_scheduled{namespace="maintenance",daemonset="node-image-sweeper"}
legendFormat: '{{daemonset}}'
datasource:
type: prometheus
uid: atlas-vm
- refId: B
datasourceUid: __expr__
model:
expression: A
intervalMs: 60000
maxDataPoints: 43200
reducer: last
type: reduce
- refId: C
datasourceUid: __expr__
model:
expression: B
intervalMs: 60000
maxDataPoints: 43200
type: threshold
conditions:
- evaluator:
params: [1]
type: lt
operator:
type: and
reducer:
type: last
type: query
noDataState: NoData
execErrState: Error
annotations:
summary: "node-image-sweeper not fully ready"
labels:
severity: warning
- uid: logging-node-log-rotation-not-ready
title: "Node log rotation guardrails not ready"
condition: C
for: "10m"
data:
- refId: A
relativeTimeRange:
from: 600
to: 0
datasourceUid: atlas-vm
model:
intervalMs: 60000
maxDataPoints: 43200
expr: kube_daemonset_status_number_ready{namespace="logging",daemonset="node-log-rotation"} / on(namespace,daemonset) kube_daemonset_status_desired_number_scheduled{namespace="logging",daemonset="node-log-rotation"}
legendFormat: '{{daemonset}}'
datasource:
type: prometheus
uid: atlas-vm
- refId: B
datasourceUid: __expr__
model:
expression: A
intervalMs: 60000
maxDataPoints: 43200
reducer: last
type: reduce
- refId: C
datasourceUid: __expr__
model:
expression: B
intervalMs: 60000
maxDataPoints: 43200
type: threshold
conditions:
- evaluator:
params: [1]
type: lt
operator:
type: and
reducer:
type: last
type: query
noDataState: NoData
execErrState: Error
annotations:
summary: "node-log-rotation is not fully ready"
labels:
severity: warning
- uid: maint-ariadne-image-sweeper-stale
title: "Ariadne image sweeper stale (schedule >24h)"
condition: C
for: "5m"
data:
- refId: A
relativeTimeRange:
from: 300
to: 0
datasourceUid: atlas-vm
model:
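                  # Seconds since the image-sweeper schedule last succeeded.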
expr: time() - ariadne_schedule_last_success_timestamp_seconds{task="schedule.image_sweeper"}
intervalMs: 60000
maxDataPoints: 43200
legendFormat: '{{task}}'
datasource:
type: prometheus
uid: atlas-vm
- refId: B
datasourceUid: __expr__
model:
expression: A
intervalMs: 60000
maxDataPoints: 43200
reducer: last
type: reduce
- refId: C
datasourceUid: __expr__
model:
expression: B
intervalMs: 60000
maxDataPoints: 43200
type: threshold
conditions:
- evaluator:
params: [86400]
type: gt
operator:
type: and
reducer:
type: last
type: query
noDataState: OK
execErrState: Error
annotations:
summary: "Ariadne image sweeper stale >24h since last success"
labels:
severity: warning
- uid: maint-cron-stale
title: "Maintenance CronJobs stale (legacy disabled)"
condition: C
for: "5m"
data:
- refId: A
relativeTimeRange:
from: 300
to: 0
datasourceUid: atlas-vm
model:
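                  # Constant 0 tombstone: the legacy CronJob alert is retired,
                  # and 0 can never exceed the threshold of 1.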
expr: vector(0)
intervalMs: 60000
maxDataPoints: 43200
legendFormat: legacy
datasource:
type: prometheus
uid: atlas-vm
- refId: B
datasourceUid: __expr__
model:
expression: A
intervalMs: 60000
maxDataPoints: 43200
reducer: last
type: reduce
- refId: C
datasourceUid: __expr__
model:
expression: B
intervalMs: 60000
maxDataPoints: 43200
type: threshold
conditions:
- evaluator:
params: [1]
type: gt
operator:
type: and
reducer:
type: last
type: query
noDataState: OK
execErrState: OK
annotations:
summary: "Legacy cronjob alert disabled"
labels:
severity: info
- uid: maint-soteria-refresh-stale
title: "Soteria inventory refresh stale (>15m)"
condition: C
for: "15m"
data:
- refId: A
relativeTimeRange:
from: 900
to: 0
datasourceUid: atlas-vm
model:
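                  # Age in seconds of the last Soteria inventory refresh.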
expr: time() - soteria_inventory_refresh_timestamp_seconds
intervalMs: 60000
maxDataPoints: 43200
legendFormat: soteria-refresh-age-seconds
datasource:
type: prometheus
uid: atlas-vm
- refId: B
datasourceUid: __expr__
model:
expression: A
intervalMs: 60000
maxDataPoints: 43200
reducer: last
type: reduce
- refId: C
datasourceUid: __expr__
model:
expression: B
intervalMs: 60000
maxDataPoints: 43200
type: threshold
conditions:
- evaluator:
params: [900]
type: gt
operator:
type: and
reducer:
type: last
type: query
noDataState: Alerting
execErrState: Alerting
annotations:
summary: "Soteria inventory telemetry has not refreshed in >15m"
labels:
severity: warning
- uid: maint-soteria-backup-unhealthy
title: "Soteria reports unhealthy PVC backups"
condition: C
for: "10m"
data:
- refId: A
relativeTimeRange:
from: 600
to: 0
datasourceUid: atlas-vm
model:
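                  # Counts PVCs with pvc_backup_health == 0 (`> bool 0` yields
                  # 0/1 per series); `or on() vector(0)` pins the result to 0
                  # when no series exist, keeping the rule OK instead of NoData.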
expr: sum((1 - pvc_backup_health) > bool 0) or on() vector(0)
intervalMs: 60000
maxDataPoints: 43200
legendFormat: unhealthy-pvcs
datasource:
type: prometheus
uid: atlas-vm
- refId: B
datasourceUid: __expr__
model:
expression: A
intervalMs: 60000
maxDataPoints: 43200
reducer: last
type: reduce
- refId: C
datasourceUid: __expr__
model:
expression: B
intervalMs: 60000
maxDataPoints: 43200
type: threshold
conditions:
- evaluator:
params: [0]
type: gt
operator:
type: and
reducer:
type: last
type: query
noDataState: OK
execErrState: Alerting
annotations:
summary: "One or more PVCs are stale, missing, or failed per Soteria backup health"
labels:
severity: warning
- uid: maint-soteria-b2-scan-unhealthy
title: "Soteria B2 usage scan failing or stale"
condition: C
for: "15m"
data:
- refId: A
relativeTimeRange:
from: 1800
to: 0
datasourceUid: atlas-vm
model:
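                  # Non-zero when the B2 scan reports failure with results older
                  # than 10m, or has produced nothing for 30m; vector(0) keeps
                  # the series present when the metrics are absent.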
expr: sum((((soteria_b2_scan_success < bool 1) and (time() - soteria_b2_scan_timestamp_seconds > 600)) or (time() - soteria_b2_scan_timestamp_seconds > 1800))) or on() vector(0)
intervalMs: 60000
maxDataPoints: 43200
legendFormat: soteria-b2-scan-unhealthy
datasource:
type: prometheus
uid: atlas-vm
- refId: B
datasourceUid: __expr__
model:
expression: A
intervalMs: 60000
maxDataPoints: 43200
reducer: last
type: reduce
- refId: C
datasourceUid: __expr__
model:
expression: B
intervalMs: 60000
maxDataPoints: 43200
type: threshold
conditions:
- evaluator:
params: [0]
type: gt
operator:
type: and
reducer:
type: last
type: query
noDataState: OK
execErrState: Alerting
annotations:
summary: "Soteria B2 consumption scan is failing or stale for >15m"
labels:
severity: warning
- uid: maint-soteria-authz-denials
title: "Soteria authorization denials elevated"
condition: C
for: "10m"
data:
- refId: A
relativeTimeRange:
from: 900
to: 0
datasourceUid: atlas-vm
model:
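                  # Authorization denials over the trailing 15m; vector(0)
                  # covers the case where the counter has never been scraped.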
expr: sum(increase(soteria_authz_denials_total[15m])) or on() vector(0)
intervalMs: 60000
maxDataPoints: 43200
legendFormat: soteria-authz-denials-15m
datasource:
type: prometheus
uid: atlas-vm
- refId: B
datasourceUid: __expr__
model:
expression: A
intervalMs: 60000
maxDataPoints: 43200
reducer: last
type: reduce
- refId: C
datasourceUid: __expr__
model:
expression: B
intervalMs: 60000
maxDataPoints: 43200
type: threshold
conditions:
- evaluator:
params: [10]
type: gt
operator:
type: and
reducer:
type: last
type: query
noDataState: OK
execErrState: Alerting
annotations:
summary: "Soteria saw >10 authorization denials in 15m"
labels:
severity: warning
- uid: maint-soteria-backup-job-storm
title: "Soteria backup job creation spike"
condition: C
for: "5m"
data:
- refId: A
relativeTimeRange:
from: 600
to: 0
datasourceUid: atlas-vm
model:
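                  # Intended to spot a burst of soteria-backup-* Job creations
                  # in 10m. kube_job_created is a creation-timestamp gauge, so
                  # this leans on how the datasource's increase() treats newly
                  # appearing series.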
expr: sum(increase(kube_job_created{namespace="maintenance",job_name=~"soteria-backup-.*"}[10m])) or on() vector(0)
intervalMs: 60000
maxDataPoints: 43200
legendFormat: soteria-backup-jobs-created-10m
datasource:
type: prometheus
uid: atlas-vm
- refId: B
datasourceUid: __expr__
model:
expression: A
intervalMs: 60000
maxDataPoints: 43200
reducer: last
type: reduce
- refId: C
datasourceUid: __expr__
model:
expression: B
intervalMs: 60000
maxDataPoints: 43200
type: threshold
conditions:
- evaluator:
params: [8]
type: gt
operator:
type: and
reducer:
type: last
type: query
noDataState: OK
execErrState: Alerting
annotations:
summary: "Soteria created >8 backup jobs in 10m (possible scheduler storm)"
labels:
severity: warning
- orgId: 1
name: ariadne
folder: Alerts
interval: 1m
rules:
- uid: ariadne-schedule-error
title: "Ariadne schedule task failed"
condition: C
for: "15m"
data:
- refId: A
relativeTimeRange:
from: 300
to: 0
datasourceUid: atlas-vm
model:
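                  # Per-task failure age: (1 - last_status) is 1 only when the
                  # last run failed, so healthy tasks contribute 0 and failing
                  # tasks report seconds since their last success.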
expr: max by (task) (((time() - ariadne_schedule_last_success_timestamp_seconds{task=~"schedule\\..+"}) * on(task) group_left() (1 - ariadne_schedule_last_status{task=~"schedule\\..+"})))
intervalMs: 60000
maxDataPoints: 43200
legendFormat: '{{task}}'
datasource:
type: prometheus
uid: atlas-vm
- refId: B
datasourceUid: __expr__
model:
expression: A
intervalMs: 60000
maxDataPoints: 43200
reducer: last
type: reduce
- refId: C
datasourceUid: __expr__
model:
expression: B
intervalMs: 60000
maxDataPoints: 43200
type: threshold
conditions:
- evaluator:
params: [3600]
type: gt
operator:
type: and
reducer:
type: last
type: query
noDataState: OK
execErrState: Error
annotations:
summary: "Ariadne schedule has failed for >1h ({{ $labels.task }})"
labels:
severity: warning
- uid: ariadne-scheduler-stalled
title: "Ariadne scheduler behind (>15m)"
condition: C
for: "10m"
data:
- refId: A
relativeTimeRange:
from: 300
to: 0
datasourceUid: atlas-vm
model:
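                  # Seconds past each task's scheduled next run; positive and
                  # growing means the scheduler is overdue.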
expr: time() - ariadne_schedule_next_run_timestamp_seconds{task=~"schedule\\..+"}
intervalMs: 60000
maxDataPoints: 43200
legendFormat: '{{task}}'
datasource:
type: prometheus
uid: atlas-vm
- refId: B
datasourceUid: __expr__
model:
expression: A
intervalMs: 60000
maxDataPoints: 43200
reducer: last
type: reduce
- refId: C
datasourceUid: __expr__
model:
expression: B
intervalMs: 60000
maxDataPoints: 43200
type: threshold
conditions:
- evaluator:
params: [900]
type: gt
operator:
type: and
reducer:
type: last
type: query
noDataState: OK
execErrState: Error
annotations:
summary: "Ariadne scheduler behind for {{ $labels.task }}"
labels:
severity: warning
- orgId: 1
name: postmark
folder: Alerts
interval: 1m
rules:
- uid: postmark-bounce
title: "Postmark bounce rate high"
condition: C
for: "10m"
data:
- refId: A
relativeTimeRange:
from: 300
to: 0
datasourceUid: atlas-vm
model:
intervalMs: 60000
maxDataPoints: 43200
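                  # Worst 1-day outbound bounce rate (percent) from the Postmark
                  # exporter; vector(0) keeps the rule OK when no data exists.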
expr: max(postmark_outbound_bounce_rate{window="1d"}) or on() vector(0)
legendFormat: bounce 1d
datasource:
type: prometheus
uid: atlas-vm
- refId: B
datasourceUid: __expr__
model:
expression: A
intervalMs: 60000
maxDataPoints: 43200
reducer: last
type: reduce
- refId: C
datasourceUid: __expr__
model:
expression: B
intervalMs: 60000
maxDataPoints: 43200
type: threshold
conditions:
- evaluator:
params: [5]
type: gt
operator:
type: and
reducer:
type: last
type: query
noDataState: OK
execErrState: Error
annotations:
summary: "Postmark 1d bounce rate >5%"
labels:
severity: warning
- uid: postmark-api-down
title: "Postmark exporter down"
condition: C
for: "20m"
data:
- refId: A
relativeTimeRange:
from: 300
to: 0
datasourceUid: atlas-vm
model:
intervalMs: 60000
maxDataPoints: 43200
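                  # Mean of postmark_api_up over 15m (1 = fully up); any scrape
                  # that saw the API down pulls this below 1, and the vector(0)
                  # fallback makes a vanished exporter read as 0.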
expr: avg_over_time(postmark_api_up[15m]) or on() vector(0)
legendFormat: api up
datasource:
type: prometheus
uid: atlas-vm
- refId: B
datasourceUid: __expr__
model:
expression: A
intervalMs: 60000
maxDataPoints: 43200
reducer: last
type: reduce
- refId: C
datasourceUid: __expr__
model:
expression: B
intervalMs: 60000
maxDataPoints: 43200
type: threshold
conditions:
- evaluator:
params: [1]
type: lt
operator:
type: and
reducer:
type: last
type: query
noDataState: NoData
execErrState: Error
annotations:
summary: "Postmark exporter reports sustained API outage"
labels:
severity: warning