# services/monitoring/grafana-alerting-config.yaml
apiVersion: v1
kind: ConfigMap
metadata:
  name: grafana-alerting
  namespace: monitoring
  labels:
    grafana_alerting: "1"
data:
  alerting.yaml: |
    apiVersion: 1
    contactPoints:
      - orgId: 1
        name: email-admins
        receivers:
          - uid: email-admins
            type: email
            settings:
              addresses: ${GRAFANA_ALERT_EMAILS}
              singleEmail: true
    policies:
      - orgId: 1
        receiver: email-admins
        group_by:
          - alertname
  rules.yaml: |
    apiVersion: 1
    # Every rule below follows the same three-node pipeline:
    #   A = PromQL query against the atlas-vm datasource
    #   B = reduce A to its last value
    #   C = threshold on B (the alert condition)
    groups:
      - orgId: 1
        name: atlas-disk
        folder: Alerts
        interval: 1m
        rules:
          - uid: disk-pressure-root
            title: "Node rootfs high (>80%)"
            condition: C
            for: "10m"
            data:
              - refId: A
                relativeTimeRange:
                  from: 600
                  to: 0
                datasourceUid: atlas-vm
                model:
                  intervalMs: 60000
                  maxDataPoints: 43200
                  # Rootfs usage percent per instance, joined to the readable
                  # node name carried by node_uname_info.
                  expr: avg by (node) ((avg by (instance) ((1 - (node_filesystem_avail_bytes{mountpoint="/",fstype!~"tmpfs|overlay"} / node_filesystem_size_bytes{mountpoint="/",fstype!~"tmpfs|overlay"})) * 100)) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=""}, "node", "$1", "nodename", "(.*)"))
                  legendFormat: '{{node}}'
                  datasource:
                    type: prometheus
                    uid: atlas-vm
              - refId: B
                datasourceUid: __expr__
                model:
                  expression: A
                  intervalMs: 60000
                  maxDataPoints: 43200
                  reducer: last
                  type: reduce
              - refId: C
                datasourceUid: __expr__
                model:
                  expression: B
                  intervalMs: 60000
                  maxDataPoints: 43200
                  type: threshold
                  conditions:
                    - evaluator:
                        params: [80]
                        type: gt
                      operator:
                        type: and
                      reducer:
                        type: last
                      type: query
            noDataState: NoData
            execErrState: Error
            annotations:
              summary: "{{ $labels.node }} rootfs >80% for 10m"
            labels:
              severity: warning
          - uid: disk-growth-1h
            title: "Node rootfs growing fast (>1Gi in 1h)"
            condition: C
            for: "10m"
            data:
              - refId: A
                relativeTimeRange:
                  from: 3600
                  to: 0
                datasourceUid: atlas-vm
                model:
                  intervalMs: 60000
                  maxDataPoints: 43200
                  # Growth of used bytes over the last hour, in GiB. The inner
                  # expression is not a plain selector, so it needs subquery
                  # syntax ([1h:1m]); a bare [1h] would not parse.
                  expr: increase((node_filesystem_size_bytes{mountpoint="/",fstype!~"tmpfs|overlay"} - node_filesystem_free_bytes{mountpoint="/",fstype!~"tmpfs|overlay"})[1h:1m]) / 1024 / 1024 / 1024
                  legendFormat: '{{instance}}'
                  datasource:
                    type: prometheus
                    uid: atlas-vm
              - refId: B
                datasourceUid: __expr__
                model:
                  expression: A
                  intervalMs: 60000
                  maxDataPoints: 43200
                  reducer: last
                  type: reduce
              - refId: C
                datasourceUid: __expr__
                model:
                  expression: B
                  intervalMs: 60000
                  maxDataPoints: 43200
                  type: threshold
                  conditions:
                    - evaluator:
                        params: [1]
                        type: gt
                      operator:
                        type: and
                      reducer:
                        type: last
                      type: query
            noDataState: NoData
            execErrState: Error
            annotations:
              summary: "{{ $labels.instance }} rootfs grew >1Gi in the last hour"
            labels:
              severity: warning
      - orgId: 1
        name: atlas-cpu
        folder: Alerts
        interval: 1m
        rules:
          - uid: cpu-high-10m
            title: "Node CPU high (>90% for 10m)"
            condition: C
            for: "10m"
            data:
              - refId: A
                relativeTimeRange:
                  from: 600
                  to: 0
                datasourceUid: atlas-vm
                model:
                  intervalMs: 60000
                  maxDataPoints: 43200
                  # 10m average of per-instance busy-CPU percent, joined to the
                  # node name. The subquery belongs inside avg_over_time().
                  expr: avg_over_time(((1 - avg by (instance) (rate(node_cpu_seconds_total{mode="idle"}[5m]))) * 100)[10m:1m]) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=""}, "node", "$1", "nodename", "(.*)")
                  legendFormat: '{{instance}}'
                  datasource:
                    type: prometheus
                    uid: atlas-vm
              - refId: B
                datasourceUid: __expr__
                model:
                  expression: A
                  intervalMs: 60000
                  maxDataPoints: 43200
                  reducer: last
                  type: reduce
              - refId: C
                datasourceUid: __expr__
                model:
                  expression: B
                  intervalMs: 60000
                  maxDataPoints: 43200
                  type: threshold
                  conditions:
                    - evaluator:
                        params: [90]
                        type: gt
                      operator:
                        type: and
                      reducer:
                        type: last
                      type: query
            noDataState: NoData
            execErrState: Error
            annotations:
              summary: "{{ $labels.node }} CPU >90% for 10m"
            labels:
              severity: warning
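      # A quick sanity check for any expr above before Grafana evaluates it:
      # run it against the datasource's Prometheus-compatible query API. The
      # in-cluster service name below is an assumption; substitute whatever
      # the atlas-vm datasource actually points at.
      #
      #   kubectl -n monitoring port-forward svc/victoriametrics 8428:8428
      #   curl -s http://localhost:8428/api/v1/query --data-urlencode \
      #     'query=avg_over_time(((1 - avg by (instance) (rate(node_cpu_seconds_total{mode="idle"}[5m]))) * 100)[10m:1m])'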
      - orgId: 1
        name: atlas-metrics
        folder: Alerts
        interval: 1m
        rules:
          - uid: victoria-metrics-down
            title: "VictoriaMetrics unavailable (>30m)"
            condition: C
            for: "30m"
            data:
              - refId: A
                relativeTimeRange:
                  from: 600
                  to: 0
                datasourceUid: atlas-vm
                model:
                  intervalMs: 60000
                  maxDataPoints: 43200
                  expr: sum(up{job="victoriametrics"})
                  legendFormat: victoriametrics
                  datasource:
                    type: prometheus
                    uid: atlas-vm
              - refId: B
                datasourceUid: __expr__
                model:
                  expression: A
                  intervalMs: 60000
                  maxDataPoints: 43200
                  reducer: last
                  type: reduce
              - refId: C
                datasourceUid: __expr__
                model:
                  expression: B
                  intervalMs: 60000
                  maxDataPoints: 43200
                  type: threshold
                  conditions:
                    - evaluator:
                        params: [1]
                        type: lt
                      operator:
                        type: and
                      reducer:
                        type: last
                      type: query
            noDataState: Alerting
            execErrState: Alerting
            annotations:
              summary: "VictoriaMetrics is unavailable for >30m"
            labels:
              severity: critical
      - orgId: 1
        name: maintenance
        folder: Alerts
        interval: 1m
        rules:
          - uid: maint-sweeper
            title: "Maintenance sweeper not ready"
            condition: C
            for: "5m"
            data:
              - refId: A
                relativeTimeRange:
                  from: 300
                  to: 0
                datasourceUid: atlas-vm
                model:
                  intervalMs: 60000
                  maxDataPoints: 43200
                  expr: kube_daemonset_status_number_ready{namespace="maintenance",daemonset="node-image-sweeper"} / on(namespace,daemonset) kube_daemonset_status_desired_number_scheduled{namespace="maintenance",daemonset="node-image-sweeper"}
                  legendFormat: '{{daemonset}}'
                  datasource:
                    type: prometheus
                    uid: atlas-vm
              - refId: B
                datasourceUid: __expr__
                model:
                  expression: A
                  intervalMs: 60000
                  maxDataPoints: 43200
                  reducer: last
                  type: reduce
              - refId: C
                datasourceUid: __expr__
                model:
                  expression: B
                  intervalMs: 60000
                  maxDataPoints: 43200
                  type: threshold
                  conditions:
                    - evaluator:
                        params: [1]
                        type: lt
                      operator:
                        type: and
                      reducer:
                        type: last
                      type: query
            noDataState: NoData
            execErrState: Error
            annotations:
              summary: "node-image-sweeper not fully ready"
            labels:
              severity: warning
          - uid: maint-cron-stale
            title: "Maintenance CronJobs stale (>3h since success)"
            condition: C
            for: "5m"
            data:
              - refId: A
                relativeTimeRange:
                  from: 300
                  to: 0
                datasourceUid: atlas-vm
                model:
                  intervalMs: 60000
                  maxDataPoints: 43200
                  expr: time() - max by (cronjob) (kube_cronjob_status_last_successful_time{namespace="maintenance",cronjob="image-sweeper"}) and on(cronjob) (kube_cronjob_spec_suspend{namespace="maintenance",cronjob="image-sweeper"} == 0)
                  legendFormat: '{{cronjob}}'
                  datasource:
                    type: prometheus
                    uid: atlas-vm
              - refId: B
                datasourceUid: __expr__
                model:
                  expression: A
                  intervalMs: 60000
                  maxDataPoints: 43200
                  reducer: last
                  type: reduce
              - refId: C
                datasourceUid: __expr__
                model:
                  expression: B
                  intervalMs: 60000
                  maxDataPoints: 43200
                  type: threshold
                  conditions:
                    - evaluator:
                        params: [10800]
                        type: gt
                      operator:
                        type: and
                      reducer:
                        type: last
                      type: query
            noDataState: NoData
            execErrState: Error
            annotations:
              summary: "Maintenance cronjob stale >3h since last success"
            labels:
              severity: warning
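      # To cross-check the staleness rule by hand: kube-state-metrics derives
      # kube_cronjob_status_last_successful_time from the CronJob's status, so
      # the same timestamp is visible directly on the object. Compare its age
      # against the rule's 10800 s (3 h) threshold:
      #
      #   kubectl -n maintenance get cronjob image-sweeper \
      #     -o jsonpath='{.status.lastSuccessfulTime}'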
      - orgId: 1
        name: postmark
        folder: Alerts
        interval: 1m
        rules:
          - uid: postmark-bounce
            title: "Postmark bounce rate high"
            condition: C
            for: "10m"
            data:
              - refId: A
                relativeTimeRange:
                  from: 300
                  to: 0
                datasourceUid: atlas-vm
                model:
                  intervalMs: 60000
                  maxDataPoints: 43200
                  expr: POSTMARK_OUTBOUND_BOUNCE_RATE{window="1d"}
                  legendFormat: bounce 1d
                  datasource:
                    type: prometheus
                    uid: atlas-vm
              - refId: B
                datasourceUid: __expr__
                model:
                  expression: A
                  intervalMs: 60000
                  maxDataPoints: 43200
                  reducer: last
                  type: reduce
              - refId: C
                datasourceUid: __expr__
                model:
                  expression: B
                  intervalMs: 60000
                  maxDataPoints: 43200
                  type: threshold
                  conditions:
                    - evaluator:
                        params: [5]
                        type: gt
                      operator:
                        type: and
                      reducer:
                        type: last
                      type: query
            noDataState: NoData
            execErrState: Error
            annotations:
              summary: "Postmark 1d bounce rate >5%"
            labels:
              severity: warning
          - uid: postmark-api-down
            title: "Postmark exporter down"
            condition: C
            for: "5m"
            data:
              - refId: A
                relativeTimeRange:
                  from: 300
                  to: 0
                datasourceUid: atlas-vm
                model:
                  intervalMs: 60000
                  maxDataPoints: 43200
                  expr: POSTMARK_API_UP
                  legendFormat: api up
                  datasource:
                    type: prometheus
                    uid: atlas-vm
              - refId: B
                datasourceUid: __expr__
                model:
                  expression: A
                  intervalMs: 60000
                  maxDataPoints: 43200
                  reducer: last
                  type: reduce
              - refId: C
                datasourceUid: __expr__
                model:
                  expression: B
                  intervalMs: 60000
                  maxDataPoints: 43200
                  type: threshold
                  conditions:
                    - evaluator:
                        params: [1]
                        type: lt
                      operator:
                        type: and
                      reducer:
                        type: last
                      type: query
            noDataState: NoData
            execErrState: Error
            annotations:
              summary: "Postmark exporter reports API down"
            labels:
              severity: critical
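# Wiring notes (assumptions about the surrounding Grafana deployment, not
# guaranteed by this file alone):
# - Grafana loads these files only once they land in
#   /etc/grafana/provisioning/alerting, whether mounted directly as a
#   configMap volume or copied in by a sidecar watching the
#   grafana_alerting: "1" label.
# - ${GRAFANA_ALERT_EMAILS} relies on Grafana's environment-variable
#   interpolation for provisioning files, so the variable must be set on the
#   Grafana container itself.
# - A post-restart smoke test (admin credentials and service URL assumed):
#     curl -s -u admin:$GRAFANA_ADMIN_PASSWORD \
#       http://grafana.monitoring.svc/api/v1/provisioning/alert-rules | jq -r '.[].title'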