# services/monitoring/grafana-alerting-config.yaml
apiVersion: v1
kind: ConfigMap
metadata:
  name: grafana-alerting
  namespace: monitoring
  labels:
    grafana_alerting: "1"
data:
  # Contact points and notification policies (Grafana unified alerting, file provisioning).
  alerting.yaml: |
    apiVersion: 1
    contactPoints:
      - orgId: 1
        name: email-admins
        receivers:
          - uid: email-admins
            type: email
            settings:
              addresses: ${GRAFANA_ALERT_EMAILS}
              singleEmail: true
    policies:
      - orgId: 1
        receiver: email-admins
        group_by:
          - alertname
  # Alert rule groups. Each rule queries Prometheus (datasource uid atlas-vm, refId A),
  # reduces to the last value (refId B), then applies a threshold (refId C).
  rules.yaml: |
    apiVersion: 1
    groups:
      - orgId: 1
        name: atlas-disk
        folder: Alerts
        interval: 1m
        rules:
          # Root filesystem usage per node, joined to the human-readable node
          # name via node_uname_info.
          - uid: disk-pressure-root
            title: "Node rootfs high (>80%)"
            condition: C
            data:
              - refId: A
                relativeTimeRange:
                  from: 600
                  to: 0
                datasourceUid: atlas-vm
                model:
                  intervalMs: 60000
                  maxDataPoints: 43200
                  expr: avg by (node) ((avg by (instance) ((1 - (node_filesystem_avail_bytes{mountpoint="/",fstype!~"tmpfs|overlay"} / node_filesystem_size_bytes{mountpoint="/",fstype!~"tmpfs|overlay"})) * 100)) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=""}, "node", "$1", "nodename", "(.*)"))
                  legendFormat: '{{node}}'
                  datasource:
                    type: prometheus
                    uid: atlas-vm
              - refId: B
                datasourceUid: __expr__
                model:
                  expression: A
                  intervalMs: 60000
                  maxDataPoints: 43200
                  reducer: last
                  type: reduce
              - refId: C
                datasourceUid: __expr__
                model:
                  expression: B
                  intervalMs: 60000
                  maxDataPoints: 43200
                  type: threshold
                  conditions:
                    - evaluator:
                        params: [80]
                        type: gt
                      operator:
                        type: and
                      reducer:
                        type: last
                      type: query
            # Assumed pending period, to match the "for 10m" in the summary below.
            for: 10m
            noDataState: NoData
            execErrState: Error
            annotations:
              summary: "{{ $labels.node }} rootfs >80% for 10m"
            labels:
              severity: warning
      - orgId: 1
        name: maintenance
        folder: Alerts
        interval: 1m
        rules:
          # Ready/desired ratio for the node-image-sweeper DaemonSet; alerts when < 1.
          - uid: maint-sweeper
            title: "Maintenance sweeper not ready"
            condition: C
            data:
              - refId: A
                relativeTimeRange:
                  from: 300
                  to: 0
                datasourceUid: atlas-vm
                model:
                  intervalMs: 60000
                  maxDataPoints: 43200
                  expr: kube_daemonset_status_number_ready{namespace="maintenance",daemonset="node-image-sweeper"} / on(namespace,daemonset) kube_daemonset_status_desired_number_scheduled{namespace="maintenance",daemonset="node-image-sweeper"}
                  legendFormat: '{{daemonset}}'
                  datasource:
                    type: prometheus
                    uid: atlas-vm
              - refId: B
                datasourceUid: __expr__
                model:
                  expression: A
                  intervalMs: 60000
                  maxDataPoints: 43200
                  reducer: last
                  type: reduce
              - refId: C
                datasourceUid: __expr__
                model:
                  expression: B
                  intervalMs: 60000
                  maxDataPoints: 43200
                  type: threshold
                  conditions:
                    - evaluator:
                        params: [1]
                        type: lt
                      operator:
                        type: and
                      reducer:
                        type: last
                      type: query
            noDataState: NoData
            execErrState: Error
            annotations:
              summary: "node-image-sweeper not fully ready"
            labels:
              severity: warning
      - orgId: 1
        name: postmark
        folder: Alerts
        interval: 1m
        rules:
          # Outbound bounce rate over the trailing day, as exposed by the Postmark exporter.
          - uid: postmark-bounce
            title: "Postmark bounce rate high"
            condition: C
            data:
              - refId: A
                relativeTimeRange:
                  from: 300
                  to: 0
                datasourceUid: atlas-vm
                model:
                  intervalMs: 60000
                  maxDataPoints: 43200
                  expr: POSTMARK_OUTBOUND_BOUNCE_RATE{window="1d"}
                  legendFormat: bounce 1d
                  datasource:
                    type: prometheus
                    uid: atlas-vm
              - refId: B
                datasourceUid: __expr__
                model:
                  expression: A
                  intervalMs: 60000
                  maxDataPoints: 43200
                  reducer: last
                  type: reduce
              - refId: C
                datasourceUid: __expr__
                model:
                  expression: B
                  intervalMs: 60000
                  maxDataPoints: 43200
                  type: threshold
                  conditions:
                    - evaluator:
                        params: [5]
                        type: gt
                      operator:
                        type: and
                      reducer:
                        type: last
                      type: query
            noDataState: NoData
            execErrState: Error
            annotations:
              summary: "Postmark 1d bounce rate >5%"
            labels:
              severity: warning
          # Exporter-reported API health; a value below 1 means the Postmark API is unreachable.
          - uid: postmark-api-down
            title: "Postmark exporter down"
            condition: C
            data:
              - refId: A
                relativeTimeRange:
                  from: 300
                  to: 0
                datasourceUid: atlas-vm
                model:
                  intervalMs: 60000
                  maxDataPoints: 43200
                  expr: POSTMARK_API_UP
                  legendFormat: api up
                  datasource:
                    type: prometheus
                    uid: atlas-vm
              - refId: B
                datasourceUid: __expr__
                model:
                  expression: A
                  intervalMs: 60000
                  maxDataPoints: 43200
                  reducer: last
                  type: reduce
              - refId: C
                datasourceUid: __expr__
                model:
                  expression: B
                  intervalMs: 60000
                  maxDataPoints: 43200
                  type: threshold
                  conditions:
                    - evaluator:
                        params: [1]
                        type: lt
                      operator:
                        type: and
                      reducer:
                        type: last
                      type: query
            noDataState: NoData
            execErrState: Error
            annotations:
              summary: "Postmark exporter reports API down"
            labels:
              severity: critical
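# A minimal sketch of how this ConfigMap is typically consumed: Grafana loads
# alerting provisioning files from /etc/grafana/provisioning/alerting, so the
# two data keys above are mounted there. The container/volume names below are
# assumptions about the surrounding Grafana Deployment, not part of this file:
#
#   containers:
#     - name: grafana
#       volumeMounts:
#         - name: alerting-provisioning
#           mountPath: /etc/grafana/provisioning/alerting
#           readOnly: true
#   volumes:
#     - name: alerting-provisioning
#       configMap:
#         name: grafana-alerting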