# services/monitoring/grafana-alerting-config.yaml
apiVersion: v1
kind: ConfigMap
metadata:
  name: grafana-alerting
  namespace: monitoring
  labels:
    grafana_alerting: "1"
data:
  alerting.yaml: |
    apiVersion: 1
    contactPoints:
      - orgId: 1
        name: email-admins
        receivers:
          - uid: email-admins
            type: email
            settings:
              addresses: ${GRAFANA_ALERT_EMAILS}
              singleEmail: true
    policies:
      - orgId: 1
        receiver: email-admins
        group_by:
          - grafana_folder
          - alertname
        group_wait: 1m
        group_interval: 30m
        repeat_interval: 12h
        routes:
          - receiver: email-admins
            object_matchers:
              - [severity, "=", "critical"]
            group_wait: 30s
            group_interval: 15m
            repeat_interval: 4h
          - receiver: email-admins
            object_matchers:
              - [severity, "=", "warning"]
            group_wait: 10m
            group_interval: 4h
            repeat_interval: 48h
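  # Routing sketch: the root policy batches alerts by folder and name and
  # re-sends unresolved ones every 12h; the severity=critical route tightens
  # this to a 30s group wait with 4h repeats, while severity=warning batches
  # more slowly (10m wait, 48h repeats). Hypothetical extension: muting
  # warnings off-hours would take a muteTimes entry in this file plus
  # `mute_time_intervals: [off-hours]` on the warning route.
  # Every rule in rules.yaml below follows the same three-stage chain used by
  # Grafana unified alerting: refId A queries the atlas-vm datasource, B
  # reduces the series to its last value, and C applies the threshold that
  # `condition: C` points at.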
  rules.yaml: |
    apiVersion: 1
    groups:
      - orgId: 1
        name: atlas-disk
        folder: Alerts
        interval: 1m
        rules:
          - uid: disk-pressure-root
            title: "Node rootfs high (>85%)"
            condition: C
            for: "10m"
            data:
              - refId: A
                relativeTimeRange:
                  from: 600
                  to: 0
                datasourceUid: atlas-vm
                model:
                  intervalMs: 60000
                  maxDataPoints: 43200
                  expr: avg by (node) ((avg by (instance) ((1 - (node_filesystem_avail_bytes{mountpoint="/",fstype!~"tmpfs|overlay"} / node_filesystem_size_bytes{mountpoint="/",fstype!~"tmpfs|overlay"})) * 100)) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=""}, "node", "$1", "nodename", "(.*)"))
                  legendFormat: '{{node}}'
                  datasource:
                    type: prometheus
                    uid: atlas-vm
              - refId: B
                datasourceUid: __expr__
                model:
                  expression: A
                  intervalMs: 60000
                  maxDataPoints: 43200
                  reducer: last
                  type: reduce
              - refId: C
                datasourceUid: __expr__
                model:
                  expression: B
                  intervalMs: 60000
                  maxDataPoints: 43200
                  type: threshold
                  conditions:
                    - evaluator:
                        params: [85]
                        type: gt
                      operator:
                        type: and
                      reducer:
                        type: last
                      type: query
            noDataState: NoData
            execErrState: Error
            annotations:
              summary: "{{ $labels.node }} rootfs >85% for 10m"
            labels:
              severity: warning
          - uid: disk-growth-1h
            title: "Node rootfs growing fast (>3Gi in 1h)"
            condition: C
            for: "30m"
            data:
              - refId: A
                relativeTimeRange:
                  from: 3600
                  to: 0
                datasourceUid: atlas-vm
                model:
                  intervalMs: 60000
                  maxDataPoints: 43200
                  expr: max by (instance, node) ((increase((node_filesystem_size_bytes{mountpoint="/",fstype!~"tmpfs|overlay"} - node_filesystem_free_bytes{mountpoint="/",fstype!~"tmpfs|overlay"})[1h]) / 1024 / 1024 / 1024) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=""}, "node", "$1", "nodename", "(.*)"))
                  legendFormat: '{{node}}'
                  datasource:
                    type: prometheus
                    uid: atlas-vm
              - refId: B
                datasourceUid: __expr__
                model:
                  expression: A
                  intervalMs: 60000
                  maxDataPoints: 43200
                  reducer: last
                  type: reduce
              - refId: C
                datasourceUid: __expr__
                model:
                  expression: B
                  intervalMs: 60000
                  maxDataPoints: 43200
                  type: threshold
                  conditions:
                    - evaluator:
                        params: [3]
                        type: gt
                      operator:
                        type: and
                      reducer:
                        type: last
                      type: query
            noDataState: NoData
            execErrState: Error
            annotations:
              summary: "{{ $labels.node }} rootfs grew >3Gi in the last hour"
            labels:
              severity: warning
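      # Expression note: the disk rules above join instance -> node through
      # label_replace(node_uname_info{...}) so alerts carry a human-readable
      # node name, and disk-growth-1h applies increase() to a derived
      # expression with a plain [1h] range -- accepted by VictoriaMetrics'
      # MetricsQL (implicit subquery); standard PromQL would need the
      # [1h:] subquery form instead.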
      - orgId: 1
        name: atlas-cpu
        folder: Alerts
        interval: 1m
        rules:
          - uid: cpu-high-10m
            title: "Node CPU high (>95% for 20m)"
            condition: C
            for: "20m"
            data:
              - refId: A
                relativeTimeRange:
                  from: 600
                  to: 0
                datasourceUid: atlas-vm
                model:
                  intervalMs: 60000
                  maxDataPoints: 43200
                  expr: ((1 - avg by (instance) (rate(node_cpu_seconds_total{mode="idle"}[5m]))) * 100) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=""}, "node", "$1", "nodename", "(.*)")
                  legendFormat: '{{node}}'
                  datasource:
                    type: prometheus
                    uid: atlas-vm
              - refId: B
                datasourceUid: __expr__
                model:
                  expression: A
                  intervalMs: 60000
                  maxDataPoints: 43200
                  reducer: last
                  type: reduce
              - refId: C
                datasourceUid: __expr__
                model:
                  expression: B
                  intervalMs: 60000
                  maxDataPoints: 43200
                  type: threshold
                  conditions:
                    - evaluator:
                        params: [95]
                        type: gt
                      operator:
                        type: and
                      reducer:
                        type: last
                      type: query
            noDataState: NoData
            execErrState: OK
            annotations:
              summary: "{{ $labels.node }} CPU >95% for 20m"
            labels:
              severity: warning
      - orgId: 1
        name: atlas-metrics
        folder: Alerts
        interval: 1m
        rules:
          - uid: victoria-metrics-down
            title: "VictoriaMetrics unavailable (>30m)"
            condition: C
            for: "30m"
            data:
              - refId: A
                relativeTimeRange:
                  from: 600
                  to: 0
                datasourceUid: atlas-vm
                model:
                  intervalMs: 60000
                  maxDataPoints: 43200
                  expr: sum(up{job="victoriametrics"})
                  legendFormat: victoriametrics
                  datasource:
                    type: prometheus
                    uid: atlas-vm
              - refId: B
                datasourceUid: __expr__
                model:
                  expression: A
                  intervalMs: 60000
                  maxDataPoints: 43200
                  reducer: last
                  type: reduce
              - refId: C
                datasourceUid: __expr__
                model:
                  expression: B
                  intervalMs: 60000
                  maxDataPoints: 43200
                  type: threshold
                  conditions:
                    - evaluator:
                        params: [1]
                        type: lt
                      operator:
                        type: and
                      reducer:
                        type: last
                      type: query
            noDataState: Alerting
            execErrState: Alerting
            annotations:
              summary: "VictoriaMetrics is unavailable for >30m"
            labels:
              severity: critical
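      # State note: victoria-metrics-down sets both noDataState and
      # execErrState to Alerting so it still fires when the datasource itself
      # stops answering; most other rules here keep NoData or OK so ordinary
      # scrape gaps do not page anyone.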
      - orgId: 1
        name: maintenance
        folder: Alerts
        interval: 1m
        rules:
          - uid: maint-sweeper
            title: "Maintenance sweeper not ready"
            condition: C
            for: "5m"
            data:
              - refId: A
                relativeTimeRange:
                  from: 300
                  to: 0
                datasourceUid: atlas-vm
                model:
                  intervalMs: 60000
                  maxDataPoints: 43200
                  expr: kube_daemonset_status_number_ready{namespace="maintenance",daemonset="node-image-sweeper"} / on(namespace,daemonset) kube_daemonset_status_desired_number_scheduled{namespace="maintenance",daemonset="node-image-sweeper"}
                  legendFormat: '{{daemonset}}'
                  datasource:
                    type: prometheus
                    uid: atlas-vm
              - refId: B
                datasourceUid: __expr__
                model:
                  expression: A
                  intervalMs: 60000
                  maxDataPoints: 43200
                  reducer: last
                  type: reduce
              - refId: C
                datasourceUid: __expr__
                model:
                  expression: B
                  intervalMs: 60000
                  maxDataPoints: 43200
                  type: threshold
                  conditions:
                    - evaluator:
                        params: [1]
                        type: lt
                      operator:
                        type: and
                      reducer:
                        type: last
                      type: query
            noDataState: NoData
            execErrState: Error
            annotations:
              summary: "node-image-sweeper not fully ready"
            labels:
              severity: warning
          - uid: logging-node-log-rotation-not-ready
            title: "Node log rotation guardrails not ready"
            condition: C
            for: "10m"
            data:
              - refId: A
                relativeTimeRange:
                  from: 600
                  to: 0
                datasourceUid: atlas-vm
                model:
                  intervalMs: 60000
                  maxDataPoints: 43200
                  expr: kube_daemonset_status_number_ready{namespace="logging",daemonset="node-log-rotation"} / on(namespace,daemonset) kube_daemonset_status_desired_number_scheduled{namespace="logging",daemonset="node-log-rotation"}
                  legendFormat: '{{daemonset}}'
                  datasource:
                    type: prometheus
                    uid: atlas-vm
              - refId: B
                datasourceUid: __expr__
                model:
                  expression: A
                  intervalMs: 60000
                  maxDataPoints: 43200
                  reducer: last
                  type: reduce
              - refId: C
                datasourceUid: __expr__
                model:
                  expression: B
                  intervalMs: 60000
                  maxDataPoints: 43200
                  type: threshold
                  conditions:
                    - evaluator:
                        params: [1]
                        type: lt
                      operator:
                        type: and
                      reducer:
                        type: last
                      type: query
            noDataState: NoData
            execErrState: Error
            annotations:
              summary: "node-log-rotation is not fully ready"
            labels:
              severity: warning
          - uid: maint-ariadne-image-sweeper-stale
            title: "Ariadne image sweeper stale (schedule >24h)"
            condition: C
            for: "5m"
            data:
              - refId: A
                relativeTimeRange:
                  from: 300
                  to: 0
                datasourceUid: atlas-vm
                model:
                  intervalMs: 60000
                  maxDataPoints: 43200
                  expr: time() - ariadne_schedule_last_success_timestamp_seconds{task="schedule.image_sweeper"}
                  legendFormat: '{{task}}'
                  datasource:
                    type: prometheus
                    uid: atlas-vm
              - refId: B
                datasourceUid: __expr__
                model:
                  expression: A
                  intervalMs: 60000
                  maxDataPoints: 43200
                  reducer: last
                  type: reduce
              - refId: C
                datasourceUid: __expr__
                model:
                  expression: B
                  intervalMs: 60000
                  maxDataPoints: 43200
                  type: threshold
                  conditions:
                    - evaluator:
                        params: [86400]
                        type: gt
                      operator:
                        type: and
                      reducer:
                        type: last
                      type: query
            noDataState: OK
            execErrState: Error
            annotations:
              summary: "Ariadne image sweeper stale >24h since last success"
            labels:
              severity: warning
          - uid: maint-cron-stale
            title: "Maintenance CronJobs stale (legacy disabled)"
            condition: C
            for: "5m"
            data:
              - refId: A
                relativeTimeRange:
                  from: 300
                  to: 0
                datasourceUid: atlas-vm
                model:
                  intervalMs: 60000
                  maxDataPoints: 43200
                  expr: vector(0)
                  legendFormat: legacy
                  datasource:
                    type: prometheus
                    uid: atlas-vm
              - refId: B
                datasourceUid: __expr__
                model:
                  expression: A
                  intervalMs: 60000
                  maxDataPoints: 43200
                  reducer: last
                  type: reduce
              - refId: C
                datasourceUid: __expr__
                model:
                  expression: B
                  intervalMs: 60000
                  maxDataPoints: 43200
                  type: threshold
                  conditions:
                    - evaluator:
                        params: [1]
                        type: gt
                      operator:
                        type: and
                      reducer:
                        type: last
                      type: query
            noDataState: OK
            execErrState: OK
            annotations:
              summary: "Legacy cronjob alert disabled"
            labels:
              severity: info
          - uid: maint-soteria-refresh-stale
            title: "Soteria inventory refresh stale (>15m)"
            condition: C
            for: "15m"
            data:
              - refId: A
                relativeTimeRange:
                  from: 900
                  to: 0
                datasourceUid: atlas-vm
                model:
                  intervalMs: 60000
                  maxDataPoints: 43200
                  expr: time() - soteria_inventory_refresh_timestamp_seconds
                  legendFormat: soteria-refresh-age-seconds
                  datasource:
                    type: prometheus
                    uid: atlas-vm
              - refId: B
                datasourceUid: __expr__
                model:
                  expression: A
                  intervalMs: 60000
                  maxDataPoints: 43200
                  reducer: last
                  type: reduce
              - refId: C
                datasourceUid: __expr__
                model:
                  expression: B
                  intervalMs: 60000
                  maxDataPoints: 43200
                  type: threshold
                  conditions:
                    - evaluator:
                        params: [900]
                        type: gt
                      operator:
                        type: and
                      reducer:
                        type: last
                      type: query
            noDataState: Alerting
            execErrState: Alerting
            annotations:
              summary: "Soteria inventory telemetry has not refreshed in >15m"
            labels:
              severity: warning
          - uid: maint-soteria-backup-unhealthy
            title: "Soteria reports unhealthy PVC backups"
            condition: C
            for: "10m"
            data:
              - refId: A
                relativeTimeRange:
                  from: 600
                  to: 0
                datasourceUid: atlas-vm
                model:
                  intervalMs: 60000
                  maxDataPoints: 43200
                  expr: sum((1 - pvc_backup_health) > bool 0) or on() vector(0)
                  legendFormat: unhealthy-pvcs
                  datasource:
                    type: prometheus
                    uid: atlas-vm
              - refId: B
                datasourceUid: __expr__
                model:
                  expression: A
                  intervalMs: 60000
                  maxDataPoints: 43200
                  reducer: last
                  type: reduce
              - refId: C
                datasourceUid: __expr__
                model:
                  expression: B
                  intervalMs: 60000
                  maxDataPoints: 43200
                  type: threshold
                  conditions:
                    - evaluator:
                        params: [0]
                        type: gt
                      operator:
                        type: and
                      reducer:
                        type: last
                      type: query
            noDataState: OK
            execErrState: Alerting
            annotations:
              summary: "One or more PVCs are stale, missing, or failed per Soteria backup health"
            labels:
              severity: warning
          - uid: maint-soteria-b2-scan-unhealthy
            title: "Soteria B2 usage scan failing or stale"
            condition: C
            for: "15m"
            data:
              - refId: A
                relativeTimeRange:
                  from: 1800
                  to: 0
                datasourceUid: atlas-vm
                model:
                  intervalMs: 60000
                  maxDataPoints: 43200
                  expr: sum((((soteria_b2_scan_success < bool 1) and (time() - soteria_b2_scan_timestamp_seconds > 600)) or (time() - soteria_b2_scan_timestamp_seconds > 1800))) or on() vector(0)
                  legendFormat: soteria-b2-scan-unhealthy
                  datasource:
                    type: prometheus
                    uid: atlas-vm
              - refId: B
                datasourceUid: __expr__
                model:
                  expression: A
                  intervalMs: 60000
                  maxDataPoints: 43200
                  reducer: last
                  type: reduce
              - refId: C
                datasourceUid: __expr__
                model:
                  expression: B
                  intervalMs: 60000
                  maxDataPoints: 43200
                  type: threshold
                  conditions:
                    - evaluator:
                        params: [0]
                        type: gt
                      operator:
                        type: and
                      reducer:
                        type: last
                      type: query
            noDataState: OK
            execErrState: Alerting
            annotations:
              summary: "Soteria B2 consumption scan is failing or stale for >15m"
            labels:
              severity: warning
          - uid: maint-soteria-authz-denials
            title: "Soteria authorization denials elevated"
            condition: C
            for: "10m"
            data:
              - refId: A
                relativeTimeRange:
                  from: 900
                  to: 0
                datasourceUid: atlas-vm
                model:
                  intervalMs: 60000
                  maxDataPoints: 43200
                  expr: sum(increase(soteria_authz_denials_total[15m])) or on() vector(0)
                  legendFormat: soteria-authz-denials-15m
                  datasource:
                    type: prometheus
                    uid: atlas-vm
              - refId: B
                datasourceUid: __expr__
                model:
                  expression: A
                  intervalMs: 60000
                  maxDataPoints: 43200
                  reducer: last
                  type: reduce
              - refId: C
                datasourceUid: __expr__
                model:
                  expression: B
                  intervalMs: 60000
                  maxDataPoints: 43200
                  type: threshold
                  conditions:
                    - evaluator:
                        params: [10]
                        type: gt
                      operator:
                        type: and
                      reducer:
                        type: last
                      type: query
            noDataState: OK
            execErrState: Alerting
            annotations:
              summary: "Soteria saw >10 authorization denials in 15m"
            labels:
              severity: warning
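      # Pattern note: the Soteria rules above append `or on() vector(0)` so
      # an empty query result becomes an explicit 0 instead of NoData, which
      # pairs with noDataState: OK to keep scrape gaps from paging.
      # maint-cron-stale is an intentional no-op (vector(0) can never exceed
      # the threshold of 1), kept as a tombstone for the retired legacy
      # CronJob alert.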
      - orgId: 1
        name: ariadne
        folder: Alerts
        interval: 1m
        rules:
          - uid: ariadne-schedule-error
            title: "Ariadne schedule task failed"
            condition: C
            for: "15m"
            data:
              - refId: A
                relativeTimeRange:
                  from: 300
                  to: 0
                datasourceUid: atlas-vm
                model:
                  intervalMs: 60000
                  maxDataPoints: 43200
                  expr: max by (task) (((time() - ariadne_schedule_last_success_timestamp_seconds{task=~"schedule\\..+"}) * on(task) group_left() (1 - ariadne_schedule_last_status{task=~"schedule\\..+"})))
                  legendFormat: '{{task}}'
                  datasource:
                    type: prometheus
                    uid: atlas-vm
              - refId: B
                datasourceUid: __expr__
                model:
                  expression: A
                  intervalMs: 60000
                  maxDataPoints: 43200
                  reducer: last
                  type: reduce
              - refId: C
                datasourceUid: __expr__
                model:
                  expression: B
                  intervalMs: 60000
                  maxDataPoints: 43200
                  type: threshold
                  conditions:
                    - evaluator:
                        params: [3600]
                        type: gt
                      operator:
                        type: and
                      reducer:
                        type: last
                      type: query
            noDataState: OK
            execErrState: Error
            annotations:
              summary: "Ariadne schedule has failed for >1h ({{ $labels.task }})"
            labels:
              severity: warning
          - uid: ariadne-scheduler-stalled
            title: "Ariadne scheduler behind (>15m)"
            condition: C
            for: "10m"
            data:
              - refId: A
                relativeTimeRange:
                  from: 300
                  to: 0
                datasourceUid: atlas-vm
                model:
                  intervalMs: 60000
                  maxDataPoints: 43200
                  expr: time() - ariadne_schedule_next_run_timestamp_seconds{task=~"schedule\\..+"}
                  legendFormat: '{{task}}'
                  datasource:
                    type: prometheus
                    uid: atlas-vm
              - refId: B
                datasourceUid: __expr__
                model:
                  expression: A
                  intervalMs: 60000
                  maxDataPoints: 43200
                  reducer: last
                  type: reduce
              - refId: C
                datasourceUid: __expr__
                model:
                  expression: B
                  intervalMs: 60000
                  maxDataPoints: 43200
                  type: threshold
                  conditions:
                    - evaluator:
                        params: [900]
                        type: gt
                      operator:
                        type: and
                      reducer:
                        type: last
                      type: query
            noDataState: OK
            execErrState: Error
            annotations:
              summary: "Ariadne scheduler behind for {{ $labels.task }}"
            labels:
              severity: warning
      - orgId: 1
        name: postmark
        folder: Alerts
        interval: 1m
        rules:
          - uid: postmark-bounce
            title: "Postmark bounce rate high"
            condition: C
            for: "10m"
            data:
              - refId: A
                relativeTimeRange:
                  from: 300
                  to: 0
                datasourceUid: atlas-vm
                model:
                  intervalMs: 60000
                  maxDataPoints: 43200
                  expr: max(postmark_outbound_bounce_rate{window="1d"}) or on() vector(0)
                  legendFormat: bounce 1d
                  datasource:
                    type: prometheus
                    uid: atlas-vm
              - refId: B
                datasourceUid: __expr__
                model:
                  expression: A
                  intervalMs: 60000
                  maxDataPoints: 43200
                  reducer: last
                  type: reduce
              - refId: C
                datasourceUid: __expr__
                model:
                  expression: B
                  intervalMs: 60000
                  maxDataPoints: 43200
                  type: threshold
                  conditions:
                    - evaluator:
                        params: [5]
                        type: gt
                      operator:
                        type: and
                      reducer:
                        type: last
                      type: query
            noDataState: OK
            execErrState: Error
            annotations:
              summary: "Postmark 1d bounce rate >5%"
            labels:
              severity: warning
          - uid: postmark-api-down
            title: "Postmark exporter down"
            condition: C
            for: "20m"
            data:
              - refId: A
                relativeTimeRange:
                  from: 300
                  to: 0
                datasourceUid: atlas-vm
                model:
                  intervalMs: 60000
                  maxDataPoints: 43200
                  expr: avg_over_time(postmark_api_up[15m]) or on() vector(0)
                  legendFormat: api up
                  datasource:
                    type: prometheus
                    uid: atlas-vm
              - refId: B
                datasourceUid: __expr__
                model:
                  expression: A
                  intervalMs: 60000
                  maxDataPoints: 43200
                  reducer: last
                  type: reduce
              - refId: C
                datasourceUid: __expr__
                model:
                  expression: B
                  intervalMs: 60000
                  maxDataPoints: 43200
                  type: threshold
                  conditions:
                    - evaluator:
                        params: [1]
                        type: lt
                      operator:
                        type: and
                      reducer:
                        type: last
                      type: query
            noDataState: NoData
            execErrState: Error
            annotations:
              summary: "Postmark exporter reports sustained API outage"
            labels:
              severity: warning
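# Rollout sketch (assumptions: a provisioning sidecar such as the
# kiwigrid/k8s-sidecar bundled with the Grafana Helm chart watches ConfigMaps
# labeled grafana_alerting: "1", and Grafana's environment defines
# GRAFANA_ALERT_EMAILS for the ${...} interpolation above):
#   kubectl apply -f services/monitoring/grafana-alerting-config.yaml
#   kubectl -n monitoring get configmap grafana-alerting -o yaml
# File-based alerting provisioning is read at Grafana startup; recent
# versions also expose POST /api/admin/provisioning/alerting/reload to pick
# up changes without a restart. Verify under Alerting -> Alert rules.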