titan-iac/services/monitoring/grafana-alerting-config.yaml

# services/monitoring/grafana-alerting-config.yaml
apiVersion: v1
kind: ConfigMap
metadata:
  name: grafana-alerting
  namespace: monitoring
  labels:
    grafana_alerting: "1"
data:
  alerting.yaml: |
    apiVersion: 1
    contactPoints:
      - orgId: 1
        name: email-admins
        receivers:
          - uid: email-admins
            type: email
            settings:
              addresses: ${GRAFANA_ALERT_EMAILS}
              singleEmail: true
    policies:
      - orgId: 1
        receiver: email-admins
        group_by:
          - grafana_folder
          - alertname
        group_wait: 1m
        group_interval: 30m
        repeat_interval: 12h
        routes:
          - receiver: email-admins
            object_matchers:
              - [severity, "=", "critical"]
            group_wait: 30s
            group_interval: 15m
            repeat_interval: 4h
          - receiver: email-admins
            object_matchers:
              - [severity, "=", "warning"]
            group_wait: 10m
            group_interval: 4h
            repeat_interval: 48h
  rules.yaml: |
    apiVersion: 1
    groups:
      - orgId: 1
        name: atlas-disk
        folder: Alerts
        interval: 1m
        rules:
          - uid: disk-pressure-root
            title: "Node rootfs high (>85%)"
            condition: C
            for: "10m"
            data:
              - refId: A
                relativeTimeRange:
                  from: 600
                  to: 0
                datasourceUid: atlas-vm
                model:
                  intervalMs: 60000
                  maxDataPoints: 43200
                  expr: avg by (node) ((avg by (instance) ((1 - (node_filesystem_avail_bytes{mountpoint="/",fstype!~"tmpfs|overlay"} / node_filesystem_size_bytes{mountpoint="/",fstype!~"tmpfs|overlay"})) * 100)) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=""}, "node", "$1", "nodename", "(.*)"))
                  legendFormat: '{{node}}'
                  datasource:
                    type: prometheus
                    uid: atlas-vm
              - refId: B
                datasourceUid: __expr__
                model:
                  expression: A
                  intervalMs: 60000
                  maxDataPoints: 43200
                  reducer: last
                  type: reduce
              - refId: C
                datasourceUid: __expr__
                model:
                  expression: B
                  intervalMs: 60000
                  maxDataPoints: 43200
                  type: threshold
                  conditions:
                    - evaluator:
                        params: [85]
                        type: gt
                      operator:
                        type: and
                      reducer:
                        type: last
                      type: query
            noDataState: NoData
            execErrState: Error
            annotations:
              summary: "{{ $labels.node }} rootfs >85% for 10m"
            labels:
              severity: warning
          - uid: disk-growth-1h
            title: "Node rootfs growing fast (>3Gi in 1h)"
            condition: C
            for: "30m"
            data:
              - refId: A
                relativeTimeRange:
                  from: 3600
                  to: 0
                datasourceUid: atlas-vm
                model:
                  intervalMs: 60000
                  maxDataPoints: 43200
                  expr: max by (instance, node) ((increase((node_filesystem_size_bytes{mountpoint="/",fstype!~"tmpfs|overlay"} - node_filesystem_free_bytes{mountpoint="/",fstype!~"tmpfs|overlay"})[1h]) / 1024 / 1024 / 1024) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=""}, "node", "$1", "nodename", "(.*)"))
                  legendFormat: '{{node}}'
                  datasource:
                    type: prometheus
                    uid: atlas-vm
              - refId: B
                datasourceUid: __expr__
                model:
                  expression: A
                  intervalMs: 60000
                  maxDataPoints: 43200
                  reducer: last
                  type: reduce
              - refId: C
                datasourceUid: __expr__
                model:
                  expression: B
                  intervalMs: 60000
                  maxDataPoints: 43200
                  type: threshold
                  conditions:
                    - evaluator:
                        params: [3]
                        type: gt
                      operator:
                        type: and
                      reducer:
                        type: last
                      type: query
            noDataState: NoData
            execErrState: Error
            annotations:
              summary: "{{ $labels.node }} rootfs grew >3Gi in the last hour"
            labels:
              severity: warning
      - orgId: 1
        name: atlas-cpu
        folder: Alerts
        interval: 1m
        rules:
          - uid: cpu-high-10m
            title: "Node CPU high (>95% for 20m)"
            condition: C
            for: 20m
            data:
              - refId: A
                relativeTimeRange:
                  from: 600
                  to: 0
                datasourceUid: atlas-vm
                model:
                  intervalMs: 60000
                  maxDataPoints: 43200
                  expr: ((1 - avg by (instance) (rate(node_cpu_seconds_total{mode="idle"}[5m]))) * 100) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=""}, "node", "$1", "nodename", "(.*)")
                  legendFormat: '{{node}}'
                  datasource:
                    type: prometheus
                    uid: atlas-vm
              - refId: B
                datasourceUid: __expr__
                model:
                  expression: A
                  intervalMs: 60000
                  maxDataPoints: 43200
                  reducer: last
                  type: reduce
              - refId: C
                datasourceUid: __expr__
                model:
                  expression: B
                  intervalMs: 60000
                  maxDataPoints: 43200
                  type: threshold
                  conditions:
                    - evaluator:
                        params: [95]
                        type: gt
                      operator:
                        type: and
                      reducer:
                        type: last
                      type: query
            noDataState: NoData
            execErrState: OK
            annotations:
              summary: "{{ $labels.node }} CPU >95% for 20m"
            labels:
              severity: warning
      - orgId: 1
        name: atlas-metrics
        folder: Alerts
        interval: 1m
        rules:
          - uid: victoria-metrics-down
            title: "VictoriaMetrics unavailable (>30m)"
            condition: C
            for: "30m"
            data:
              - refId: A
                relativeTimeRange:
                  from: 600
                  to: 0
                datasourceUid: atlas-vm
                model:
                  intervalMs: 60000
                  maxDataPoints: 43200
                  expr: sum(up{job="victoriametrics"})
                  legendFormat: victoriametrics
                  datasource:
                    type: prometheus
                    uid: atlas-vm
              - refId: B
                datasourceUid: __expr__
                model:
                  expression: A
                  intervalMs: 60000
                  maxDataPoints: 43200
                  reducer: last
                  type: reduce
              - refId: C
                datasourceUid: __expr__
                model:
                  expression: B
                  intervalMs: 60000
                  maxDataPoints: 43200
                  type: threshold
                  conditions:
                    - evaluator:
                        params: [1]
                        type: lt
                      operator:
                        type: and
                      reducer:
                        type: last
                      type: query
            noDataState: Alerting
            execErrState: Alerting
            annotations:
              summary: "VictoriaMetrics is unavailable for >30m"
            labels:
              severity: critical
      - orgId: 1
        name: maintenance
        folder: Alerts
        interval: 1m
        rules:
          - uid: maint-sweeper
            title: "Maintenance sweeper not ready"
            condition: C
            for: "5m"
            data:
              - refId: A
                relativeTimeRange:
                  from: 300
                  to: 0
                datasourceUid: atlas-vm
                model:
                  intervalMs: 60000
                  maxDataPoints: 43200
                  expr: kube_daemonset_status_number_ready{namespace="maintenance",daemonset="node-image-sweeper"} / on(namespace,daemonset) kube_daemonset_status_desired_number_scheduled{namespace="maintenance",daemonset="node-image-sweeper"}
                  legendFormat: '{{daemonset}}'
                  datasource:
                    type: prometheus
                    uid: atlas-vm
              - refId: B
                datasourceUid: __expr__
                model:
                  expression: A
                  intervalMs: 60000
                  maxDataPoints: 43200
                  reducer: last
                  type: reduce
              - refId: C
                datasourceUid: __expr__
                model:
                  expression: B
                  intervalMs: 60000
                  maxDataPoints: 43200
                  type: threshold
                  conditions:
                    - evaluator:
                        params: [1]
                        type: lt
                      operator:
                        type: and
                      reducer:
                        type: last
                      type: query
            noDataState: NoData
            execErrState: Error
            annotations:
              summary: "node-image-sweeper not fully ready"
            labels:
              severity: warning
          - uid: logging-node-log-rotation-not-ready
            title: "Node log rotation guardrails not ready"
            condition: C
            for: "10m"
            data:
              - refId: A
                relativeTimeRange:
                  from: 600
                  to: 0
                datasourceUid: atlas-vm
                model:
                  intervalMs: 60000
                  maxDataPoints: 43200
                  expr: kube_daemonset_status_number_ready{namespace="logging",daemonset="node-log-rotation"} / on(namespace,daemonset) kube_daemonset_status_desired_number_scheduled{namespace="logging",daemonset="node-log-rotation"}
                  legendFormat: '{{daemonset}}'
                  datasource:
                    type: prometheus
                    uid: atlas-vm
              - refId: B
                datasourceUid: __expr__
                model:
                  expression: A
                  intervalMs: 60000
                  maxDataPoints: 43200
                  reducer: last
                  type: reduce
              - refId: C
                datasourceUid: __expr__
                model:
                  expression: B
                  intervalMs: 60000
                  maxDataPoints: 43200
                  type: threshold
                  conditions:
                    - evaluator:
                        params: [1]
                        type: lt
                      operator:
                        type: and
                      reducer:
                        type: last
                      type: query
            noDataState: NoData
            execErrState: Error
            annotations:
              summary: "node-log-rotation is not fully ready"
            labels:
              severity: warning
          - uid: maint-ariadne-image-sweeper-stale
            title: "Ariadne image sweeper stale (schedule >24h)"
            condition: C
            for: "5m"
            data:
              - refId: A
                relativeTimeRange:
                  from: 300
                  to: 0
                datasourceUid: atlas-vm
                model:
                  expr: time() - ariadne_schedule_last_success_timestamp_seconds{task="schedule.image_sweeper"}
                  intervalMs: 60000
                  maxDataPoints: 43200
                  legendFormat: '{{task}}'
                  datasource:
                    type: prometheus
                    uid: atlas-vm
              - refId: B
                datasourceUid: __expr__
                model:
                  expression: A
                  intervalMs: 60000
                  maxDataPoints: 43200
                  reducer: last
                  type: reduce
              - refId: C
                datasourceUid: __expr__
                model:
                  expression: B
                  intervalMs: 60000
                  maxDataPoints: 43200
                  type: threshold
                  conditions:
                    - evaluator:
                        params: [86400]
                        type: gt
                      operator:
                        type: and
                      reducer:
                        type: last
                      type: query
            noDataState: OK
            execErrState: Error
            annotations:
              summary: "Ariadne image sweeper stale >24h since last success"
            labels:
              severity: warning
          - uid: maint-cron-stale
            title: "Maintenance CronJobs stale (legacy disabled)"
            condition: C
            for: "5m"
            data:
              - refId: A
                relativeTimeRange:
                  from: 300
                  to: 0
                datasourceUid: atlas-vm
                model:
                  expr: vector(0)
                  intervalMs: 60000
                  maxDataPoints: 43200
                  legendFormat: legacy
                  datasource:
                    type: prometheus
                    uid: atlas-vm
              - refId: B
                datasourceUid: __expr__
                model:
                  expression: A
                  intervalMs: 60000
                  maxDataPoints: 43200
                  reducer: last
                  type: reduce
              - refId: C
                datasourceUid: __expr__
                model:
                  expression: B
                  intervalMs: 60000
                  maxDataPoints: 43200
                  type: threshold
                  conditions:
                    - evaluator:
                        params: [1]
                        type: gt
                      operator:
                        type: and
                      reducer:
                        type: last
                      type: query
            noDataState: OK
            execErrState: OK
            annotations:
              summary: "Legacy cronjob alert disabled"
            labels:
              severity: info
          - uid: maint-soteria-refresh-stale
            title: "Soteria inventory refresh stale (>15m)"
            condition: C
            for: "15m"
            data:
              - refId: A
                relativeTimeRange:
                  from: 900
                  to: 0
                datasourceUid: atlas-vm
                model:
                  expr: time() - soteria_inventory_refresh_timestamp_seconds
                  intervalMs: 60000
                  maxDataPoints: 43200
                  legendFormat: soteria-refresh-age-seconds
                  datasource:
                    type: prometheus
                    uid: atlas-vm
              - refId: B
                datasourceUid: __expr__
                model:
                  expression: A
                  intervalMs: 60000
                  maxDataPoints: 43200
                  reducer: last
                  type: reduce
              - refId: C
                datasourceUid: __expr__
                model:
                  expression: B
                  intervalMs: 60000
                  maxDataPoints: 43200
                  type: threshold
                  conditions:
                    - evaluator:
                        params: [900]
                        type: gt
                      operator:
                        type: and
                      reducer:
                        type: last
                      type: query
            noDataState: Alerting
            execErrState: Alerting
            annotations:
              summary: "Soteria inventory telemetry has not refreshed in >15m"
            labels:
              severity: warning
          - uid: maint-soteria-backup-unhealthy
            title: "Soteria reports unhealthy PVC backups"
            condition: C
            for: "10m"
            data:
              - refId: A
                relativeTimeRange:
                  from: 600
                  to: 0
                datasourceUid: atlas-vm
                model:
                  expr: sum((1 - pvc_backup_health) > bool 0) or on() vector(0)
                  intervalMs: 60000
                  maxDataPoints: 43200
                  legendFormat: unhealthy-pvcs
                  datasource:
                    type: prometheus
                    uid: atlas-vm
              - refId: B
                datasourceUid: __expr__
                model:
                  expression: A
                  intervalMs: 60000
                  maxDataPoints: 43200
                  reducer: last
                  type: reduce
              - refId: C
                datasourceUid: __expr__
                model:
                  expression: B
                  intervalMs: 60000
                  maxDataPoints: 43200
                  type: threshold
                  conditions:
                    - evaluator:
                        params: [0]
                        type: gt
                      operator:
                        type: and
                      reducer:
                        type: last
                      type: query
            noDataState: OK
            execErrState: Alerting
            annotations:
              summary: "One or more PVCs are stale, missing, or failed per Soteria backup health"
            labels:
              severity: warning
          - uid: maint-soteria-b2-scan-unhealthy
            title: "Soteria B2 usage scan failing or stale"
            condition: C
            for: "15m"
            data:
              - refId: A
                relativeTimeRange:
                  from: 1800
                  to: 0
                datasourceUid: atlas-vm
                model:
                  expr: sum((((soteria_b2_scan_success < bool 1) and (time() - soteria_b2_scan_timestamp_seconds > 600)) or (time() - soteria_b2_scan_timestamp_seconds > 1800))) or on() vector(0)
                  intervalMs: 60000
                  maxDataPoints: 43200
                  legendFormat: soteria-b2-scan-unhealthy
                  datasource:
                    type: prometheus
                    uid: atlas-vm
              - refId: B
                datasourceUid: __expr__
                model:
                  expression: A
                  intervalMs: 60000
                  maxDataPoints: 43200
                  reducer: last
                  type: reduce
              - refId: C
                datasourceUid: __expr__
                model:
                  expression: B
                  intervalMs: 60000
                  maxDataPoints: 43200
                  type: threshold
                  conditions:
                    - evaluator:
                        params: [0]
                        type: gt
                      operator:
                        type: and
                      reducer:
                        type: last
                      type: query
            noDataState: OK
            execErrState: Alerting
            annotations:
              summary: "Soteria B2 consumption scan is failing or stale for >15m"
            labels:
              severity: warning
          - uid: maint-soteria-authz-denials
            title: "Soteria authorization denials elevated"
            condition: C
            for: "10m"
            data:
              - refId: A
                relativeTimeRange:
                  from: 900
                  to: 0
                datasourceUid: atlas-vm
                model:
                  expr: sum(increase(soteria_authz_denials_total[15m])) or on() vector(0)
                  intervalMs: 60000
                  maxDataPoints: 43200
                  legendFormat: soteria-authz-denials-15m
                  datasource:
                    type: prometheus
                    uid: atlas-vm
              - refId: B
                datasourceUid: __expr__
                model:
                  expression: A
                  intervalMs: 60000
                  maxDataPoints: 43200
                  reducer: last
                  type: reduce
              - refId: C
                datasourceUid: __expr__
                model:
                  expression: B
                  intervalMs: 60000
                  maxDataPoints: 43200
                  type: threshold
                  conditions:
                    - evaluator:
                        params: [10]
                        type: gt
                      operator:
                        type: and
                      reducer:
                        type: last
                      type: query
            noDataState: OK
            execErrState: Alerting
            annotations:
              summary: "Soteria saw >10 authorization denials in 15m"
            labels:
              severity: warning
          - uid: maint-soteria-backup-job-storm
            title: "Soteria backup job creation spike"
            condition: C
            for: "5m"
            data:
              - refId: A
                relativeTimeRange:
                  from: 600
                  to: 0
                datasourceUid: atlas-vm
                model:
                  expr: sum(increase(kube_job_created{namespace="maintenance",job_name=~"soteria-backup-.*"}[10m])) or on() vector(0)
                  intervalMs: 60000
                  maxDataPoints: 43200
                  legendFormat: soteria-backup-jobs-created-10m
                  datasource:
                    type: prometheus
                    uid: atlas-vm
              - refId: B
                datasourceUid: __expr__
                model:
                  expression: A
                  intervalMs: 60000
                  maxDataPoints: 43200
                  reducer: last
                  type: reduce
              - refId: C
                datasourceUid: __expr__
                model:
                  expression: B
                  intervalMs: 60000
                  maxDataPoints: 43200
                  type: threshold
                  conditions:
                    - evaluator:
                        params: [8]
                        type: gt
                      operator:
                        type: and
                      reducer:
                        type: last
                      type: query
            noDataState: OK
            execErrState: Alerting
            annotations:
              summary: "Soteria created >8 backup jobs in 10m (possible scheduler storm)"
            labels:
              severity: warning
      - orgId: 1
        name: ariadne
        folder: Alerts
        interval: 1m
        rules:
          - uid: ariadne-schedule-error
            title: "Ariadne schedule task failed"
            condition: C
            for: "15m"
            data:
              - refId: A
                relativeTimeRange:
                  from: 300
                  to: 0
                datasourceUid: atlas-vm
                model:
                  expr: max by (task) (((time() - ariadne_schedule_last_success_timestamp_seconds{task=~"schedule\\..+"}) * on(task) group_left() (1 - ariadne_schedule_last_status{task=~"schedule\\..+"})))
                  intervalMs: 60000
                  maxDataPoints: 43200
                  legendFormat: '{{task}}'
                  datasource:
                    type: prometheus
                    uid: atlas-vm
              - refId: B
                datasourceUid: __expr__
                model:
                  expression: A
                  intervalMs: 60000
                  maxDataPoints: 43200
                  reducer: last
                  type: reduce
              - refId: C
                datasourceUid: __expr__
                model:
                  expression: B
                  intervalMs: 60000
                  maxDataPoints: 43200
                  type: threshold
                  conditions:
                    - evaluator:
                        params: [3600]
                        type: gt
                      operator:
                        type: and
                      reducer:
                        type: last
                      type: query
            noDataState: OK
            execErrState: Error
            annotations:
              summary: "Ariadne schedule has failed for >1h ({{ $labels.task }})"
            labels:
              severity: warning
          - uid: ariadne-scheduler-stalled
            title: "Ariadne scheduler behind (>15m)"
            condition: C
            for: "10m"
            data:
              - refId: A
                relativeTimeRange:
                  from: 300
                  to: 0
                datasourceUid: atlas-vm
                model:
                  expr: time() - ariadne_schedule_next_run_timestamp_seconds{task=~"schedule\\..+"}
                  intervalMs: 60000
                  maxDataPoints: 43200
                  legendFormat: '{{task}}'
                  datasource:
                    type: prometheus
                    uid: atlas-vm
              - refId: B
                datasourceUid: __expr__
                model:
                  expression: A
                  intervalMs: 60000
                  maxDataPoints: 43200
                  reducer: last
                  type: reduce
              - refId: C
                datasourceUid: __expr__
                model:
                  expression: B
                  intervalMs: 60000
                  maxDataPoints: 43200
                  type: threshold
                  conditions:
                    - evaluator:
                        params: [900]
                        type: gt
                      operator:
                        type: and
                      reducer:
                        type: last
                      type: query
            noDataState: OK
            execErrState: Error
            annotations:
              summary: "Ariadne scheduler behind for {{ $labels.task }}"
            labels:
              severity: warning
      - orgId: 1
        name: postmark
        folder: Alerts
        interval: 1m
        rules:
          - uid: postmark-bounce
            title: "Postmark bounce rate high"
            condition: C
            for: "10m"
            data:
              - refId: A
                relativeTimeRange:
                  from: 300
                  to: 0
                datasourceUid: atlas-vm
                model:
                  intervalMs: 60000
                  maxDataPoints: 43200
                  expr: max(postmark_outbound_bounce_rate{window="1d"}) or on() vector(0)
                  legendFormat: bounce 1d
                  datasource:
                    type: prometheus
                    uid: atlas-vm
              - refId: B
                datasourceUid: __expr__
                model:
                  expression: A
                  intervalMs: 60000
                  maxDataPoints: 43200
                  reducer: last
                  type: reduce
              - refId: C
                datasourceUid: __expr__
                model:
                  expression: B
                  intervalMs: 60000
                  maxDataPoints: 43200
                  type: threshold
                  conditions:
                    - evaluator:
                        params: [5]
                        type: gt
                      operator:
                        type: and
                      reducer:
                        type: last
                      type: query
            noDataState: OK
            execErrState: Error
            annotations:
              summary: "Postmark 1d bounce rate >5%"
            labels:
              severity: warning
          - uid: postmark-api-down
            title: "Postmark exporter down"
            condition: C
            for: "20m"
            data:
              - refId: A
                relativeTimeRange:
                  from: 300
                  to: 0
                datasourceUid: atlas-vm
                model:
                  intervalMs: 60000
                  maxDataPoints: 43200
                  expr: avg_over_time(postmark_api_up[15m]) or on() vector(0)
                  legendFormat: api up
                  datasource:
                    type: prometheus
                    uid: atlas-vm
              - refId: B
                datasourceUid: __expr__
                model:
                  expression: A
                  intervalMs: 60000
                  maxDataPoints: 43200
                  reducer: last
                  type: reduce
              - refId: C
                datasourceUid: __expr__
                model:
                  expression: B
                  intervalMs: 60000
                  maxDataPoints: 43200
                  type: threshold
                  conditions:
                    - evaluator:
                        params: [1]
                        type: lt
                      operator:
                        type: and
                      reducer:
                        type: last
                      type: query
            noDataState: NoData
            execErrState: Error
            annotations:
              summary: "Postmark exporter reports sustained API outage"
            labels:
              severity: warning