# services/monitoring/grafana-alerting-config.yaml
apiVersion: v1
kind: ConfigMap
metadata:
  name: grafana-alerting
  namespace: monitoring
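  # NOTE: the grafana_alerting label below is assumed to be matched by a Grafana
  # provisioning sidecar (e.g. the k8s-sidecar used by the Grafana Helm chart),
  # which copies this ConfigMap's keys into /etc/grafana/provisioning/alerting.
  # Adjust the label if your sidecar watches a different selector.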
  labels:
    grafana_alerting: "1"
data:
  alerting.yaml: |
    apiVersion: 1
    contactPoints:
      - orgId: 1
        name: email-admins
        receivers:
          - uid: email-admins
            type: email
            settings:
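              # ${GRAFANA_ALERT_EMAILS} is expanded by Grafana's provisioning-file
              # environment-variable interpolation at startup; it is expected to hold
              # a comma-separated list of recipient addresses (assumption: the
              # variable is injected into the Grafana container, e.g. from a Secret).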
              addresses: ${GRAFANA_ALERT_EMAILS}
              singleEmail: true
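    # Default notification policy: route every alert to email-admins, grouped by
    # alertname so repeated evaluations of the same rule collapse into a single
    # notification.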
    policies:
      - orgId: 1
        receiver: email-admins
        group_by:
          - alertname
  rules.yaml: |
    apiVersion: 1
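    # Every rule below follows the same three-step query pipeline used by
    # provisioned Grafana alert rules: A runs the PromQL query against the
    # atlas-vm datasource, B reduces the series to its last value, and C applies
    # the threshold that drives the alert condition.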
    groups:
      - orgId: 1
        name: atlas-disk
        folder: Alerts
        interval: 1m
        rules:
          - uid: disk-pressure-root
            title: "Node rootfs high (>80%)"
            condition: C
            data:
              - refId: A
                relativeTimeRange:
                  from: 600
                  to: 0
                datasourceUid: atlas-vm
                model:
                  intervalMs: 60000
                  maxDataPoints: 43200
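                  # Rootfs usage percentage per instance, joined onto node_uname_info
                  # via label_replace so the series is keyed by Kubernetes node name
                  # instead of the scrape instance address.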
                  expr: avg by (node) ((avg by (instance) ((1 - (node_filesystem_avail_bytes{mountpoint="/",fstype!~"tmpfs|overlay"} / node_filesystem_size_bytes{mountpoint="/",fstype!~"tmpfs|overlay"})) * 100)) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=""}, "node", "$1", "nodename", "(.*)"))
                  legendFormat: '{{node}}'
                  datasource:
                    type: prometheus
                    uid: atlas-vm
              - refId: B
                datasourceUid: __expr__
                model:
                  expression: A
                  intervalMs: 60000
                  maxDataPoints: 43200
                  reducer: last
                  type: reduce
              - refId: C
                datasourceUid: __expr__
                model:
                  expression: B
                  intervalMs: 60000
                  maxDataPoints: 43200
                  type: threshold
                  conditions:
                    - evaluator:
                        params: [80]
                        type: gt
                      operator:
                        type: and
                      reducer:
                        type: last
                      type: query
            noDataState: NoData
            execErrState: Error
            annotations:
              summary: "{{ $labels.node }} rootfs >80% for 10m"
            labels:
              severity: warning
          - uid: disk-growth-1h
            title: "Node rootfs growing fast (>1Gi in 1h)"
            condition: C
            data:
              - refId: A
                relativeTimeRange:
                  from: 3600
                  to: 0
                datasourceUid: atlas-vm
                model:
                  intervalMs: 60000
                  maxDataPoints: 43200
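                  # Growth of used rootfs bytes over the last hour, converted to GiB.
                  # The range applied to a full expression ((...)[1h]) relies on
                  # MetricsQL/VictoriaMetrics semantics; plain Prometheus would need
                  # subquery syntax such as [1h:] here.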
                  expr: increase((node_filesystem_size_bytes{mountpoint="/",fstype!~"tmpfs|overlay"} - node_filesystem_free_bytes{mountpoint="/",fstype!~"tmpfs|overlay"})[1h]) / 1024 / 1024 / 1024
                  legendFormat: '{{instance}}'
                  datasource:
                    type: prometheus
                    uid: atlas-vm
              - refId: B
                datasourceUid: __expr__
                model:
                  expression: A
                  intervalMs: 60000
                  maxDataPoints: 43200
                  reducer: last
                  type: reduce
              - refId: C
                datasourceUid: __expr__
                model:
                  expression: B
                  intervalMs: 60000
                  maxDataPoints: 43200
                  type: threshold
                  conditions:
                    - evaluator:
                        params: [1]
                        type: gt
                      operator:
                        type: and
                      reducer:
                        type: last
                      type: query
            noDataState: NoData
            execErrState: Error
            annotations:
              summary: "{{ $labels.instance }} rootfs grew >1Gi in the last hour"
            labels:
              severity: warning
      - orgId: 1
        name: maintenance
        folder: Alerts
        interval: 1m
        rules:
          - uid: maint-sweeper
            title: "Maintenance sweeper not ready"
            condition: C
            data:
              - refId: A
                relativeTimeRange:
                  from: 300
                  to: 0
                datasourceUid: atlas-vm
                model:
                  intervalMs: 60000
                  maxDataPoints: 43200
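                  # Ready/desired ratio for the node-image-sweeper DaemonSet
                  # (kube-state-metrics); anything below 1 means at least one node is
                  # missing a ready sweeper pod.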
                  expr: kube_daemonset_status_number_ready{namespace="maintenance",daemonset="node-image-sweeper"} / on(namespace,daemonset) kube_daemonset_status_desired_number_scheduled{namespace="maintenance",daemonset="node-image-sweeper"}
                  legendFormat: '{{daemonset}}'
                  datasource:
                    type: prometheus
                    uid: atlas-vm
              - refId: B
                datasourceUid: __expr__
                model:
                  expression: A
                  intervalMs: 60000
                  maxDataPoints: 43200
                  reducer: last
                  type: reduce
              - refId: C
                datasourceUid: __expr__
                model:
                  expression: B
                  intervalMs: 60000
                  maxDataPoints: 43200
                  type: threshold
                  conditions:
                    - evaluator:
                        params: [1]
                        type: lt
                      operator:
                        type: and
                      reducer:
                        type: last
                      type: query
            noDataState: NoData
            execErrState: Error
            annotations:
              summary: "node-image-sweeper not fully ready"
            labels:
              severity: warning
          - uid: maint-cron-stale
            title: "Maintenance CronJobs stale (>3h since success)"
            condition: C
            data:
              - refId: A
                relativeTimeRange:
                  from: 0
                  to: 0
                datasourceUid: atlas-vm
                model:
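                  # Seconds since the most recent successful run of either maintenance
                  # CronJob (kube-state-metrics); the threshold of 10800 s corresponds
                  # to the 3 h in the rule title.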
                  expr: time() - max by (cronjob) (kube_cronjob_status_last_successful_time{namespace="maintenance",cronjob=~"image-sweeper|grafana-smtp-sync"})
                  intervalMs: 60000
                  maxDataPoints: 43200
                  legendFormat: '{{cronjob}}'
                  datasource:
                    type: prometheus
                    uid: atlas-vm
              - refId: B
                datasourceUid: __expr__
                model:
                  expression: A
                  intervalMs: 60000
                  maxDataPoints: 43200
                  reducer: last
                  type: reduce
              - refId: C
                datasourceUid: __expr__
                model:
                  expression: B
                  intervalMs: 60000
                  maxDataPoints: 43200
                  type: threshold
                  conditions:
                    - evaluator:
                        params: [10800]
                        type: gt
                      operator:
                        type: and
                      reducer:
                        type: last
                      type: query
            noDataState: NoData
            execErrState: Error
            annotations:
              summary: "Maintenance cronjob stale >3h since last success"
            labels:
              severity: warning
      - orgId: 1
        name: postmark
        folder: Alerts
        interval: 1m
        rules:
          - uid: postmark-bounce
            title: "Postmark bounce rate high"
            condition: C
            data:
              - refId: A
                relativeTimeRange:
                  from: 300
                  to: 0
                datasourceUid: atlas-vm
                model:
                  intervalMs: 60000
                  maxDataPoints: 43200
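                  # 1-day outbound bounce rate as reported by the Postmark exporter;
                  # the gauge is treated as a percentage, so the threshold of 5 below
                  # means 5% (matching the rule's summary annotation).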
                  expr: POSTMARK_OUTBOUND_BOUNCE_RATE{window="1d"}
                  legendFormat: bounce 1d
                  datasource:
                    type: prometheus
                    uid: atlas-vm
              - refId: B
                datasourceUid: __expr__
                model:
                  expression: A
                  intervalMs: 60000
                  maxDataPoints: 43200
                  reducer: last
                  type: reduce
              - refId: C
                datasourceUid: __expr__
                model:
                  expression: B
                  intervalMs: 60000
                  maxDataPoints: 43200
                  type: threshold
                  conditions:
                    - evaluator:
                        params: [5]
                        type: gt
                      operator:
                        type: and
                      reducer:
                        type: last
                      type: query
            noDataState: NoData
            execErrState: Error
            annotations:
              summary: "Postmark 1d bounce rate >5%"
            labels:
              severity: warning
          - uid: postmark-api-down
            title: "Postmark exporter down"
            condition: C
            data:
              - refId: A
                relativeTimeRange:
                  from: 300
                  to: 0
                datasourceUid: atlas-vm
                model:
                  intervalMs: 60000
                  maxDataPoints: 43200
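                  # POSTMARK_API_UP is expected to be a 0/1 gauge from the Postmark
                  # exporter; a value below 1 means the exporter could not reach the
                  # Postmark API on its last poll.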
                  expr: POSTMARK_API_UP
                  legendFormat: api up
                  datasource:
                    type: prometheus
                    uid: atlas-vm
              - refId: B
                datasourceUid: __expr__
                model:
                  expression: A
                  intervalMs: 60000
                  maxDataPoints: 43200
                  reducer: last
                  type: reduce
              - refId: C
                datasourceUid: __expr__
                model:
                  expression: B
                  intervalMs: 60000
                  maxDataPoints: 43200
                  type: threshold
                  conditions:
                    - evaluator:
                        params: [1]
                        type: lt
                      operator:
                        type: and
                      reducer:
                        type: last
                      type: query
            noDataState: NoData
            execErrState: Error
            annotations:
              summary: "Postmark exporter reports API down"
            labels:
              severity: critical