diff --git a/services/monitoring/grafana-alerting-config.yaml b/services/monitoring/grafana-alerting-config.yaml
index 33ac7396..6748a54d 100644
--- a/services/monitoring/grafana-alerting-config.yaml
+++ b/services/monitoring/grafana-alerting-config.yaml
@@ -22,7 +22,24 @@ data:
       - orgId: 1
         receiver: email-admins
         group_by:
+          - grafana_folder
           - alertname
+        group_wait: 1m
+        group_interval: 30m
+        repeat_interval: 12h
+        routes:
+          - receiver: email-admins
+            object_matchers:
+              - [severity, "=", "critical"]
+            group_wait: 30s
+            group_interval: 5m
+            repeat_interval: 2h
+          - receiver: email-admins
+            object_matchers:
+              - [severity, "=", "warning"]
+            group_wait: 5m
+            group_interval: 2h
+            repeat_interval: 24h
   rules.yaml: |
     apiVersion: 1
     groups:
@@ -145,7 +162,7 @@ data:
                 model:
                   intervalMs: 60000
                   maxDataPoints: 43200
-                  expr: avg_over_time((1 - avg by (instance) (rate(node_cpu_seconds_total{mode=\"idle\"}[5m]))) * 100)[10m:1m] * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\")
+                  expr: avg_over_time((1 - avg by (instance) (rate(node_cpu_seconds_total{mode="idle"}[5m]))) * 100)[10m:1m] * on(instance) group_left(node) label_replace(node_uname_info{nodename!=""}, "node", "$1", "nodename", "(.*)")
                   legendFormat: '{{instance}}'
                   datasource:
                     type: prometheus
@@ -286,8 +303,8 @@ data:
               summary: "node-image-sweeper not fully ready"
             labels:
               severity: warning
-          - uid: maint-cron-stale
-            title: "Maintenance CronJobs stale (>3h since success)"
+          - uid: maint-ariadne-image-sweeper-stale
+            title: "Ariadne image sweeper stale (schedule >8d)"
             condition: C
             for: "5m"
             data:
@@ -297,10 +314,10 @@ data:
                   to: 0
                 datasourceUid: atlas-vm
                 model:
-                  expr: time() - max by (cronjob) (kube_cronjob_status_last_successful_time{namespace="maintenance",cronjob="image-sweeper"}) and on(cronjob) (kube_cronjob_spec_suspend{namespace="maintenance",cronjob="image-sweeper"} == 0)
+                  expr: time() - ariadne_schedule_last_success_timestamp_seconds{task="schedule.image_sweeper"}
                   intervalMs: 60000
                   maxDataPoints: 43200
-                  legendFormat: '{{cronjob}}'
+                  legendFormat: '{{task}}'
                   datasource:
                     type: prometheus
                     uid: atlas-vm
@@ -321,17 +338,166 @@ data:
                   type: threshold
                   conditions:
                     - evaluator:
-                        params: [10800]
+                        params: [691200]
                         type: gt
                       operator:
                         type: and
                       reducer:
                         type: last
                       type: query
-            noDataState: NoData
+            noDataState: OK
             execErrState: Error
             annotations:
-              summary: "Maintenance cronjob stale >3h since last success"
+              summary: "Ariadne image sweeper stale >8d since last success"
+            labels:
+              severity: warning
+          - uid: maint-cron-stale
+            title: "Maintenance CronJobs stale (legacy disabled)"
+            condition: C
+            for: "5m"
+            data:
+              - refId: A
+                relativeTimeRange:
+                  from: 300
+                  to: 0
+                datasourceUid: atlas-vm
+                model:
+                  expr: vector(0)
+                  intervalMs: 60000
+                  maxDataPoints: 43200
+                  legendFormat: legacy
+                  datasource:
+                    type: prometheus
+                    uid: atlas-vm
+              - refId: B
+                datasourceUid: __expr__
+                model:
+                  expression: A
+                  intervalMs: 60000
+                  maxDataPoints: 43200
+                  reducer: last
+                  type: reduce
+              - refId: C
+                datasourceUid: __expr__
+                model:
+                  expression: B
+                  intervalMs: 60000
+                  maxDataPoints: 43200
+                  type: threshold
+                  conditions:
+                    - evaluator:
+                        params: [1]
+                        type: gt
+                      operator:
+                        type: and
+                      reducer:
+                        type: last
+                      type: query
+            noDataState: OK
+            execErrState: OK
+            annotations:
+              summary: "Legacy cronjob alert disabled"
+            labels:
+              severity: info
+      - orgId: 1
+        name: ariadne
+        folder: Alerts
+        interval: 1m
+        rules:
+          - uid: ariadne-schedule-error
+            title: "Ariadne schedule task failed"
+            condition: C
+            for: "10m"
+            data:
+              - refId: A
+                relativeTimeRange:
+                  from: 300
+                  to: 0
+                datasourceUid: atlas-vm
+                model:
+                  expr: max by (task) (ariadne_schedule_last_status{task=~"schedule\\..+"})
+                  intervalMs: 60000
+                  maxDataPoints: 43200
+                  legendFormat: '{{task}}'
+                  datasource:
+                    type: prometheus
+                    uid: atlas-vm
+              - refId: B
+                datasourceUid: __expr__
+                model:
+                  expression: A
+                  intervalMs: 60000
+                  maxDataPoints: 43200
+                  reducer: last
+                  type: reduce
+              - refId: C
+                datasourceUid: __expr__
+                model:
+                  expression: B
+                  intervalMs: 60000
+                  maxDataPoints: 43200
+                  type: threshold
+                  conditions:
+                    - evaluator:
+                        params: [1]
+                        type: lt
+                      operator:
+                        type: and
+                      reducer:
+                        type: last
+                      type: query
+            noDataState: OK
+            execErrState: Error
+            annotations:
+              summary: "Ariadne schedule failed ({{ $labels.task }})"
+            labels:
+              severity: warning
+          - uid: ariadne-scheduler-stalled
+            title: "Ariadne scheduler behind (>15m)"
+            condition: C
+            for: "10m"
+            data:
+              - refId: A
+                relativeTimeRange:
+                  from: 300
+                  to: 0
+                datasourceUid: atlas-vm
+                model:
+                  expr: time() - ariadne_schedule_next_run_timestamp_seconds{task=~"schedule\\..+"}
+                  intervalMs: 60000
+                  maxDataPoints: 43200
+                  legendFormat: '{{task}}'
+                  datasource:
+                    type: prometheus
+                    uid: atlas-vm
+              - refId: B
+                datasourceUid: __expr__
+                model:
+                  expression: A
+                  intervalMs: 60000
+                  maxDataPoints: 43200
+                  reducer: last
+                  type: reduce
+              - refId: C
+                datasourceUid: __expr__
+                model:
+                  expression: B
+                  intervalMs: 60000
+                  maxDataPoints: 43200
+                  type: threshold
+                  conditions:
+                    - evaluator:
+                        params: [900]
+                        type: gt
+                      operator:
+                        type: and
+                      reducer:
+                        type: last
+                      type: query
+            noDataState: OK
+            execErrState: Error
+            annotations:
+              summary: "Ariadne scheduler behind for {{ $labels.task }}"
             labels:
               severity: warning
       - orgId: 1
@@ -352,7 +518,7 @@ data:
                 model:
                   intervalMs: 60000
                   maxDataPoints: 43200
-                  expr: POSTMARK_OUTBOUND_BOUNCE_RATE{window="1d"}
+                  expr: max(postmark_outbound_bounce_rate{window="1d"}) or on() vector(0)
                   legendFormat: bounce 1d
                   datasource:
                     type: prometheus
@@ -381,7 +547,7 @@ data:
                       reducer:
                         type: last
                       type: query
-            noDataState: NoData
+            noDataState: OK
             execErrState: Error
             annotations:
               summary: "Postmark 1d bounce rate >5%"
@@ -400,7 +566,7 @@ data:
                 model:
                   intervalMs: 60000
                   maxDataPoints: 43200
-                  expr: POSTMARK_API_UP
+                  expr: max(postmark_api_up) or on() vector(0)
                   legendFormat: api up
                   datasource:
                     type: prometheus
@@ -429,7 +595,7 @@ data:
                       reducer:
                         type: last
                       type: query
-            noDataState: NoData
+            noDataState: OK
             execErrState: Error
             annotations:
               summary: "Postmark exporter reports API down"
diff --git a/services/monitoring/helmrelease.yaml b/services/monitoring/helmrelease.yaml
index 66517389..55655405 100644
--- a/services/monitoring/helmrelease.yaml
+++ b/services/monitoring/helmrelease.yaml
@@ -286,7 +286,7 @@ spec:
     podAnnotations:
       vault.hashicorp.com/agent-inject: "true"
       vault.hashicorp.com/role: "monitoring"
-      monitoring.bstein.dev/restart-rev: "1"
+      monitoring.bstein.dev/restart-rev: "5"
       vault.hashicorp.com/agent-inject-secret-grafana-env.sh: "kv/data/atlas/monitoring/grafana-admin"
       vault.hashicorp.com/agent-inject-template-grafana-env.sh: |
         {{ with secret "kv/data/atlas/monitoring/grafana-admin" }}