From 44788b313287c3307d226cf31e419693969ce666 Mon Sep 17 00:00:00 2001 From: Brad Stein Date: Wed, 28 Jan 2026 13:47:37 -0300 Subject: [PATCH] monitoring: alert on ariadne schedules --- .../monitoring/grafana-alerting-config.yaml | 115 ++++++++++++++++-- 1 file changed, 108 insertions(+), 7 deletions(-) diff --git a/services/monitoring/grafana-alerting-config.yaml b/services/monitoring/grafana-alerting-config.yaml index d8402fa..37b66f4 100644 --- a/services/monitoring/grafana-alerting-config.yaml +++ b/services/monitoring/grafana-alerting-config.yaml @@ -286,8 +286,8 @@ data: summary: "node-image-sweeper not fully ready" labels: severity: warning - - uid: maint-cron-stale - title: "Maintenance CronJobs stale (>3h since success)" + - uid: maint-ariadne-image-sweeper-stale + title: "Ariadne image sweeper stale (>8d since success)" condition: C for: "5m" data: @@ -297,10 +297,10 @@ data: to: 0 datasourceUid: atlas-vm model: - expr: time() - max by (cronjob) (kube_cronjob_status_last_successful_time{namespace="maintenance",cronjob="image-sweeper"}) and on(cronjob) (kube_cronjob_spec_suspend{namespace="maintenance",cronjob="image-sweeper"} == 0) + expr: time() - ariadne_schedule_last_success_timestamp_seconds{task="schedule.image_sweeper"} intervalMs: 60000 maxDataPoints: 43200 - legendFormat: '{{cronjob}}' + legendFormat: '{{task}}' datasource: type: prometheus uid: atlas-vm @@ -321,17 +321,118 @@ data: type: threshold conditions: - evaluator: - params: [10800] + params: [691200] type: gt operator: type: and reducer: type: last type: query - noDataState: NoData + noDataState: OK execErrState: Error annotations: - summary: "Maintenance cronjob stale >3h since last success" + summary: "Ariadne image sweeper stale >8d since last success" + labels: + severity: warning + - orgId: 1 + name: ariadne + folder: Alerts + interval: 1m + rules: + - uid: ariadne-schedule-error + title: "Ariadne schedule task failed" + condition: C + for: "10m" + data: + - refId: A + relativeTimeRange: + from: 300 + to: 0 + datasourceUid: atlas-vm + model: + expr: ariadne_schedule_last_status{task=~"schedule\\..+"} + intervalMs: 60000 + maxDataPoints: 43200 + legendFormat: '{{task}}' + datasource: + type: prometheus + uid: atlas-vm + - refId: B + datasourceUid: __expr__ + model: + expression: A + intervalMs: 60000 + maxDataPoints: 43200 + reducer: last + type: reduce + - refId: C + datasourceUid: __expr__ + model: + expression: B + intervalMs: 60000 + maxDataPoints: 43200 + type: threshold + conditions: + - evaluator: + params: [1] + type: lt + operator: + type: and + reducer: + type: last + type: query + noDataState: OK + execErrState: Error + annotations: + summary: "Ariadne schedule failed ({{ $labels.task }})" + labels: + severity: warning + - uid: ariadne-scheduler-stalled + title: "Ariadne scheduler behind (>15m)" + condition: C + for: "10m" + data: + - refId: A + relativeTimeRange: + from: 300 + to: 0 + datasourceUid: atlas-vm + model: + expr: time() - ariadne_schedule_next_run_timestamp_seconds{task=~"schedule\\..+"} + intervalMs: 60000 + maxDataPoints: 43200 + legendFormat: '{{task}}' + datasource: + type: prometheus + uid: atlas-vm + - refId: B + datasourceUid: __expr__ + model: + expression: A + intervalMs: 60000 + maxDataPoints: 43200 + reducer: last + type: reduce + - refId: C + datasourceUid: __expr__ + model: + expression: B + intervalMs: 60000 + maxDataPoints: 43200 + type: threshold + conditions: + - evaluator: + params: [900] + type: gt + operator: + type: and + reducer: + type: last + type: query + noDataState: OK + execErrState: Error + annotations: + summary: "Ariadne scheduler behind for {{ $labels.task }}" labels: severity: warning - orgId: 1