monitoring: alert on ariadne schedules

This commit is contained in:
Brad Stein 2026-01-28 13:47:37 -03:00
parent f913956d08
commit 44788b3132

View File

@@ -286,8 +286,8 @@ data:
summary: "node-image-sweeper not fully ready"
labels:
severity: warning
- uid: maint-cron-stale
title: "Maintenance CronJobs stale (>3h since success)"
- uid: maint-ariadne-image-sweeper-stale
title: "Ariadne image sweeper stale (>8d since success)"
condition: C
for: "5m"
data:
@@ -297,10 +297,10 @@ data:
to: 0
datasourceUid: atlas-vm
model:
expr: time() - max by (cronjob) (kube_cronjob_status_last_successful_time{namespace="maintenance",cronjob="image-sweeper"}) and on(cronjob) (kube_cronjob_spec_suspend{namespace="maintenance",cronjob="image-sweeper"} == 0)
expr: time() - ariadne_schedule_last_success_timestamp_seconds{task="schedule.image_sweeper"}
intervalMs: 60000
maxDataPoints: 43200
legendFormat: '{{cronjob}}'
legendFormat: '{{task}}'
datasource:
type: prometheus
uid: atlas-vm
@@ -321,17 +321,118 @@ data:
type: threshold
conditions:
- evaluator:
params: [10800]
params: [691200]
type: gt
operator:
type: and
reducer:
type: last
type: query
noDataState: NoData
noDataState: OK
execErrState: Error
annotations:
summary: "Maintenance cronjob stale >3h since last success"
summary: "Ariadne image sweeper stale >8d since last success"
labels:
severity: warning
# Rule group "ariadne": belongs to org 1, is filed under the "Alerts" folder and
# is evaluated on a 1m interval. The entries under `rules:` alert on the
# ariadne_schedule_* metrics.
# NOTE(review): indentation is not visible in this view; this looks like a
# Grafana alert-rule provisioning group — confirm nesting against the real file.
- orgId: 1
name: ariadne
folder: Alerts
interval: 1m
rules:
# Rule "ariadne-schedule-error": alerts when the last reported status of a
# schedule task is below 1. Condition C must hold for 10m before the rule fires.
# Pipeline: A (query) -> B (reduce: last) -> C (threshold: B < 1).
- uid: ariadne-schedule-error
title: "Ariadne schedule task failed"
condition: C
for: "10m"
data:
# A: query against datasource atlas-vm over the last 300 seconds (from: 300, to: 0).
- refId: A
relativeTimeRange:
from: 300
to: 0
datasourceUid: atlas-vm
model:
# Selects every series whose `task` label matches the regex schedule\..+
# (the backslash is doubled because it sits inside a PromQL double-quoted string).
# NOTE(review): presumably 1 means the last run succeeded and 0 that it
# failed — confirm against the metric's exporter.
expr: ariadne_schedule_last_status{task=~"schedule\\..+"}
intervalMs: 60000
maxDataPoints: 43200
# Each series is labelled by its `task` label value.
legendFormat: '{{task}}'
datasource:
type: prometheus
uid: atlas-vm
# B: reduces each series from A to its last value.
- refId: B
datasourceUid: __expr__
model:
expression: A
intervalMs: 60000
maxDataPoints: 43200
reducer: last
type: reduce
# C: threshold on B; true when the reduced value is less than 1.
- refId: C
datasourceUid: __expr__
model:
expression: B
intervalMs: 60000
maxDataPoints: 43200
type: threshold
conditions:
- evaluator:
params: [1]
type: lt
operator:
type: and
reducer:
type: last
type: query
# Missing data is mapped to the OK state; query execution errors to Error.
noDataState: OK
execErrState: Error
annotations:
# The summary interpolates the `task` label of the series being reported.
summary: "Ariadne schedule failed ({{ $labels.task }})"
labels:
severity: warning
# Rule "ariadne-scheduler-stalled": alerts when the current time is more than
# 900 seconds (15 minutes, matching the title) past a task's next-run timestamp.
# Condition C must hold for 10m before the rule fires.
# Pipeline: A (query) -> B (reduce: last) -> C (threshold: B > 900).
- uid: ariadne-scheduler-stalled
title: "Ariadne scheduler behind (>15m)"
condition: C
for: "10m"
data:
# A: query against datasource atlas-vm over the last 300 seconds (from: 300, to: 0).
- refId: A
relativeTimeRange:
from: 300
to: 0
datasourceUid: atlas-vm
model:
# Seconds elapsed since each task's next-run timestamp, for every series whose
# `task` label matches the regex schedule\..+ (backslash doubled inside the
# PromQL double-quoted string). The value is negative while the next run is
# still in the future.
expr: time() - ariadne_schedule_next_run_timestamp_seconds{task=~"schedule\\..+"}
intervalMs: 60000
maxDataPoints: 43200
# Each series is labelled by its `task` label value.
legendFormat: '{{task}}'
datasource:
type: prometheus
uid: atlas-vm
# B: reduces each series from A to its last value.
- refId: B
datasourceUid: __expr__
model:
expression: A
intervalMs: 60000
maxDataPoints: 43200
reducer: last
type: reduce
# C: threshold on B; true when the reduced value is greater than 900.
- refId: C
datasourceUid: __expr__
model:
expression: B
intervalMs: 60000
maxDataPoints: 43200
type: threshold
conditions:
- evaluator:
params: [900]
type: gt
operator:
type: and
reducer:
type: last
type: query
# Missing data is mapped to the OK state; query execution errors to Error.
# NOTE(review): with noDataState OK this rule stays quiet if the metric
# disappears entirely — confirm that is intended.
noDataState: OK
execErrState: Error
annotations:
# The summary interpolates the `task` label of the series being reported.
summary: "Ariadne scheduler behind for {{ $labels.task }}"
labels:
severity: warning
- orgId: 1