monitoring: alert on ariadne schedules
This commit is contained in:
parent
f913956d08
commit
44788b3132
@ -286,8 +286,8 @@ data:
|
||||
summary: "node-image-sweeper not fully ready"
|
||||
labels:
|
||||
severity: warning
|
||||
- uid: maint-cron-stale
|
||||
title: "Maintenance CronJobs stale (>3h since success)"
|
||||
- uid: maint-ariadne-image-sweeper-stale
|
||||
title: "Ariadne image sweeper stale (>8d since success)"
|
||||
condition: C
|
||||
for: "5m"
|
||||
data:
|
||||
@ -297,10 +297,10 @@ data:
|
||||
to: 0
|
||||
datasourceUid: atlas-vm
|
||||
model:
|
||||
expr: time() - max by (cronjob) (kube_cronjob_status_last_successful_time{namespace="maintenance",cronjob="image-sweeper"}) and on(cronjob) (kube_cronjob_spec_suspend{namespace="maintenance",cronjob="image-sweeper"} == 0)
|
||||
expr: time() - ariadne_schedule_last_success_timestamp_seconds{task="schedule.image_sweeper"}
|
||||
intervalMs: 60000
|
||||
maxDataPoints: 43200
|
||||
legendFormat: '{{cronjob}}'
|
||||
legendFormat: '{{task}}'
|
||||
datasource:
|
||||
type: prometheus
|
||||
uid: atlas-vm
|
||||
@ -321,17 +321,118 @@ data:
|
||||
type: threshold
|
||||
conditions:
|
||||
- evaluator:
|
||||
params: [10800]
|
||||
params: [691200]
|
||||
type: gt
|
||||
operator:
|
||||
type: and
|
||||
reducer:
|
||||
type: last
|
||||
type: query
|
||||
noDataState: NoData
|
||||
noDataState: OK
|
||||
execErrState: Error
|
||||
annotations:
|
||||
summary: "Maintenance cronjob stale >3h since last success"
|
||||
summary: "Ariadne image sweeper stale >8d since last success"
|
||||
labels:
|
||||
severity: warning
|
||||
- orgId: 1
|
||||
name: ariadne
|
||||
folder: Alerts
|
||||
interval: 1m
|
||||
rules:
|
||||
- uid: ariadne-schedule-error
|
||||
title: "Ariadne schedule task failed"
|
||||
condition: C
|
||||
for: "10m"
|
||||
data:
|
||||
- refId: A
|
||||
relativeTimeRange:
|
||||
from: 300
|
||||
to: 0
|
||||
datasourceUid: atlas-vm
|
||||
model:
|
||||
expr: ariadne_schedule_last_status{task=~"schedule\\..+"}
|
||||
intervalMs: 60000
|
||||
maxDataPoints: 43200
|
||||
legendFormat: '{{task}}'
|
||||
datasource:
|
||||
type: prometheus
|
||||
uid: atlas-vm
|
||||
- refId: B
|
||||
datasourceUid: __expr__
|
||||
model:
|
||||
expression: A
|
||||
intervalMs: 60000
|
||||
maxDataPoints: 43200
|
||||
reducer: last
|
||||
type: reduce
|
||||
- refId: C
|
||||
datasourceUid: __expr__
|
||||
model:
|
||||
expression: B
|
||||
intervalMs: 60000
|
||||
maxDataPoints: 43200
|
||||
type: threshold
|
||||
conditions:
|
||||
- evaluator:
|
||||
params: [1]
|
||||
type: lt
|
||||
operator:
|
||||
type: and
|
||||
reducer:
|
||||
type: last
|
||||
type: query
|
||||
noDataState: OK
|
||||
execErrState: Error
|
||||
annotations:
|
||||
summary: "Ariadne schedule failed ({{ $labels.task }})"
|
||||
labels:
|
||||
severity: warning
|
||||
- uid: ariadne-scheduler-stalled
|
||||
title: "Ariadne scheduler behind (>15m)"
|
||||
condition: C
|
||||
for: "10m"
|
||||
data:
|
||||
- refId: A
|
||||
relativeTimeRange:
|
||||
from: 300
|
||||
to: 0
|
||||
datasourceUid: atlas-vm
|
||||
model:
|
||||
expr: time() - ariadne_schedule_next_run_timestamp_seconds{task=~"schedule\\..+"}
|
||||
intervalMs: 60000
|
||||
maxDataPoints: 43200
|
||||
legendFormat: '{{task}}'
|
||||
datasource:
|
||||
type: prometheus
|
||||
uid: atlas-vm
|
||||
- refId: B
|
||||
datasourceUid: __expr__
|
||||
model:
|
||||
expression: A
|
||||
intervalMs: 60000
|
||||
maxDataPoints: 43200
|
||||
reducer: last
|
||||
type: reduce
|
||||
- refId: C
|
||||
datasourceUid: __expr__
|
||||
model:
|
||||
expression: B
|
||||
intervalMs: 60000
|
||||
maxDataPoints: 43200
|
||||
type: threshold
|
||||
conditions:
|
||||
- evaluator:
|
||||
params: [900]
|
||||
type: gt
|
||||
operator:
|
||||
type: and
|
||||
reducer:
|
||||
type: last
|
||||
type: query
|
||||
noDataState: OK
|
||||
execErrState: Error
|
||||
annotations:
|
||||
summary: "Ariadne scheduler behind for {{ $labels.task }}"
|
||||
labels:
|
||||
severity: warning
|
||||
- orgId: 1
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user