monitoring: fix noisy grafana email alerts and reload rules
This commit is contained in:
parent
25f096924a
commit
0aeb08d375
@@ -22,7 +22,24 @@ data:
|
|||||||
- orgId: 1
|
- orgId: 1
|
||||||
receiver: email-admins
|
receiver: email-admins
|
||||||
group_by:
|
group_by:
|
||||||
|
- grafana_folder
|
||||||
- alertname
|
- alertname
|
||||||
|
group_wait: 1m
|
||||||
|
group_interval: 30m
|
||||||
|
repeat_interval: 12h
|
||||||
|
routes:
|
||||||
|
- receiver: email-admins
|
||||||
|
object_matchers:
|
||||||
|
- [severity, "=", "critical"]
|
||||||
|
group_wait: 30s
|
||||||
|
group_interval: 5m
|
||||||
|
repeat_interval: 2h
|
||||||
|
- receiver: email-admins
|
||||||
|
object_matchers:
|
||||||
|
- [severity, "=", "warning"]
|
||||||
|
group_wait: 5m
|
||||||
|
group_interval: 2h
|
||||||
|
repeat_interval: 24h
|
||||||
rules.yaml: |
|
rules.yaml: |
|
||||||
apiVersion: 1
|
apiVersion: 1
|
||||||
groups:
|
groups:
|
||||||
@@ -145,7 +162,7 @@ data:
|
|||||||
model:
|
model:
|
||||||
intervalMs: 60000
|
intervalMs: 60000
|
||||||
maxDataPoints: 43200
|
maxDataPoints: 43200
|
||||||
expr: avg_over_time((1 - avg by (instance) (rate(node_cpu_seconds_total{mode=\"idle\"}[5m]))) * 100)[10m:1m] * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\")
|
expr: avg_over_time((1 - avg by (instance) (rate(node_cpu_seconds_total{mode="idle"}[5m]))) * 100)[10m:1m] * on(instance) group_left(node) label_replace(node_uname_info{nodename!=""}, "node", "$1", "nodename", "(.*)")
|
||||||
legendFormat: '{{instance}}'
|
legendFormat: '{{instance}}'
|
||||||
datasource:
|
datasource:
|
||||||
type: prometheus
|
type: prometheus
|
||||||
@@ -286,8 +303,8 @@ data:
|
|||||||
summary: "node-image-sweeper not fully ready"
|
summary: "node-image-sweeper not fully ready"
|
||||||
labels:
|
labels:
|
||||||
severity: warning
|
severity: warning
|
||||||
- uid: maint-cron-stale
|
- uid: maint-ariadne-image-sweeper-stale
|
||||||
title: "Maintenance CronJobs stale (>3h since success)"
|
title: "Ariadne image sweeper stale (schedule >8d)"
|
||||||
condition: C
|
condition: C
|
||||||
for: "5m"
|
for: "5m"
|
||||||
data:
|
data:
|
||||||
@@ -297,10 +314,10 @@ data:
|
|||||||
to: 0
|
to: 0
|
||||||
datasourceUid: atlas-vm
|
datasourceUid: atlas-vm
|
||||||
model:
|
model:
|
||||||
expr: time() - max by (cronjob) (kube_cronjob_status_last_successful_time{namespace="maintenance",cronjob="image-sweeper"}) and on(cronjob) (kube_cronjob_spec_suspend{namespace="maintenance",cronjob="image-sweeper"} == 0)
|
expr: time() - ariadne_schedule_last_success_timestamp_seconds{task="schedule.image_sweeper"}
|
||||||
intervalMs: 60000
|
intervalMs: 60000
|
||||||
maxDataPoints: 43200
|
maxDataPoints: 43200
|
||||||
legendFormat: '{{cronjob}}'
|
legendFormat: '{{task}}'
|
||||||
datasource:
|
datasource:
|
||||||
type: prometheus
|
type: prometheus
|
||||||
uid: atlas-vm
|
uid: atlas-vm
|
||||||
@@ -321,17 +338,166 @@ data:
|
|||||||
type: threshold
|
type: threshold
|
||||||
conditions:
|
conditions:
|
||||||
- evaluator:
|
- evaluator:
|
||||||
params: [10800]
|
params: [691200]
|
||||||
type: gt
|
type: gt
|
||||||
operator:
|
operator:
|
||||||
type: and
|
type: and
|
||||||
reducer:
|
reducer:
|
||||||
type: last
|
type: last
|
||||||
type: query
|
type: query
|
||||||
noDataState: NoData
|
noDataState: OK
|
||||||
execErrState: Error
|
execErrState: Error
|
||||||
annotations:
|
annotations:
|
||||||
summary: "Maintenance cronjob stale >3h since last success"
|
summary: "Ariadne image sweeper stale >8d since last success"
|
||||||
|
labels:
|
||||||
|
severity: warning
|
||||||
|
- uid: maint-cron-stale
|
||||||
|
title: "Maintenance CronJobs stale (legacy disabled)"
|
||||||
|
condition: C
|
||||||
|
for: "5m"
|
||||||
|
data:
|
||||||
|
- refId: A
|
||||||
|
relativeTimeRange:
|
||||||
|
from: 300
|
||||||
|
to: 0
|
||||||
|
datasourceUid: atlas-vm
|
||||||
|
model:
|
||||||
|
expr: vector(0)
|
||||||
|
intervalMs: 60000
|
||||||
|
maxDataPoints: 43200
|
||||||
|
legendFormat: legacy
|
||||||
|
datasource:
|
||||||
|
type: prometheus
|
||||||
|
uid: atlas-vm
|
||||||
|
- refId: B
|
||||||
|
datasourceUid: __expr__
|
||||||
|
model:
|
||||||
|
expression: A
|
||||||
|
intervalMs: 60000
|
||||||
|
maxDataPoints: 43200
|
||||||
|
reducer: last
|
||||||
|
type: reduce
|
||||||
|
- refId: C
|
||||||
|
datasourceUid: __expr__
|
||||||
|
model:
|
||||||
|
expression: B
|
||||||
|
intervalMs: 60000
|
||||||
|
maxDataPoints: 43200
|
||||||
|
type: threshold
|
||||||
|
conditions:
|
||||||
|
- evaluator:
|
||||||
|
params: [1]
|
||||||
|
type: gt
|
||||||
|
operator:
|
||||||
|
type: and
|
||||||
|
reducer:
|
||||||
|
type: last
|
||||||
|
type: query
|
||||||
|
noDataState: OK
|
||||||
|
execErrState: OK
|
||||||
|
annotations:
|
||||||
|
summary: "Legacy cronjob alert disabled"
|
||||||
|
labels:
|
||||||
|
severity: info
|
||||||
|
- orgId: 1
|
||||||
|
name: ariadne
|
||||||
|
folder: Alerts
|
||||||
|
interval: 1m
|
||||||
|
rules:
|
||||||
|
- uid: ariadne-schedule-error
|
||||||
|
title: "Ariadne schedule task failed"
|
||||||
|
condition: C
|
||||||
|
for: "10m"
|
||||||
|
data:
|
||||||
|
- refId: A
|
||||||
|
relativeTimeRange:
|
||||||
|
from: 300
|
||||||
|
to: 0
|
||||||
|
datasourceUid: atlas-vm
|
||||||
|
model:
|
||||||
|
expr: max by (task) (ariadne_schedule_last_status{task=~"schedule\\..+"})
|
||||||
|
intervalMs: 60000
|
||||||
|
maxDataPoints: 43200
|
||||||
|
legendFormat: '{{task}}'
|
||||||
|
datasource:
|
||||||
|
type: prometheus
|
||||||
|
uid: atlas-vm
|
||||||
|
- refId: B
|
||||||
|
datasourceUid: __expr__
|
||||||
|
model:
|
||||||
|
expression: A
|
||||||
|
intervalMs: 60000
|
||||||
|
maxDataPoints: 43200
|
||||||
|
reducer: last
|
||||||
|
type: reduce
|
||||||
|
- refId: C
|
||||||
|
datasourceUid: __expr__
|
||||||
|
model:
|
||||||
|
expression: B
|
||||||
|
intervalMs: 60000
|
||||||
|
maxDataPoints: 43200
|
||||||
|
type: threshold
|
||||||
|
conditions:
|
||||||
|
- evaluator:
|
||||||
|
params: [1]
|
||||||
|
type: lt
|
||||||
|
operator:
|
||||||
|
type: and
|
||||||
|
reducer:
|
||||||
|
type: last
|
||||||
|
type: query
|
||||||
|
noDataState: OK
|
||||||
|
execErrState: Error
|
||||||
|
annotations:
|
||||||
|
summary: "Ariadne schedule failed ({{ $labels.task }})"
|
||||||
|
labels:
|
||||||
|
severity: warning
|
||||||
|
- uid: ariadne-scheduler-stalled
|
||||||
|
title: "Ariadne scheduler behind (>15m)"
|
||||||
|
condition: C
|
||||||
|
for: "10m"
|
||||||
|
data:
|
||||||
|
- refId: A
|
||||||
|
relativeTimeRange:
|
||||||
|
from: 300
|
||||||
|
to: 0
|
||||||
|
datasourceUid: atlas-vm
|
||||||
|
model:
|
||||||
|
expr: time() - ariadne_schedule_next_run_timestamp_seconds{task=~"schedule\\..+"}
|
||||||
|
intervalMs: 60000
|
||||||
|
maxDataPoints: 43200
|
||||||
|
legendFormat: '{{task}}'
|
||||||
|
datasource:
|
||||||
|
type: prometheus
|
||||||
|
uid: atlas-vm
|
||||||
|
- refId: B
|
||||||
|
datasourceUid: __expr__
|
||||||
|
model:
|
||||||
|
expression: A
|
||||||
|
intervalMs: 60000
|
||||||
|
maxDataPoints: 43200
|
||||||
|
reducer: last
|
||||||
|
type: reduce
|
||||||
|
- refId: C
|
||||||
|
datasourceUid: __expr__
|
||||||
|
model:
|
||||||
|
expression: B
|
||||||
|
intervalMs: 60000
|
||||||
|
maxDataPoints: 43200
|
||||||
|
type: threshold
|
||||||
|
conditions:
|
||||||
|
- evaluator:
|
||||||
|
params: [900]
|
||||||
|
type: gt
|
||||||
|
operator:
|
||||||
|
type: and
|
||||||
|
reducer:
|
||||||
|
type: last
|
||||||
|
type: query
|
||||||
|
noDataState: OK
|
||||||
|
execErrState: Error
|
||||||
|
annotations:
|
||||||
|
summary: "Ariadne scheduler behind for {{ $labels.task }}"
|
||||||
labels:
|
labels:
|
||||||
severity: warning
|
severity: warning
|
||||||
- orgId: 1
|
- orgId: 1
|
||||||
@@ -352,7 +518,7 @@ data:
|
|||||||
model:
|
model:
|
||||||
intervalMs: 60000
|
intervalMs: 60000
|
||||||
maxDataPoints: 43200
|
maxDataPoints: 43200
|
||||||
expr: POSTMARK_OUTBOUND_BOUNCE_RATE{window="1d"}
|
expr: max(postmark_outbound_bounce_rate{window="1d"}) or on() vector(0)
|
||||||
legendFormat: bounce 1d
|
legendFormat: bounce 1d
|
||||||
datasource:
|
datasource:
|
||||||
type: prometheus
|
type: prometheus
|
||||||
@@ -381,7 +547,7 @@ data:
|
|||||||
reducer:
|
reducer:
|
||||||
type: last
|
type: last
|
||||||
type: query
|
type: query
|
||||||
noDataState: NoData
|
noDataState: OK
|
||||||
execErrState: Error
|
execErrState: Error
|
||||||
annotations:
|
annotations:
|
||||||
summary: "Postmark 1d bounce rate >5%"
|
summary: "Postmark 1d bounce rate >5%"
|
||||||
@@ -400,7 +566,7 @@ data:
|
|||||||
model:
|
model:
|
||||||
intervalMs: 60000
|
intervalMs: 60000
|
||||||
maxDataPoints: 43200
|
maxDataPoints: 43200
|
||||||
expr: POSTMARK_API_UP
|
expr: max(postmark_api_up) or on() vector(0)
|
||||||
legendFormat: api up
|
legendFormat: api up
|
||||||
datasource:
|
datasource:
|
||||||
type: prometheus
|
type: prometheus
|
||||||
@@ -429,7 +595,7 @@ data:
|
|||||||
reducer:
|
reducer:
|
||||||
type: last
|
type: last
|
||||||
type: query
|
type: query
|
||||||
noDataState: NoData
|
noDataState: OK
|
||||||
execErrState: Error
|
execErrState: Error
|
||||||
annotations:
|
annotations:
|
||||||
summary: "Postmark exporter reports API down"
|
summary: "Postmark exporter reports API down"
|
||||||
|
|||||||
@@ -286,7 +286,7 @@ spec:
|
|||||||
podAnnotations:
|
podAnnotations:
|
||||||
vault.hashicorp.com/agent-inject: "true"
|
vault.hashicorp.com/agent-inject: "true"
|
||||||
vault.hashicorp.com/role: "monitoring"
|
vault.hashicorp.com/role: "monitoring"
|
||||||
monitoring.bstein.dev/restart-rev: "1"
|
monitoring.bstein.dev/restart-rev: "5"
|
||||||
vault.hashicorp.com/agent-inject-secret-grafana-env.sh: "kv/data/atlas/monitoring/grafana-admin"
|
vault.hashicorp.com/agent-inject-secret-grafana-env.sh: "kv/data/atlas/monitoring/grafana-admin"
|
||||||
vault.hashicorp.com/agent-inject-template-grafana-env.sh: |
|
vault.hashicorp.com/agent-inject-template-grafana-env.sh: |
|
||||||
{{ with secret "kv/data/atlas/monitoring/grafana-admin" }}
|
{{ with secret "kv/data/atlas/monitoring/grafana-admin" }}
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user