maintenance/monitoring: wire reciprocal metis hecate key + dampen alert flapping
This commit is contained in:
parent
8d1be9672c
commit
3ce7b2eeb7
@ -30,6 +30,7 @@ spec:
|
||||
export METIS_SSH_KEY_BASTION="{{ .Data.data.bastion_pub }}"
|
||||
export METIS_SSH_KEY_BRAD="{{ .Data.data.brad_pub }}"
|
||||
export METIS_SSH_KEY_HECATE_TETHYS="{{ .Data.data.hecate_tethys_pub }}"
|
||||
export METIS_SSH_KEY_HECATE_DB="{{ .Data.data.hecate_db_pub }}"
|
||||
{{ end }}
|
||||
vault.hashicorp.com/agent-inject-template-metis-runtime-env.sh: |
|
||||
{{ with secret "kv/data/atlas/maintenance/metis-runtime" }}
|
||||
|
||||
@ -32,14 +32,14 @@ data:
|
||||
object_matchers:
|
||||
- [severity, "=", "critical"]
|
||||
group_wait: 30s
|
||||
group_interval: 5m
|
||||
repeat_interval: 2h
|
||||
group_interval: 15m
|
||||
repeat_interval: 4h
|
||||
- receiver: email-admins
|
||||
object_matchers:
|
||||
- [severity, "=", "warning"]
|
||||
group_wait: 5m
|
||||
group_interval: 2h
|
||||
repeat_interval: 24h
|
||||
group_wait: 10m
|
||||
group_interval: 4h
|
||||
repeat_interval: 48h
|
||||
rules.yaml: |
|
||||
apiVersion: 1
|
||||
groups:
|
||||
@ -97,9 +97,9 @@ data:
|
||||
labels:
|
||||
severity: warning
|
||||
- uid: disk-growth-1h
|
||||
title: "Node rootfs growing fast (>1Gi in 1h)"
|
||||
title: "Node rootfs growing fast (>3Gi in 1h)"
|
||||
condition: C
|
||||
for: "10m"
|
||||
for: "30m"
|
||||
data:
|
||||
- refId: A
|
||||
relativeTimeRange:
|
||||
@ -109,8 +109,8 @@ data:
|
||||
model:
|
||||
intervalMs: 60000
|
||||
maxDataPoints: 43200
|
||||
expr: increase((node_filesystem_size_bytes{mountpoint="/",fstype!~"tmpfs|overlay"} - node_filesystem_free_bytes{mountpoint="/",fstype!~"tmpfs|overlay"})[1h]) / 1024 / 1024 / 1024
|
||||
legendFormat: '{{instance}}'
|
||||
expr: max by (instance, node) ((increase((node_filesystem_size_bytes{mountpoint="/",fstype!~"tmpfs|overlay"} - node_filesystem_free_bytes{mountpoint="/",fstype!~"tmpfs|overlay"})[1h]) / 1024 / 1024 / 1024) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=""}, "node", "$1", "nodename", "(.*)"))
|
||||
legendFormat: '{{node}}'
|
||||
datasource:
|
||||
type: prometheus
|
||||
uid: atlas-vm
|
||||
@ -131,7 +131,7 @@ data:
|
||||
type: threshold
|
||||
conditions:
|
||||
- evaluator:
|
||||
params: [1]
|
||||
params: [3]
|
||||
type: gt
|
||||
operator:
|
||||
type: and
|
||||
@ -141,7 +141,7 @@ data:
|
||||
noDataState: NoData
|
||||
execErrState: Error
|
||||
annotations:
|
||||
summary: "{{ $labels.instance }} rootfs grew >1Gi in the last hour"
|
||||
summary: "{{ $labels.node }} rootfs grew >3Gi in the last hour"
|
||||
labels:
|
||||
severity: warning
|
||||
- orgId: 1
|
||||
@ -150,9 +150,9 @@ data:
|
||||
interval: 1m
|
||||
rules:
|
||||
- uid: cpu-high-10m
|
||||
title: "Node CPU high (>90% for 10m)"
|
||||
title: "Node CPU high (>95% for 20m)"
|
||||
condition: C
|
||||
for: 10m
|
||||
for: 20m
|
||||
data:
|
||||
- refId: A
|
||||
relativeTimeRange:
|
||||
@ -162,8 +162,8 @@ data:
|
||||
model:
|
||||
intervalMs: 60000
|
||||
maxDataPoints: 43200
|
||||
expr: avg_over_time((1 - avg by (instance) (rate(node_cpu_seconds_total{mode="idle"}[5m]))) * 100)[10m:1m] * on(instance) group_left(node) label_replace(node_uname_info{nodename!=""}, "node", "$1", "nodename", "(.*)")
|
||||
legendFormat: '{{instance}}'
|
||||
expr: ((1 - avg by (instance) (rate(node_cpu_seconds_total{mode="idle"}[5m]))) * 100) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=""}, "node", "$1", "nodename", "(.*)")
|
||||
legendFormat: '{{node}}'
|
||||
datasource:
|
||||
type: prometheus
|
||||
uid: atlas-vm
|
||||
@ -184,7 +184,7 @@ data:
|
||||
type: threshold
|
||||
conditions:
|
||||
- evaluator:
|
||||
params: [90]
|
||||
params: [95]
|
||||
type: gt
|
||||
operator:
|
||||
type: and
|
||||
@ -194,7 +194,7 @@ data:
|
||||
noDataState: NoData
|
||||
execErrState: OK
|
||||
annotations:
|
||||
summary: "{{ $labels.node }} CPU >90% for 10m"
|
||||
summary: "{{ $labels.node }} CPU >95% for 20m"
|
||||
labels:
|
||||
severity: warning
|
||||
- orgId: 1
|
||||
@ -455,7 +455,7 @@ data:
|
||||
- uid: ariadne-schedule-error
|
||||
title: "Ariadne schedule task failed"
|
||||
condition: C
|
||||
for: "10m"
|
||||
for: "15m"
|
||||
data:
|
||||
- refId: A
|
||||
relativeTimeRange:
|
||||
@ -463,7 +463,7 @@ data:
|
||||
to: 0
|
||||
datasourceUid: atlas-vm
|
||||
model:
|
||||
expr: max by (task) (ariadne_schedule_last_status{task=~"schedule\\..+"})
|
||||
expr: max by (task) (((time() - ariadne_schedule_last_success_timestamp_seconds{task=~"schedule\\..+"}) * on(task) group_left() (1 - ariadne_schedule_last_status{task=~"schedule\\..+"})))
|
||||
intervalMs: 60000
|
||||
maxDataPoints: 43200
|
||||
legendFormat: '{{task}}'
|
||||
@ -487,8 +487,8 @@ data:
|
||||
type: threshold
|
||||
conditions:
|
||||
- evaluator:
|
||||
params: [1]
|
||||
type: lt
|
||||
params: [3600]
|
||||
type: gt
|
||||
operator:
|
||||
type: and
|
||||
reducer:
|
||||
@ -497,7 +497,7 @@ data:
|
||||
noDataState: OK
|
||||
execErrState: Error
|
||||
annotations:
|
||||
summary: "Ariadne schedule failed ({{ $labels.task }})"
|
||||
summary: "Ariadne schedule has failed for >1h ({{ $labels.task }})"
|
||||
labels:
|
||||
severity: warning
|
||||
- uid: ariadne-scheduler-stalled
|
||||
@ -604,7 +604,7 @@ data:
|
||||
- uid: postmark-api-down
|
||||
title: "Postmark exporter down"
|
||||
condition: C
|
||||
for: "5m"
|
||||
for: "20m"
|
||||
data:
|
||||
- refId: A
|
||||
relativeTimeRange:
|
||||
@ -614,7 +614,7 @@ data:
|
||||
model:
|
||||
intervalMs: 60000
|
||||
maxDataPoints: 43200
|
||||
expr: max(postmark_api_up) or on() vector(0)
|
||||
expr: avg_over_time(postmark_api_up[15m]) or on() vector(0)
|
||||
legendFormat: api up
|
||||
datasource:
|
||||
type: prometheus
|
||||
@ -643,9 +643,9 @@ data:
|
||||
reducer:
|
||||
type: last
|
||||
type: query
|
||||
noDataState: OK
|
||||
noDataState: NoData
|
||||
execErrState: Error
|
||||
annotations:
|
||||
summary: "Postmark exporter reports API down"
|
||||
summary: "Postmark exporter reports sustained API outage"
|
||||
labels:
|
||||
severity: critical
|
||||
severity: warning
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user