maintenance/monitoring: wire reciprocal metis hecate key + dampen alert flapping
This commit is contained in:
parent
8d1be9672c
commit
3ce7b2eeb7
@@ -30,6 +30,7 @@ spec:
|
|||||||
export METIS_SSH_KEY_BASTION="{{ .Data.data.bastion_pub }}"
|
export METIS_SSH_KEY_BASTION="{{ .Data.data.bastion_pub }}"
|
||||||
export METIS_SSH_KEY_BRAD="{{ .Data.data.brad_pub }}"
|
export METIS_SSH_KEY_BRAD="{{ .Data.data.brad_pub }}"
|
||||||
export METIS_SSH_KEY_HECATE_TETHYS="{{ .Data.data.hecate_tethys_pub }}"
|
export METIS_SSH_KEY_HECATE_TETHYS="{{ .Data.data.hecate_tethys_pub }}"
|
||||||
|
export METIS_SSH_KEY_HECATE_DB="{{ .Data.data.hecate_db_pub }}"
|
||||||
{{ end }}
|
{{ end }}
|
||||||
vault.hashicorp.com/agent-inject-template-metis-runtime-env.sh: |
|
vault.hashicorp.com/agent-inject-template-metis-runtime-env.sh: |
|
||||||
{{ with secret "kv/data/atlas/maintenance/metis-runtime" }}
|
{{ with secret "kv/data/atlas/maintenance/metis-runtime" }}
|
||||||
|
|||||||
@@ -32,14 +32,14 @@ data:
|
|||||||
object_matchers:
|
object_matchers:
|
||||||
- [severity, "=", "critical"]
|
- [severity, "=", "critical"]
|
||||||
group_wait: 30s
|
group_wait: 30s
|
||||||
group_interval: 5m
|
group_interval: 15m
|
||||||
repeat_interval: 2h
|
repeat_interval: 4h
|
||||||
- receiver: email-admins
|
- receiver: email-admins
|
||||||
object_matchers:
|
object_matchers:
|
||||||
- [severity, "=", "warning"]
|
- [severity, "=", "warning"]
|
||||||
group_wait: 5m
|
group_wait: 10m
|
||||||
group_interval: 2h
|
group_interval: 4h
|
||||||
repeat_interval: 24h
|
repeat_interval: 48h
|
||||||
rules.yaml: |
|
rules.yaml: |
|
||||||
apiVersion: 1
|
apiVersion: 1
|
||||||
groups:
|
groups:
|
||||||
@@ -97,9 +97,9 @@ data:
|
|||||||
labels:
|
labels:
|
||||||
severity: warning
|
severity: warning
|
||||||
- uid: disk-growth-1h
|
- uid: disk-growth-1h
|
||||||
title: "Node rootfs growing fast (>1Gi in 1h)"
|
title: "Node rootfs growing fast (>3Gi in 1h)"
|
||||||
condition: C
|
condition: C
|
||||||
for: "10m"
|
for: "30m"
|
||||||
data:
|
data:
|
||||||
- refId: A
|
- refId: A
|
||||||
relativeTimeRange:
|
relativeTimeRange:
|
||||||
@@ -109,8 +109,8 @@ data:
|
|||||||
model:
|
model:
|
||||||
intervalMs: 60000
|
intervalMs: 60000
|
||||||
maxDataPoints: 43200
|
maxDataPoints: 43200
|
||||||
expr: increase((node_filesystem_size_bytes{mountpoint="/",fstype!~"tmpfs|overlay"} - node_filesystem_free_bytes{mountpoint="/",fstype!~"tmpfs|overlay"})[1h]) / 1024 / 1024 / 1024
|
expr: max by (instance, node) ((increase((node_filesystem_size_bytes{mountpoint="/",fstype!~"tmpfs|overlay"} - node_filesystem_free_bytes{mountpoint="/",fstype!~"tmpfs|overlay"})[1h]) / 1024 / 1024 / 1024) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=""}, "node", "$1", "nodename", "(.*)"))
|
||||||
legendFormat: '{{instance}}'
|
legendFormat: '{{node}}'
|
||||||
datasource:
|
datasource:
|
||||||
type: prometheus
|
type: prometheus
|
||||||
uid: atlas-vm
|
uid: atlas-vm
|
||||||
@@ -131,7 +131,7 @@ data:
|
|||||||
type: threshold
|
type: threshold
|
||||||
conditions:
|
conditions:
|
||||||
- evaluator:
|
- evaluator:
|
||||||
params: [1]
|
params: [3]
|
||||||
type: gt
|
type: gt
|
||||||
operator:
|
operator:
|
||||||
type: and
|
type: and
|
||||||
@@ -141,7 +141,7 @@ data:
|
|||||||
noDataState: NoData
|
noDataState: NoData
|
||||||
execErrState: Error
|
execErrState: Error
|
||||||
annotations:
|
annotations:
|
||||||
summary: "{{ $labels.instance }} rootfs grew >1Gi in the last hour"
|
summary: "{{ $labels.node }} rootfs grew >3Gi in the last hour"
|
||||||
labels:
|
labels:
|
||||||
severity: warning
|
severity: warning
|
||||||
- orgId: 1
|
- orgId: 1
|
||||||
@@ -150,9 +150,9 @@ data:
|
|||||||
interval: 1m
|
interval: 1m
|
||||||
rules:
|
rules:
|
||||||
- uid: cpu-high-10m
|
- uid: cpu-high-10m
|
||||||
title: "Node CPU high (>90% for 10m)"
|
title: "Node CPU high (>95% for 20m)"
|
||||||
condition: C
|
condition: C
|
||||||
for: 10m
|
for: 20m
|
||||||
data:
|
data:
|
||||||
- refId: A
|
- refId: A
|
||||||
relativeTimeRange:
|
relativeTimeRange:
|
||||||
@@ -162,8 +162,8 @@ data:
|
|||||||
model:
|
model:
|
||||||
intervalMs: 60000
|
intervalMs: 60000
|
||||||
maxDataPoints: 43200
|
maxDataPoints: 43200
|
||||||
expr: avg_over_time((1 - avg by (instance) (rate(node_cpu_seconds_total{mode="idle"}[5m]))) * 100)[10m:1m] * on(instance) group_left(node) label_replace(node_uname_info{nodename!=""}, "node", "$1", "nodename", "(.*)")
|
expr: ((1 - avg by (instance) (rate(node_cpu_seconds_total{mode="idle"}[5m]))) * 100) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=""}, "node", "$1", "nodename", "(.*)")
|
||||||
legendFormat: '{{instance}}'
|
legendFormat: '{{node}}'
|
||||||
datasource:
|
datasource:
|
||||||
type: prometheus
|
type: prometheus
|
||||||
uid: atlas-vm
|
uid: atlas-vm
|
||||||
@@ -184,7 +184,7 @@ data:
|
|||||||
type: threshold
|
type: threshold
|
||||||
conditions:
|
conditions:
|
||||||
- evaluator:
|
- evaluator:
|
||||||
params: [90]
|
params: [95]
|
||||||
type: gt
|
type: gt
|
||||||
operator:
|
operator:
|
||||||
type: and
|
type: and
|
||||||
@@ -194,7 +194,7 @@ data:
|
|||||||
noDataState: NoData
|
noDataState: NoData
|
||||||
execErrState: OK
|
execErrState: OK
|
||||||
annotations:
|
annotations:
|
||||||
summary: "{{ $labels.node }} CPU >90% for 10m"
|
summary: "{{ $labels.node }} CPU >95% for 20m"
|
||||||
labels:
|
labels:
|
||||||
severity: warning
|
severity: warning
|
||||||
- orgId: 1
|
- orgId: 1
|
||||||
@@ -455,7 +455,7 @@ data:
|
|||||||
- uid: ariadne-schedule-error
|
- uid: ariadne-schedule-error
|
||||||
title: "Ariadne schedule task failed"
|
title: "Ariadne schedule task failed"
|
||||||
condition: C
|
condition: C
|
||||||
for: "10m"
|
for: "15m"
|
||||||
data:
|
data:
|
||||||
- refId: A
|
- refId: A
|
||||||
relativeTimeRange:
|
relativeTimeRange:
|
||||||
@@ -463,7 +463,7 @@ data:
|
|||||||
to: 0
|
to: 0
|
||||||
datasourceUid: atlas-vm
|
datasourceUid: atlas-vm
|
||||||
model:
|
model:
|
||||||
expr: max by (task) (ariadne_schedule_last_status{task=~"schedule\\..+"})
|
expr: max by (task) (((time() - ariadne_schedule_last_success_timestamp_seconds{task=~"schedule\\..+"}) * on(task) group_left() (1 - ariadne_schedule_last_status{task=~"schedule\\..+"})))
|
||||||
intervalMs: 60000
|
intervalMs: 60000
|
||||||
maxDataPoints: 43200
|
maxDataPoints: 43200
|
||||||
legendFormat: '{{task}}'
|
legendFormat: '{{task}}'
|
||||||
@@ -487,8 +487,8 @@ data:
|
|||||||
type: threshold
|
type: threshold
|
||||||
conditions:
|
conditions:
|
||||||
- evaluator:
|
- evaluator:
|
||||||
params: [1]
|
params: [3600]
|
||||||
type: lt
|
type: gt
|
||||||
operator:
|
operator:
|
||||||
type: and
|
type: and
|
||||||
reducer:
|
reducer:
|
||||||
@@ -497,7 +497,7 @@ data:
|
|||||||
noDataState: OK
|
noDataState: OK
|
||||||
execErrState: Error
|
execErrState: Error
|
||||||
annotations:
|
annotations:
|
||||||
summary: "Ariadne schedule failed ({{ $labels.task }})"
|
summary: "Ariadne schedule has failed for >1h ({{ $labels.task }})"
|
||||||
labels:
|
labels:
|
||||||
severity: warning
|
severity: warning
|
||||||
- uid: ariadne-scheduler-stalled
|
- uid: ariadne-scheduler-stalled
|
||||||
@@ -604,7 +604,7 @@ data:
|
|||||||
- uid: postmark-api-down
|
- uid: postmark-api-down
|
||||||
title: "Postmark exporter down"
|
title: "Postmark exporter down"
|
||||||
condition: C
|
condition: C
|
||||||
for: "5m"
|
for: "20m"
|
||||||
data:
|
data:
|
||||||
- refId: A
|
- refId: A
|
||||||
relativeTimeRange:
|
relativeTimeRange:
|
||||||
@@ -614,7 +614,7 @@ data:
|
|||||||
model:
|
model:
|
||||||
intervalMs: 60000
|
intervalMs: 60000
|
||||||
maxDataPoints: 43200
|
maxDataPoints: 43200
|
||||||
expr: max(postmark_api_up) or on() vector(0)
|
expr: avg_over_time(postmark_api_up[15m]) or on() vector(0)
|
||||||
legendFormat: api up
|
legendFormat: api up
|
||||||
datasource:
|
datasource:
|
||||||
type: prometheus
|
type: prometheus
|
||||||
@@ -643,9 +643,9 @@ data:
|
|||||||
reducer:
|
reducer:
|
||||||
type: last
|
type: last
|
||||||
type: query
|
type: query
|
||||||
noDataState: OK
|
noDataState: NoData
|
||||||
execErrState: Error
|
execErrState: Error
|
||||||
annotations:
|
annotations:
|
||||||
summary: "Postmark exporter reports API down"
|
summary: "Postmark exporter reports sustained API outage"
|
||||||
labels:
|
labels:
|
||||||
severity: critical
|
severity: warning
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user