maintenance/monitoring: wire reciprocal metis hecate key + dampen alert flapping

Brad Stein 2026-04-05 13:51:57 -03:00
parent 8d1be9672c
commit 3ce7b2eeb7
2 changed files with 28 additions and 27 deletions

@@ -30,6 +30,7 @@ spec:
export METIS_SSH_KEY_BASTION="{{ .Data.data.bastion_pub }}"
export METIS_SSH_KEY_BRAD="{{ .Data.data.brad_pub }}"
export METIS_SSH_KEY_HECATE_TETHYS="{{ .Data.data.hecate_tethys_pub }}"
export METIS_SSH_KEY_HECATE_DB="{{ .Data.data.hecate_db_pub }}"
{{ end }}
vault.hashicorp.com/agent-inject-template-metis-runtime-env.sh: |
{{ with secret "kv/data/atlas/maintenance/metis-runtime" }}

@@ -32,14 +32,14 @@ data:
object_matchers:
- [severity, "=", "critical"]
group_wait: 30s
group_interval: 5m
repeat_interval: 2h
group_interval: 15m
repeat_interval: 4h
- receiver: email-admins
object_matchers:
- [severity, "=", "warning"]
group_wait: 5m
group_interval: 2h
repeat_interval: 24h
group_wait: 10m
group_interval: 4h
repeat_interval: 48h
rules.yaml: |
apiVersion: 1
groups:
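
A quick note on the notification-policy timings changed above: group_wait is how long a newly created alert group is held before its first notification goes out, group_interval is the minimum spacing between notifications when alerts are added to or resolved within an existing group, and repeat_interval is how often an unchanged, still-firing group is re-sent. The resulting policy reads roughly as below (a sketch assembled from the visible hunk; the receiver for the critical route sits outside it and is shown only as a placeholder):

  - receiver: critical-receiver        # placeholder; name not visible in this hunk
    object_matchers:
      - [severity, "=", "critical"]
    group_wait: 30s
    group_interval: 15m
    repeat_interval: 4h
  - receiver: email-admins
    object_matchers:
      - [severity, "=", "warning"]
    group_wait: 10m
    group_interval: 4h
    repeat_interval: 48h

The net effect is that the first critical notification still goes out after 30s, while repeats for long-running incidents become far less frequent.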
@@ -97,9 +97,9 @@ data:
labels:
severity: warning
- uid: disk-growth-1h
title: "Node rootfs growing fast (>1Gi in 1h)"
title: "Node rootfs growing fast (>3Gi in 1h)"
condition: C
for: "10m"
for: "30m"
data:
- refId: A
relativeTimeRange:
@@ -109,8 +109,8 @@ data:
model:
intervalMs: 60000
maxDataPoints: 43200
expr: increase((node_filesystem_size_bytes{mountpoint="/",fstype!~"tmpfs|overlay"} - node_filesystem_free_bytes{mountpoint="/",fstype!~"tmpfs|overlay"})[1h]) / 1024 / 1024 / 1024
legendFormat: '{{instance}}'
expr: max by (instance, node) ((increase((node_filesystem_size_bytes{mountpoint="/",fstype!~"tmpfs|overlay"} - node_filesystem_free_bytes{mountpoint="/",fstype!~"tmpfs|overlay"})[1h]) / 1024 / 1024 / 1024) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=""}, "node", "$1", "nodename", "(.*)"))
legendFormat: '{{node}}'
datasource:
type: prometheus
uid: atlas-vm
@@ -131,7 +131,7 @@ data:
type: threshold
conditions:
- evaluator:
params: [1]
params: [3]
type: gt
operator:
type: and
@@ -141,7 +141,7 @@ data:
noDataState: NoData
execErrState: Error
annotations:
summary: "{{ $labels.instance }} rootfs grew >1Gi in the last hour"
summary: "{{ $labels.node }} rootfs grew >3Gi in the last hour"
labels:
severity: warning
- orgId: 1
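
The rewritten disk-growth expression (and the CPU expression in the next group) multiplies the measured value by a join against node_uname_info so the alert carries a human-readable node label instead of only the scrape address. Since node_uname_info always has the value 1, the multiplication leaves the measurement unchanged; the join only copies the hostname across. The general shape, with <measurement> standing in for the actual left-hand query:

  # label_replace copies the kernel hostname (nodename) into a new "node" label;
  # on(instance) group_left(node) then attaches that label to the matching series.
  expr: >-
    <measurement>
    * on(instance) group_left(node)
    label_replace(node_uname_info{nodename!=""}, "node", "$1", "nodename", "(.*)")

This is what lets legendFormat and the summary annotation switch from {{instance}} to {{node}}.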
@@ -150,9 +150,9 @@ data:
interval: 1m
rules:
- uid: cpu-high-10m
title: "Node CPU high (>90% for 10m)"
title: "Node CPU high (>95% for 20m)"
condition: C
for: 10m
for: 20m
data:
- refId: A
relativeTimeRange:
@@ -162,8 +162,8 @@ data:
model:
intervalMs: 60000
maxDataPoints: 43200
expr: avg_over_time((1 - avg by (instance) (rate(node_cpu_seconds_total{mode="idle"}[5m]))) * 100)[10m:1m] * on(instance) group_left(node) label_replace(node_uname_info{nodename!=""}, "node", "$1", "nodename", "(.*)")
legendFormat: '{{instance}}'
expr: ((1 - avg by (instance) (rate(node_cpu_seconds_total{mode="idle"}[5m]))) * 100) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=""}, "node", "$1", "nodename", "(.*)")
legendFormat: '{{node}}'
datasource:
type: prometheus
uid: atlas-vm
@@ -184,7 +184,7 @@ data:
type: threshold
conditions:
- evaluator:
params: [90]
params: [95]
type: gt
operator:
type: and
@@ -194,7 +194,7 @@ data:
noDataState: NoData
execErrState: OK
annotations:
summary: "{{ $labels.node }} CPU >90% for 10m"
summary: "{{ $labels.node }} CPU >95% for 20m"
labels:
severity: warning
- orgId: 1
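
Decomposed, the CPU rule's core term (before the node-name join described above) is the busy fraction per instance, scaled to a percentage:

  # rate(...[5m]) on the idle-mode counter approximates the idle fraction per CPU;
  # averaging by instance collapses the CPUs, and 1 - idle is the busy fraction.
  expr: >-
    (1 - avg by (instance) (rate(node_cpu_seconds_total{mode="idle"}[5m]))) * 100

The evaluator then requires this value to stay above 95 for 20m before the warning fires.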
@@ -455,7 +455,7 @@ data:
- uid: ariadne-schedule-error
title: "Ariadne schedule task failed"
condition: C
for: "10m"
for: "15m"
data:
- refId: A
relativeTimeRange:
@@ -463,7 +463,7 @@ data:
to: 0
datasourceUid: atlas-vm
model:
expr: max by (task) (ariadne_schedule_last_status{task=~"schedule\\..+"})
expr: max by (task) (((time() - ariadne_schedule_last_success_timestamp_seconds{task=~"schedule\\..+"}) * on(task) group_left() (1 - ariadne_schedule_last_status{task=~"schedule\\..+"})))
intervalMs: 60000
maxDataPoints: 43200
legendFormat: '{{task}}'
@@ -487,8 +487,8 @@ data:
type: threshold
conditions:
- evaluator:
params: [1]
type: lt
params: [3600]
type: gt
operator:
type: and
reducer:
@@ -497,7 +497,7 @@ data:
noDataState: OK
execErrState: Error
annotations:
summary: "Ariadne schedule failed ({{ $labels.task }})"
summary: "Ariadne schedule has failed for >1h ({{ $labels.task }})"
labels:
severity: warning
- uid: ariadne-scheduler-stalled
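
The reworked ariadne-schedule-error expression only exceeds its threshold when a task is both currently failing and has gone without a success for more than an hour. Assuming ariadne_schedule_last_status is 1 on success and 0 on failure (consistent with the previous params [1] / lt evaluator), the factor (1 - status) zeroes the value for healthy tasks; otherwise the value is the age of the last success in seconds, compared against 3600. A worked example, using a hypothetical task name:

  # schedule.cleanup last succeeded 2h ago and its latest run failed:
  #   (time() - last_success) = 7200, (1 - last_status) = 1  ->  7200 > 3600, above threshold
  # schedule.cleanup succeeded 5m ago:
  #   (1 - last_status) = 0                                  ->  value 0, below threshold
  expr: >-
    max by (task) (
      (time() - ariadne_schedule_last_success_timestamp_seconds{task=~"schedule\\..+"})
      * on(task) group_left()
      (1 - ariadne_schedule_last_status{task=~"schedule\\..+"})
    )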
@@ -604,7 +604,7 @@ data:
- uid: postmark-api-down
title: "Postmark exporter down"
condition: C
for: "5m"
for: "20m"
data:
- refId: A
relativeTimeRange:
@@ -614,7 +614,7 @@ data:
model:
intervalMs: 60000
maxDataPoints: 43200
expr: max(postmark_api_up) or on() vector(0)
expr: avg_over_time(postmark_api_up[15m]) or on() vector(0)
legendFormat: api up
datasource:
type: prometheus
@@ -643,9 +643,9 @@ data:
reducer:
type: last
type: query
noDataState: OK
noDataState: NoData
execErrState: Error
annotations:
summary: "Postmark exporter reports API down"
summary: "Postmark exporter reports sustained API outage"
labels:
severity: critical
severity: warning
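
Averaging the probe smooths the Postmark check: assuming a 1m scrape interval, a single failed scrape leaves avg_over_time(postmark_api_up[15m]) around 14/15 ≈ 0.93, while a sustained outage drags the value toward 0 and still has to hold for the full 20m pending period before the (now warning-severity) alert fires. The reshaped query, with the fallback term keeping a series present when the exporter itself disappears:

  # Fraction of the last 15 minutes during which the exporter saw the API up;
  # or on() vector(0) pins the value to 0 if postmark_api_up is absent entirely.
  expr: >-
    avg_over_time(postmark_api_up[15m]) or on() vector(0)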