diff --git a/services/maintenance/metis-deployment.yaml b/services/maintenance/metis-deployment.yaml
index 7524b840..751f1615 100644
--- a/services/maintenance/metis-deployment.yaml
+++ b/services/maintenance/metis-deployment.yaml
@@ -30,6 +30,7 @@ spec:
           export METIS_SSH_KEY_BASTION="{{ .Data.data.bastion_pub }}"
           export METIS_SSH_KEY_BRAD="{{ .Data.data.brad_pub }}"
           export METIS_SSH_KEY_HECATE_TETHYS="{{ .Data.data.hecate_tethys_pub }}"
+          export METIS_SSH_KEY_HECATE_DB="{{ .Data.data.hecate_db_pub }}"
           {{ end }}
         vault.hashicorp.com/agent-inject-template-metis-runtime-env.sh: |
           {{ with secret "kv/data/atlas/maintenance/metis-runtime" }}
diff --git a/services/monitoring/grafana-alerting-config.yaml b/services/monitoring/grafana-alerting-config.yaml
index 934a0721..d7b39a2b 100644
--- a/services/monitoring/grafana-alerting-config.yaml
+++ b/services/monitoring/grafana-alerting-config.yaml
@@ -32,14 +32,14 @@ data:
         object_matchers:
           - [severity, "=", "critical"]
         group_wait: 30s
-        group_interval: 5m
-        repeat_interval: 2h
+        group_interval: 15m
+        repeat_interval: 4h
       - receiver: email-admins
         object_matchers:
           - [severity, "=", "warning"]
-        group_wait: 5m
-        group_interval: 2h
-        repeat_interval: 24h
+        group_wait: 10m
+        group_interval: 4h
+        repeat_interval: 48h
   rules.yaml: |
     apiVersion: 1
     groups:
@@ -97,9 +97,9 @@ data:
             labels:
               severity: warning
           - uid: disk-growth-1h
-            title: "Node rootfs growing fast (>1Gi in 1h)"
+            title: "Node rootfs growing fast (>3Gi in 1h)"
             condition: C
-            for: "10m"
+            for: "30m"
             data:
               - refId: A
                 relativeTimeRange:
@@ -109,8 +109,8 @@ data:
                 model:
                   intervalMs: 60000
                   maxDataPoints: 43200
-                  expr: increase((node_filesystem_size_bytes{mountpoint="/",fstype!~"tmpfs|overlay"} - node_filesystem_free_bytes{mountpoint="/",fstype!~"tmpfs|overlay"})[1h]) / 1024 / 1024 / 1024
-                  legendFormat: '{{instance}}'
+                  expr: max by (instance, node) ((increase((node_filesystem_size_bytes{mountpoint="/",fstype!~"tmpfs|overlay"} - node_filesystem_free_bytes{mountpoint="/",fstype!~"tmpfs|overlay"})[1h:]) / 1024 / 1024 / 1024) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=""}, "node", "$1", "nodename", "(.*)"))
+                  legendFormat: '{{node}}'
                   datasource:
                     type: prometheus
                     uid: atlas-vm
@@ -131,7 +131,7 @@ data:
                   type: threshold
                   conditions:
                     - evaluator:
-                        params: [1]
+                        params: [3]
                         type: gt
                       operator:
                         type: and
                       reducer:
                         type: last
                       type: query
@@ -141,7 +141,7 @@ data:
             noDataState: NoData
             execErrState: Error
             annotations:
-              summary: "{{ $labels.instance }} rootfs grew >1Gi in the last hour"
+              summary: "{{ $labels.node }} rootfs grew >3Gi in the last hour"
             labels:
               severity: warning
       - orgId: 1
@@ -150,9 +150,9 @@ data:
         interval: 1m
         rules:
          - uid: cpu-high-10m
-            title: "Node CPU high (>90% for 10m)"
+            title: "Node CPU high (>95% for 20m)"
            condition: C
-            for: 10m
+            for: 20m
            data:
              - refId: A
                relativeTimeRange:
@@ -162,8 +162,8 @@ data:
                model:
                  intervalMs: 60000
                  maxDataPoints: 43200
-                  expr: avg_over_time((1 - avg by (instance) (rate(node_cpu_seconds_total{mode="idle"}[5m]))) * 100)[10m:1m] * on(instance) group_left(node) label_replace(node_uname_info{nodename!=""}, "node", "$1", "nodename", "(.*)")
-                  legendFormat: '{{instance}}'
+                  expr: ((1 - avg by (instance) (rate(node_cpu_seconds_total{mode="idle"}[5m]))) * 100) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=""}, "node", "$1", "nodename", "(.*)")
+                  legendFormat: '{{node}}'
                  datasource:
                    type: prometheus
                    uid: atlas-vm
@@ -184,7 +184,7 @@ data:
                  type: threshold
                  conditions:
                    - evaluator:
-                        params: [90]
+                        params: [95]
                        type: gt
                      operator:
                        type: and
                      reducer:
                        type: last
                      type: query
@@ -194,7 +194,7 @@ data:
            noDataState: NoData
            execErrState: OK
            annotations:
-              summary: "{{ $labels.node }} CPU >90% for 10m"
+              summary: "{{ $labels.node }} CPU >95% for 20m"
            labels:
              severity: warning
      - orgId: 1
@@ -455,7 +455,7 @@ data:
          - uid: ariadne-schedule-error
            title: "Ariadne schedule task failed"
            condition: C
-            for: "10m"
+            for: "15m"
            data:
              - refId: A
                relativeTimeRange:
@@ -463,7 +463,7 @@ data:
                  to: 0
                datasourceUid: atlas-vm
                model:
-                  expr: max by (task) (ariadne_schedule_last_status{task=~"schedule\\..+"})
+                  expr: max by (task) (((time() - ariadne_schedule_last_success_timestamp_seconds{task=~"schedule\\..+"}) * on(task) group_left() (1 - ariadne_schedule_last_status{task=~"schedule\\..+"})))
                  intervalMs: 60000
                  maxDataPoints: 43200
                  legendFormat: '{{task}}'
@@ -487,8 +487,8 @@ data:
                  type: threshold
                  conditions:
                    - evaluator:
-                        params: [1]
-                        type: lt
+                        params: [3600]
+                        type: gt
                      operator:
                        type: and
                      reducer:
                        type: last
                      type: query
@@ -497,7 +497,7 @@ data:
            noDataState: OK
            execErrState: Error
            annotations:
-              summary: "Ariadne schedule failed ({{ $labels.task }})"
+              summary: "Ariadne schedule has failed for >1h ({{ $labels.task }})"
            labels:
              severity: warning
          - uid: ariadne-scheduler-stalled
@@ -604,7 +604,7 @@ data:
          - uid: postmark-api-down
            title: "Postmark exporter down"
            condition: C
-            for: "5m"
+            for: "20m"
            data:
              - refId: A
                relativeTimeRange:
@@ -614,7 +614,7 @@ data:
                  to: 0
                model:
                  intervalMs: 60000
                  maxDataPoints: 43200
-                  expr: max(postmark_api_up) or on() vector(0)
+                  expr: avg_over_time(postmark_api_up[15m]) or on() vector(0)
                  legendFormat: api up
                  datasource:
                    type: prometheus
@@ -643,9 +643,9 @@ data:
                      reducer:
                        type: last
                      type: query
-            noDataState: OK
+            noDataState: NoData
            execErrState: Error
            annotations:
-              summary: "Postmark exporter reports API down"
+              summary: "Postmark exporter reports sustained API outage"
            labels:
-              severity: critical
+              severity: warning
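
Reviewer note on the two densest new expressions; both can be checked in isolation in Grafana Explore against the atlas-vm datasource. A minimal PromQL sketch, with "up" standing in for any instance-keyed series; the success/failure encoding of ariadne_schedule_last_status (1 = success, 0 = failure) is inferred from the old "< 1" threshold this change removes:

    # Join used by the disk-growth and CPU rules: label_replace() copies the
    # nodename label of node_uname_info into a new "node" label, and the
    # many-to-one join attaches it to any series keyed by "instance".
    up * on(instance) group_left(node)
      label_replace(node_uname_info{nodename!=""}, "node", "$1", "nodename", "(.*)")

    # ariadne-schedule-error: seconds since the task last succeeded, multiplied
    # by (1 - last_status) so the age reads 0 while the task is healthy. The
    # new "> 3600" threshold therefore fires only once a currently failing
    # task's last success is more than an hour old, matching the new summary.
    (time() - ariadne_schedule_last_success_timestamp_seconds{task=~"schedule\\..+"})
      * on(task) group_left()
        (1 - ariadne_schedule_last_status{task=~"schedule\\..+"})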