From a87a5f7bff4ee18588f93033bf4feb173e6f7c0d Mon Sep 17 00:00:00 2001
From: Brad Stein
Date: Sun, 12 Apr 2026 14:56:34 -0300
Subject: [PATCH] monitoring: fix typhon low-threshold alert semantics

---
Review notes (text between the "---" line and the diffstat is ignored by git am):
* The two typhon_poll_errors_total selectors used to write their label values
  as \"...\" inside unquoted YAML scalars. YAML only interprets backslash
  escapes inside double-quoted scalars, so the backslashes reached the
  datasource literally. Both expressions are now single-quoted YAML scalars
  that contain plain double quotes.
* The low-threshold rules (humidity-low, vpd-low) deliberately have no
  "or on() vector(0)" fallback: a synthetic 0 would satisfy their "lt"
  evaluators whenever the metric is absent.
* Leading whitespace was lost in the copy this patch was rebuilt from. The YAML
  indentation below is a reconstruction; check it against the target file
  before applying.
* The blob ids on the "index" line predate the expression fix; regenerate the
  patch with git format-patch after amending the commit.

 .../monitoring/grafana-alerting-config.yaml   | 441 +++++++++++++++++-
 1 file changed, 439 insertions(+), 2 deletions(-)

diff --git a/services/monitoring/grafana-alerting-config.yaml b/services/monitoring/grafana-alerting-config.yaml
index 0dd06bf5..be765f73 100644
--- a/services/monitoring/grafana-alerting-config.yaml
+++ b/services/monitoring/grafana-alerting-config.yaml
@@ -131,7 +131,7 @@ data:
                   type: threshold
                   conditions:
                     - evaluator:
-                        params: [3]
+                        params: [2]
                         type: gt
                       operator:
                         type: and
@@ -578,7 +578,7 @@ data:
                   type: threshold
                   conditions:
                     - evaluator:
-                        params: [10]
+                        params: [9]
                         type: gt
                       operator:
                         type: and
@@ -793,3 +793,440 @@ data:
               summary: "Postmark exporter reports sustained API outage"
             labels:
               severity: warning
+      - orgId: 1
+        name: typhon
+        folder: Alerts
+        interval: 1m
+        rules:
+          - uid: typhon-exporter-down
+            title: "Typhon exporter down (>10m)"
+            condition: C
+            for: "10m"
+            data:
+              - refId: A
+                relativeTimeRange:
+                  from: 600
+                  to: 0
+                datasourceUid: atlas-vm
+                model:
+                  intervalMs: 60000
+                  maxDataPoints: 43200
+                  expr: max(typhon_up) or on() vector(0)
+                  legendFormat: typhon_up
+                  datasource:
+                    type: prometheus
+                    uid: atlas-vm
+              - refId: B
+                datasourceUid: __expr__
+                model:
+                  expression: A
+                  intervalMs: 60000
+                  maxDataPoints: 43200
+                  reducer: last
+                  type: reduce
+              - refId: C
+                datasourceUid: __expr__
+                model:
+                  expression: B
+                  intervalMs: 60000
+                  maxDataPoints: 43200
+                  type: threshold
+                  conditions:
+                    - evaluator:
+                        params: [1]
+                        type: lt
+                      operator:
+                        type: and
+                      reducer:
+                        type: last
+                      type: query
+            noDataState: Alerting
+            execErrState: Alerting
+            annotations:
+              summary: "Typhon has been down for >10m"
+            labels:
+              severity: critical
+          - uid: typhon-data-stale
+            title: "Typhon data stale (>180s for 10m)"
+            condition: C
+            for: "10m"
+            data:
+              - refId: A
+                relativeTimeRange:
+                  from: 600
+                  to: 0
+                datasourceUid: atlas-vm
+                model:
+                  intervalMs: 60000
+                  maxDataPoints: 43200
+                  expr: max(typhon_data_age_seconds) or on() vector(0)
+                  legendFormat: data age
+                  datasource:
+                    type: prometheus
+                    uid: atlas-vm
+              - refId: B
+                datasourceUid: __expr__
+                model:
+                  expression: A
+                  intervalMs: 60000
+                  maxDataPoints: 43200
+                  reducer: last
+                  type: reduce
+              - refId: C
+                datasourceUid: __expr__
+                model:
+                  expression: B
+                  intervalMs: 60000
+                  maxDataPoints: 43200
+                  type: threshold
+                  conditions:
+                    - evaluator:
+                        params: [180]
+                        type: gt
+                      operator:
+                        type: and
+                      reducer:
+                        type: last
+                      type: query
+            noDataState: NoData
+            execErrState: Error
+            annotations:
+              summary: "Typhon data age >180s for >10m"
+            labels:
+              severity: warning
+          - uid: typhon-auth-failures
+            title: "Typhon auth failures burst"
+            condition: C
+            for: "5m"
+            data:
+              - refId: A
+                relativeTimeRange:
+                  from: 600
+                  to: 0
+                datasourceUid: atlas-vm
+                model:
+                  intervalMs: 60000
+                  maxDataPoints: 43200
+                  expr: 'sum(increase(typhon_poll_errors_total{reason="auth"}[10m])) or on() vector(0)'
+                  legendFormat: auth failures 10m
+                  datasource:
+                    type: prometheus
+                    uid: atlas-vm
+              - refId: B
+                datasourceUid: __expr__
+                model:
+                  expression: A
+                  intervalMs: 60000
+                  maxDataPoints: 43200
+                  reducer: last
+                  type: reduce
+              - refId: C
+                datasourceUid: __expr__
+                model:
+                  expression: B
+                  intervalMs: 60000
+                  maxDataPoints: 43200
+                  type: threshold
+                  conditions:
+                    - evaluator:
+                        params: [3]
+                        type: gt
+                      operator:
+                        type: and
+                      reducer:
+                        type: last
+                      type: query
+            noDataState: NoData
+            execErrState: Error
+            annotations:
+              summary: "Typhon auth failures exceeded threshold in 10m"
+            labels:
+              severity: critical
+          - uid: typhon-api-errors
+            title: "Typhon API/timeouts burst"
+            condition: C
+            for: "15m"
+            data:
+              - refId: A
+                relativeTimeRange:
+                  from: 900
+                  to: 0
+                datasourceUid: atlas-vm
+                model:
+                  intervalMs: 60000
+                  maxDataPoints: 43200
+                  expr: 'sum(increase(typhon_poll_errors_total{reason=~"api|timeout|unknown"}[15m])) or on() vector(0)'
+                  legendFormat: poll errors 15m
+                  datasource:
+                    type: prometheus
+                    uid: atlas-vm
+              - refId: B
+                datasourceUid: __expr__
+                model:
+                  expression: A
+                  intervalMs: 60000
+                  maxDataPoints: 43200
+                  reducer: last
+                  type: reduce
+              - refId: C
+                datasourceUid: __expr__
+                model:
+                  expression: B
+                  intervalMs: 60000
+                  maxDataPoints: 43200
+                  type: threshold
+                  conditions:
+                    - evaluator:
+                        params: [10]
+                        type: gt
+                      operator:
+                        type: and
+                      reducer:
+                        type: last
+                      type: query
+            noDataState: OK
+            execErrState: Error
+            annotations:
+              summary: "Typhon API/timeouts exceeded threshold in 15m"
+            labels:
+              severity: warning
+          - uid: typhon-temp-critical
+            title: "Tent temperature critical (>34C)"
+            condition: C
+            for: "10m"
+            data:
+              - refId: A
+                relativeTimeRange:
+                  from: 600
+                  to: 0
+                datasourceUid: atlas-vm
+                model:
+                  intervalMs: 60000
+                  maxDataPoints: 43200
+                  expr: max(typhon_temperature_celsius) or on() vector(0)
+                  legendFormat: max temp
+                  datasource:
+                    type: prometheus
+                    uid: atlas-vm
+              - refId: B
+                datasourceUid: __expr__
+                model:
+                  expression: A
+                  intervalMs: 60000
+                  maxDataPoints: 43200
+                  reducer: last
+                  type: reduce
+              - refId: C
+                datasourceUid: __expr__
+                model:
+                  expression: B
+                  intervalMs: 60000
+                  maxDataPoints: 43200
+                  type: threshold
+                  conditions:
+                    - evaluator:
+                        params: [34]
+                        type: gt
+                      operator:
+                        type: and
+                      reducer:
+                        type: last
+                      type: query
+            noDataState: OK
+            execErrState: Error
+            annotations:
+              summary: "Typhon reports tent temperature >34C for >10m"
+            labels:
+              severity: critical
+          - uid: typhon-humidity-high
+            title: "Tent humidity high (>75%)"
+            condition: C
+            for: "20m"
+            data:
+              - refId: A
+                relativeTimeRange:
+                  from: 1200
+                  to: 0
+                datasourceUid: atlas-vm
+                model:
+                  intervalMs: 60000
+                  maxDataPoints: 43200
+                  expr: max(typhon_relative_humidity_percent) or on() vector(0)
+                  legendFormat: max humidity
+                  datasource:
+                    type: prometheus
+                    uid: atlas-vm
+              - refId: B
+                datasourceUid: __expr__
+                model:
+                  expression: A
+                  intervalMs: 60000
+                  maxDataPoints: 43200
+                  reducer: last
+                  type: reduce
+              - refId: C
+                datasourceUid: __expr__
+                model:
+                  expression: B
+                  intervalMs: 60000
+                  maxDataPoints: 43200
+                  type: threshold
+                  conditions:
+                    - evaluator:
+                        params: [75]
+                        type: gt
+                      operator:
+                        type: and
+                      reducer:
+                        type: last
+                      type: query
+            noDataState: OK
+            execErrState: Error
+            annotations:
+              summary: "Typhon reports relative humidity >75% for >20m"
+            labels:
+              severity: warning
+          - uid: typhon-humidity-low
+            title: "Tent humidity low (<30%)"
+            condition: C
+            for: "20m"
+            data:
+              - refId: A
+                relativeTimeRange:
+                  from: 1200
+                  to: 0
+                datasourceUid: atlas-vm
+                model:
+                  intervalMs: 60000
+                  maxDataPoints: 43200
+                  expr: min(typhon_relative_humidity_percent)
+                  legendFormat: min humidity
+                  datasource:
+                    type: prometheus
+                    uid: atlas-vm
+              - refId: B
+                datasourceUid: __expr__
+                model:
+                  expression: A
+                  intervalMs: 60000
+                  maxDataPoints: 43200
+                  reducer: last
+                  type: reduce
+              - refId: C
+                datasourceUid: __expr__
+                model:
+                  expression: B
+                  intervalMs: 60000
+                  maxDataPoints: 43200
+                  type: threshold
+                  conditions:
+                    - evaluator:
+                        params: [30]
+                        type: lt
+                      operator:
+                        type: and
+                      reducer:
+                        type: last
+                      type: query
+            noDataState: OK
+            execErrState: Error
+            annotations:
+              summary: "Typhon reports relative humidity <30% for >20m"
+            labels:
+              severity: warning
+          - uid: typhon-vpd-high
+            title: "Tent VPD high (>2.0 kPa)"
+            condition: C
+            for: "20m"
+            data:
+              - refId: A
+                relativeTimeRange:
+                  from: 1200
+                  to: 0
+                datasourceUid: atlas-vm
+                model:
+                  intervalMs: 60000
+                  maxDataPoints: 43200
+                  expr: max(typhon_vpd_kpa) or on() vector(0)
+                  legendFormat: max vpd
+                  datasource:
+                    type: prometheus
+                    uid: atlas-vm
+              - refId: B
+                datasourceUid: __expr__
+                model:
+                  expression: A
+                  intervalMs: 60000
+                  maxDataPoints: 43200
+                  reducer: last
+                  type: reduce
+              - refId: C
+                datasourceUid: __expr__
+                model:
+                  expression: B
+                  intervalMs: 60000
+                  maxDataPoints: 43200
+                  type: threshold
+                  conditions:
+                    - evaluator:
+                        params: [2.0]
+                        type: gt
+                      operator:
+                        type: and
+                      reducer:
+                        type: last
+                      type: query
+            noDataState: OK
+            execErrState: Error
+            annotations:
+              summary: "Typhon reports VPD >2.0 kPa for >20m"
+            labels:
+              severity: warning
+          - uid: typhon-vpd-low
+            title: "Tent VPD low (<0.4 kPa)"
+            condition: C
+            for: "20m"
+            data:
+              - refId: A
+                relativeTimeRange:
+                  from: 1200
+                  to: 0
+                datasourceUid: atlas-vm
+                model:
+                  intervalMs: 60000
+                  maxDataPoints: 43200
+                  expr: min(typhon_vpd_kpa)
+                  legendFormat: min vpd
+                  datasource:
+                    type: prometheus
+                    uid: atlas-vm
+              - refId: B
+                datasourceUid: __expr__
+                model:
+                  expression: A
+                  intervalMs: 60000
+                  maxDataPoints: 43200
+                  reducer: last
+                  type: reduce
+              - refId: C
+                datasourceUid: __expr__
+                model:
+                  expression: B
+                  intervalMs: 60000
+                  maxDataPoints: 43200
+                  type: threshold
+                  conditions:
+                    - evaluator:
+                        params: [0.4]
+                        type: lt
+                      operator:
+                        type: and
+                      reducer:
+                        type: last
+                      type: query
+            noDataState: OK
+            execErrState: Error
+            annotations:
+              summary: "Typhon reports VPD <0.4 kPa for >20m"
+            labels:
+              severity: warning