monitoring: fix typhon low-threshold alert semantics

This commit is contained in:
Brad Stein 2026-04-12 14:56:34 -03:00
parent a1c8a99866
commit a87a5f7bff

View File

@ -131,7 +131,7 @@ data:
type: threshold
conditions:
- evaluator:
params: [3]
params: [2]
type: gt
operator:
type: and
@ -578,7 +578,7 @@ data:
type: threshold
conditions:
- evaluator:
params: [10]
params: [9]
type: gt
operator:
type: and
@ -793,3 +793,440 @@ data:
summary: "Postmark exporter reports sustained API outage"
labels:
severity: warning
# Alert rule group "typhon": belongs to org 1, lives in the "Alerts" folder,
# and declares a 1m interval for the rules listed under `rules:`.
- orgId: 1
name: typhon
folder: Alerts
interval: 1m
rules:
# Exporter liveness: condition C must hold for 10m before the rule fires.
- uid: typhon-exporter-down
  title: 'Typhon exporter down (>10m)'
  condition: C
  for: '10m'
  data:
    # A: highest typhon_up over the last 600s; the vector(0) fallback makes
    # an absent series read as 0 instead of returning no data.
    - refId: A
      relativeTimeRange:
        from: 600
        to: 0
      datasourceUid: atlas-vm
      model:
        intervalMs: 60000
        maxDataPoints: 43200
        expr: 'max(typhon_up) or on() vector(0)'
        legendFormat: typhon_up
        datasource:
          type: prometheus
          uid: atlas-vm
    # B: collapse A to its last value.
    - refId: B
      datasourceUid: __expr__
      model:
        expression: A
        intervalMs: 60000
        maxDataPoints: 43200
        reducer: last
        type: reduce
    # C: threshold on B, true while B < 1.
    - refId: C
      datasourceUid: __expr__
      model:
        expression: B
        intervalMs: 60000
        maxDataPoints: 43200
        type: threshold
        conditions:
          - evaluator:
              params:
                - 1
              type: lt
            operator:
              type: and
            reducer:
              type: last
            type: query
  # Missing data and query errors are both mapped to Alerting for this rule.
  noDataState: Alerting
  execErrState: Alerting
  annotations:
    summary: 'Typhon has been down for >10m'
  labels:
    severity: critical
# Data freshness: condition C (data age above 180) must hold for 10m.
- uid: typhon-data-stale
  title: 'Typhon data stale (>180s for 10m)'
  condition: C
  for: '10m'
  data:
    # A: highest typhon_data_age_seconds over the last 600s. With the
    # vector(0) fallback an absent series reads as 0, which is below the
    # threshold, so a missing metric alone does not trip this rule.
    - refId: A
      relativeTimeRange:
        from: 600
        to: 0
      datasourceUid: atlas-vm
      model:
        intervalMs: 60000
        maxDataPoints: 43200
        expr: 'max(typhon_data_age_seconds) or on() vector(0)'
        legendFormat: data age
        datasource:
          type: prometheus
          uid: atlas-vm
    # B: collapse A to its last value.
    - refId: B
      datasourceUid: __expr__
      model:
        expression: A
        intervalMs: 60000
        maxDataPoints: 43200
        reducer: last
        type: reduce
    # C: threshold on B, true while B > 180.
    - refId: C
      datasourceUid: __expr__
      model:
        expression: B
        intervalMs: 60000
        maxDataPoints: 43200
        type: threshold
        conditions:
          - evaluator:
              params:
                - 180
              type: gt
            operator:
              type: and
            reducer:
              type: last
            type: query
  # NOTE(review): query A always yields a sample because of the fallback,
  # so noDataState is presumably never reached here - confirm.
  noDataState: NoData
  execErrState: Error
  annotations:
    summary: 'Typhon data age >180s for >10m'
  labels:
    severity: warning
# Auth failure burst: condition C (more than 3 auth errors in the trailing
# 10m window) must hold for 5m.
- uid: typhon-auth-failures
  title: "Typhon auth failures burst"
  condition: C
  for: "5m"
  data:
    # A: total increase of typhon_poll_errors_total{reason="auth"} over 10m;
    # the vector(0) fallback makes an absent series read as 0.
    - refId: A
      relativeTimeRange:
        from: 600
        to: 0
      datasourceUid: atlas-vm
      model:
        intervalMs: 60000
        maxDataPoints: 43200
        # Single-quoted so the inner double quotes reach the datasource
        # verbatim. A plain YAML scalar does not process backslash escapes,
        # so writing \" would send literal backslashes and break the selector.
        expr: 'sum(increase(typhon_poll_errors_total{reason="auth"}[10m])) or on() vector(0)'
        legendFormat: auth failures 10m
        datasource:
          type: prometheus
          uid: atlas-vm
    # B: collapse A to its last value.
    - refId: B
      datasourceUid: __expr__
      model:
        expression: A
        intervalMs: 60000
        maxDataPoints: 43200
        reducer: last
        type: reduce
    # C: threshold on B, true while B > 3.
    - refId: C
      datasourceUid: __expr__
      model:
        expression: B
        intervalMs: 60000
        maxDataPoints: 43200
        type: threshold
        conditions:
          - evaluator:
              params: [3]
              type: gt
            operator:
              type: and
            reducer:
              type: last
            type: query
  noDataState: NoData
  execErrState: Error
  annotations:
    summary: "Typhon auth failures exceeded threshold in 10m"
  labels:
    severity: critical
# API/timeout error burst: condition C (more than 10 matching poll errors in
# the trailing 15m window) must hold for 15m.
- uid: typhon-api-errors
  title: "Typhon API/timeouts burst"
  condition: C
  for: "15m"
  data:
    # A: total increase of typhon_poll_errors_total over 15m for reasons
    # matching api, timeout or unknown; the vector(0) fallback makes an
    # absent series read as 0.
    - refId: A
      relativeTimeRange:
        from: 900
        to: 0
      datasourceUid: atlas-vm
      model:
        intervalMs: 60000
        maxDataPoints: 43200
        # Single-quoted so the inner double quotes reach the datasource
        # verbatim. A plain YAML scalar does not process backslash escapes,
        # so writing \" would send literal backslashes and break the selector.
        expr: 'sum(increase(typhon_poll_errors_total{reason=~"api|timeout|unknown"}[15m])) or on() vector(0)'
        legendFormat: poll errors 15m
        datasource:
          type: prometheus
          uid: atlas-vm
    # B: collapse A to its last value.
    - refId: B
      datasourceUid: __expr__
      model:
        expression: A
        intervalMs: 60000
        maxDataPoints: 43200
        reducer: last
        type: reduce
    # C: threshold on B, true while B > 10.
    - refId: C
      datasourceUid: __expr__
      model:
        expression: B
        intervalMs: 60000
        maxDataPoints: 43200
        type: threshold
        conditions:
          - evaluator:
              params: [10]
              type: gt
            operator:
              type: and
            reducer:
              type: last
            type: query
  noDataState: OK
  execErrState: Error
  annotations:
    summary: "Typhon API/timeouts exceeded threshold in 15m"
  labels:
    severity: warning
# High temperature: condition C (reading above 34) must hold for 10m.
- uid: typhon-temp-critical
  title: 'Tent temperature critical (>34C)'
  condition: C
  for: '10m'
  data:
    # A: highest typhon_temperature_celsius over the last 600s; the
    # vector(0) fallback makes an absent series read as 0, which is below
    # the threshold.
    - refId: A
      relativeTimeRange:
        from: 600
        to: 0
      datasourceUid: atlas-vm
      model:
        intervalMs: 60000
        maxDataPoints: 43200
        expr: 'max(typhon_temperature_celsius) or on() vector(0)'
        legendFormat: max temp
        datasource:
          type: prometheus
          uid: atlas-vm
    # B: collapse A to its last value.
    - refId: B
      datasourceUid: __expr__
      model:
        expression: A
        intervalMs: 60000
        maxDataPoints: 43200
        reducer: last
        type: reduce
    # C: threshold on B, true while B > 34.
    - refId: C
      datasourceUid: __expr__
      model:
        expression: B
        intervalMs: 60000
        maxDataPoints: 43200
        type: threshold
        conditions:
          - evaluator:
              params:
                - 34
              type: gt
            operator:
              type: and
            reducer:
              type: last
            type: query
  noDataState: OK
  execErrState: Error
  annotations:
    summary: 'Typhon reports tent temperature >34C for >10m'
  labels:
    severity: critical
# High humidity: condition C (reading above 75) must hold for 20m.
- uid: typhon-humidity-high
  title: 'Tent humidity high (>75%)'
  condition: C
  for: '20m'
  data:
    # A: highest typhon_relative_humidity_percent over the last 1200s; the
    # vector(0) fallback makes an absent series read as 0, which is below
    # the threshold.
    - refId: A
      relativeTimeRange:
        from: 1200
        to: 0
      datasourceUid: atlas-vm
      model:
        intervalMs: 60000
        maxDataPoints: 43200
        expr: 'max(typhon_relative_humidity_percent) or on() vector(0)'
        legendFormat: max humidity
        datasource:
          type: prometheus
          uid: atlas-vm
    # B: collapse A to its last value.
    - refId: B
      datasourceUid: __expr__
      model:
        expression: A
        intervalMs: 60000
        maxDataPoints: 43200
        reducer: last
        type: reduce
    # C: threshold on B, true while B > 75.
    - refId: C
      datasourceUid: __expr__
      model:
        expression: B
        intervalMs: 60000
        maxDataPoints: 43200
        type: threshold
        conditions:
          - evaluator:
              params:
                - 75
              type: gt
            operator:
              type: and
            reducer:
              type: last
            type: query
  noDataState: OK
  execErrState: Error
  annotations:
    summary: 'Typhon reports relative humidity >75% for >20m'
  labels:
    severity: warning
# Low humidity: condition C (reading below 30) must hold for 20m.
- uid: typhon-humidity-low
  title: 'Tent humidity low (<30%)'
  condition: C
  for: '20m'
  data:
    # A: lowest typhon_relative_humidity_percent over the last 1200s.
    # No vector(0) fallback here: a synthetic 0 would sit below the
    # threshold and trip the rule whenever the series is absent.
    - refId: A
      relativeTimeRange:
        from: 1200
        to: 0
      datasourceUid: atlas-vm
      model:
        intervalMs: 60000
        maxDataPoints: 43200
        expr: 'min(typhon_relative_humidity_percent)'
        legendFormat: min humidity
        datasource:
          type: prometheus
          uid: atlas-vm
    # B: collapse A to its last value.
    - refId: B
      datasourceUid: __expr__
      model:
        expression: A
        intervalMs: 60000
        maxDataPoints: 43200
        reducer: last
        type: reduce
    # C: threshold on B, true while B < 30.
    - refId: C
      datasourceUid: __expr__
      model:
        expression: B
        intervalMs: 60000
        maxDataPoints: 43200
        type: threshold
        conditions:
          - evaluator:
              params:
                - 30
              type: lt
            operator:
              type: and
            reducer:
              type: last
            type: query
  # An absent series is mapped to OK instead of firing.
  noDataState: OK
  execErrState: Error
  annotations:
    summary: 'Typhon reports relative humidity <30% for >20m'
  labels:
    severity: warning
# High VPD: condition C (reading above 2.0) must hold for 20m.
- uid: typhon-vpd-high
  title: 'Tent VPD high (>2.0 kPa)'
  condition: C
  for: '20m'
  data:
    # A: highest typhon_vpd_kpa over the last 1200s; the vector(0) fallback
    # makes an absent series read as 0, which is below the threshold.
    - refId: A
      relativeTimeRange:
        from: 1200
        to: 0
      datasourceUid: atlas-vm
      model:
        intervalMs: 60000
        maxDataPoints: 43200
        expr: 'max(typhon_vpd_kpa) or on() vector(0)'
        legendFormat: max vpd
        datasource:
          type: prometheus
          uid: atlas-vm
    # B: collapse A to its last value.
    - refId: B
      datasourceUid: __expr__
      model:
        expression: A
        intervalMs: 60000
        maxDataPoints: 43200
        reducer: last
        type: reduce
    # C: threshold on B, true while B > 2.0.
    - refId: C
      datasourceUid: __expr__
      model:
        expression: B
        intervalMs: 60000
        maxDataPoints: 43200
        type: threshold
        conditions:
          - evaluator:
              params:
                - 2.0
              type: gt
            operator:
              type: and
            reducer:
              type: last
            type: query
  noDataState: OK
  execErrState: Error
  annotations:
    summary: 'Typhon reports VPD >2.0 kPa for >20m'
  labels:
    severity: warning
# Low VPD: condition C (reading below 0.4) must hold for 20m.
- uid: typhon-vpd-low
  title: 'Tent VPD low (<0.4 kPa)'
  condition: C
  for: '20m'
  data:
    # A: lowest typhon_vpd_kpa over the last 1200s. No vector(0) fallback
    # here: a synthetic 0 would sit below the threshold and trip the rule
    # whenever the series is absent.
    - refId: A
      relativeTimeRange:
        from: 1200
        to: 0
      datasourceUid: atlas-vm
      model:
        intervalMs: 60000
        maxDataPoints: 43200
        expr: 'min(typhon_vpd_kpa)'
        legendFormat: min vpd
        datasource:
          type: prometheus
          uid: atlas-vm
    # B: collapse A to its last value.
    - refId: B
      datasourceUid: __expr__
      model:
        expression: A
        intervalMs: 60000
        maxDataPoints: 43200
        reducer: last
        type: reduce
    # C: threshold on B, true while B < 0.4.
    - refId: C
      datasourceUid: __expr__
      model:
        expression: B
        intervalMs: 60000
        maxDataPoints: 43200
        type: threshold
        conditions:
          - evaluator:
              params:
                - 0.4
              type: lt
            operator:
              type: and
            reducer:
              type: last
            type: query
  # An absent series is mapped to OK instead of firing.
  noDataState: OK
  execErrState: Error
  annotations:
    summary: 'Typhon reports VPD <0.4 kPa for >20m'
  labels:
    severity: warning