monitoring: tune cpu and maintenance alerts

This commit is contained in:
Brad Stein 2026-01-27 23:23:42 -03:00
parent 19d10ce585
commit c5a7eece35

View File

@@ -145,7 +145,7 @@ data:
model: model:
intervalMs: 60000 intervalMs: 60000
maxDataPoints: 43200 maxDataPoints: 43200
expr: avg_over_time((1 - avg by (instance) (rate(node_cpu_seconds_total{mode=\"idle\"}[5m]))) * 100)[10m:1m] expr: avg_over_time((1 - avg by (instance) (rate(node_cpu_seconds_total{mode=\"idle\"}[5m]))) * 100)[10m:1m] * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\")
legendFormat: '{{instance}}' legendFormat: '{{instance}}'
datasource: datasource:
type: prometheus type: prometheus
@@ -175,9 +175,9 @@ data:
type: last type: last
type: query type: query
noDataState: NoData noDataState: NoData
execErrState: Error execErrState: NoData
annotations: annotations:
summary: "{{ $labels.instance }} CPU >90% for 10m" summary: "{{ $labels.node }} CPU >90% for 10m"
labels: labels:
severity: warning severity: warning
- orgId: 1 - orgId: 1
@@ -297,7 +297,7 @@ data:
to: 0 to: 0
datasourceUid: atlas-vm datasourceUid: atlas-vm
model: model:
expr: time() - max by (cronjob) (kube_cronjob_status_last_successful_time{namespace="maintenance",cronjob="image-sweeper"}) expr: time() - max by (cronjob) (kube_cronjob_status_last_successful_time{namespace="maintenance",cronjob="image-sweeper"}) and on(cronjob) (kube_cronjob_spec_suspend{namespace="maintenance",cronjob="image-sweeper"} == 0)
intervalMs: 60000 intervalMs: 60000
maxDataPoints: 43200 maxDataPoints: 43200
legendFormat: '{{cronjob}}' legendFormat: '{{cronjob}}'