monitoring: tune cpu and maintenance alerts
This commit is contained in:
parent
19d10ce585
commit
c5a7eece35
@@ -145,7 +145,7 @@ data:
|
||||
model:
|
||||
intervalMs: 60000
|
||||
maxDataPoints: 43200
|
||||
expr: avg_over_time((1 - avg by (instance) (rate(node_cpu_seconds_total{mode=\"idle\"}[5m]))) * 100)[10m:1m]
|
||||
expr: avg_over_time((1 - avg by (instance) (rate(node_cpu_seconds_total{mode=\"idle\"}[5m]))) * 100)[10m:1m] * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\")
|
||||
legendFormat: '{{instance}}'
|
||||
datasource:
|
||||
type: prometheus
|
||||
@@ -175,9 +175,9 @@ data:
|
||||
type: last
|
||||
type: query
|
||||
noDataState: NoData
|
||||
execErrState: Error
|
||||
execErrState: NoData
|
||||
annotations:
|
||||
summary: "{{ $labels.instance }} CPU >90% for 10m"
|
||||
summary: "{{ $labels.node }} CPU >90% for 10m"
|
||||
labels:
|
||||
severity: warning
|
||||
- orgId: 1
|
||||
@@ -297,7 +297,7 @@ data:
|
||||
to: 0
|
||||
datasourceUid: atlas-vm
|
||||
model:
|
||||
expr: time() - max by (cronjob) (kube_cronjob_status_last_successful_time{namespace="maintenance",cronjob="image-sweeper"})
|
||||
expr: time() - max by (cronjob) (kube_cronjob_status_last_successful_time{namespace="maintenance",cronjob="image-sweeper"}) and on(cronjob) (kube_cronjob_spec_suspend{namespace="maintenance",cronjob="image-sweeper"} == 0)
|
||||
intervalMs: 60000
|
||||
maxDataPoints: 43200
|
||||
legendFormat: '{{cronjob}}'
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user