monitoring: tune cpu and maintenance alerts
This commit is contained in:
parent
19d10ce585
commit
c5a7eece35
@ -145,7 +145,7 @@ data:
|
|||||||
model:
|
model:
|
||||||
intervalMs: 60000
|
intervalMs: 60000
|
||||||
maxDataPoints: 43200
|
maxDataPoints: 43200
|
||||||
expr: avg_over_time((1 - avg by (instance) (rate(node_cpu_seconds_total{mode=\"idle\"}[5m]))) * 100)[10m:1m]
|
expr: avg_over_time((1 - avg by (instance) (rate(node_cpu_seconds_total{mode=\"idle\"}[5m]))) * 100)[10m:1m] * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\")
|
||||||
legendFormat: '{{instance}}'
|
legendFormat: '{{instance}}'
|
||||||
datasource:
|
datasource:
|
||||||
type: prometheus
|
type: prometheus
|
||||||
@ -175,9 +175,9 @@ data:
|
|||||||
type: last
|
type: last
|
||||||
type: query
|
type: query
|
||||||
noDataState: NoData
|
noDataState: NoData
|
||||||
execErrState: Error
|
execErrState: NoData
|
||||||
annotations:
|
annotations:
|
||||||
summary: "{{ $labels.instance }} CPU >90% for 10m"
|
summary: "{{ $labels.node }} CPU >90% for 10m"
|
||||||
labels:
|
labels:
|
||||||
severity: warning
|
severity: warning
|
||||||
- orgId: 1
|
- orgId: 1
|
||||||
@ -297,7 +297,7 @@ data:
|
|||||||
to: 0
|
to: 0
|
||||||
datasourceUid: atlas-vm
|
datasourceUid: atlas-vm
|
||||||
model:
|
model:
|
||||||
expr: time() - max by (cronjob) (kube_cronjob_status_last_successful_time{namespace="maintenance",cronjob="image-sweeper"})
|
expr: time() - max by (cronjob) (kube_cronjob_status_last_successful_time{namespace="maintenance",cronjob="image-sweeper"}) and on(cronjob) (kube_cronjob_spec_suspend{namespace="maintenance",cronjob="image-sweeper"} == 0)
|
||||||
intervalMs: 60000
|
intervalMs: 60000
|
||||||
maxDataPoints: 43200
|
maxDataPoints: 43200
|
||||||
legendFormat: '{{cronjob}}'
|
legendFormat: '{{cronjob}}'
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user