diff --git a/services/monitoring/grafana-alerting-config.yaml b/services/monitoring/grafana-alerting-config.yaml
index 8713d3d..d97db15 100644
--- a/services/monitoring/grafana-alerting-config.yaml
+++ b/services/monitoring/grafana-alerting-config.yaml
@@ -145,7 +145,7 @@ data:
                 model:
                   intervalMs: 60000
                   maxDataPoints: 43200
-                  expr: avg_over_time((1 - avg by (instance) (rate(node_cpu_seconds_total{mode=\"idle\"}[5m]))) * 100)[10m:1m]
+                  expr: avg_over_time(((1 - avg by (instance) (rate(node_cpu_seconds_total{mode=\"idle\"}[5m]))) * 100)[10m:1m]) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\")
                   legendFormat: '{{instance}}'
                   datasource:
                     type: prometheus
@@ -175,9 +175,9 @@ data:
                         type: last
                       type: query
             noDataState: NoData
             execErrState: Error
             annotations:
-              summary: "{{ $labels.instance }} CPU >90% for 10m"
+              summary: "{{ $labels.node }} CPU >90% for 10m"
             labels:
               severity: warning
       - orgId: 1
@@ -297,7 +297,7 @@ data:
                   to: 0
                 datasourceUid: atlas-vm
                 model:
-                  expr: time() - max by (cronjob) (kube_cronjob_status_last_successful_time{namespace="maintenance",cronjob="image-sweeper"})
+                  expr: time() - max by (cronjob) (kube_cronjob_status_last_successful_time{namespace="maintenance",cronjob="image-sweeper"}) and on(cronjob) (kube_cronjob_spec_suspend{namespace="maintenance",cronjob="image-sweeper"} == 0)
                   intervalMs: 60000
                   maxDataPoints: 43200
                   legendFormat: '{{cronjob}}'