diff --git a/services/monitoring/grafana-alerting-config.yaml b/services/monitoring/grafana-alerting-config.yaml
index 424b49f1..a9024843 100644
--- a/services/monitoring/grafana-alerting-config.yaml
+++ b/services/monitoring/grafana-alerting-config.yaml
@@ -639,6 +639,62 @@ data:
               summary: "Soteria saw >10 authorization denials in 15m"
             labels:
               severity: warning
+          # Warns when more than 8 soteria-backup-* Jobs in the maintenance namespace
+          # have a creation timestamp inside the last 10 minutes.
+          - uid: maint-soteria-backup-job-storm
+            title: "Soteria backup job creation spike"
+            condition: C
+            for: "5m"
+            data:
+              - refId: A
+                relativeTimeRange:
+                  from: 600
+                  to: 0
+                datasourceUid: atlas-vm
+                model:
+                  # kube_job_created is a gauge holding each Job's creation time as a
+                  # Unix timestamp, so increase() over it does not count Jobs. Count the
+                  # series whose timestamp is newer than now-600s; vector(0) covers "none".
+                  expr: count(kube_job_created{namespace="maintenance",job_name=~"soteria-backup-.*"} > (time() - 600)) or on() vector(0)
+                  intervalMs: 60000
+                  maxDataPoints: 43200
+                  legendFormat: soteria-backup-jobs-created-10m
+                  datasource:
+                    type: prometheus
+                    uid: atlas-vm
+              # Collapse the range result of A to its most recent sample.
+              - refId: B
+                datasourceUid: __expr__
+                model:
+                  expression: A
+                  intervalMs: 60000
+                  maxDataPoints: 43200
+                  reducer: last
+                  type: reduce
+              # Alert condition: the reduced value B is greater than 8.
+              - refId: C
+                datasourceUid: __expr__
+                model:
+                  expression: B
+                  intervalMs: 60000
+                  maxDataPoints: 43200
+                  type: threshold
+                  conditions:
+                    - evaluator:
+                        params: [8]
+                        type: gt
+                      operator:
+                        type: and
+                      reducer:
+                        type: last
+                      type: query
+            # Missing data is treated as healthy; a failing query raises the alert.
+            noDataState: OK
+            execErrState: Alerting
+            annotations:
+              summary: "Soteria created >8 backup jobs in 10m (possible scheduler storm)"
+            labels:
+              severity: warning
       - orgId: 1
         name: ariadne
         folder: Alerts