monitoring: maintenance panels, extra alerts, update overview

This commit is contained in:
Brad Stein 2026-01-11 02:28:39 -03:00
parent 33b89c7dc2
commit 54358df569
4 changed files with 530 additions and 0 deletions

View File

@ -1232,6 +1232,50 @@ def build_overview():
links=link_to("atlas-storage"), links=link_to("atlas-storage"),
) )
) )
# Panel 30: share of desired node-image-sweeper DaemonSet pods that are ready,
# as a percentage (ready / desired * 100).
#
# This is a "higher is better" metric. PERCENT_THRESHOLDS paints high values
# red (green -> yellow@50 -> orange@75 -> red@91.5), which would show a fully
# ready DaemonSet as red and a fully broken one as green. Use an explicit
# inverted scale instead: red is the base colour and green starts at 100%.
panels.append(
    stat_panel(
        30,
        "Maintenance Sweepers Ready",
        'kube_daemonset_status_number_ready{namespace="maintenance",daemonset="node-image-sweeper"} / on(namespace,daemonset) kube_daemonset_status_desired_number_scheduled{namespace="maintenance",daemonset="node-image-sweeper"} * 100',
        {"h": 6, "w": 8, "x": 0, "y": 80},
        unit="percent",
        thresholds={
            "mode": "absolute",
            "steps": [
                # Each step's value is the lower bound from which its colour applies.
                {"color": "red", "value": None},
                {"color": "orange", "value": 50},
                {"color": "yellow", "value": 75},
                {"color": "green", "value": 100},
            ],
        },
    )
)
# Panel 31: seconds elapsed since each watched maintenance CronJob
# (image-sweeper, grafana-smtp-sync) last completed successfully.
cron_freshness_thresholds = {
    "mode": "absolute",
    "steps": [
        {"color": "green", "value": None},
        {"color": "yellow", "value": 3600},  # one hour without a success
        {"color": "red", "value": 10800},  # three hours without a success
    ],
}
cron_freshness_panel = stat_panel(
    31,
    "Maintenance Cron Freshness (s)",
    'time() - max by (cronjob) (kube_cronjob_status_last_successful_time{namespace="maintenance",cronjob=~"image-sweeper|grafana-smtp-sync"})',
    {"h": 6, "w": 8, "x": 8, "y": 80},
    unit="s",
    thresholds=cron_freshness_thresholds,
)
panels.append(cron_freshness_panel)
# Panel 32: Postmark outbound bounce rate for the one-day window, in percent.
bounce_rate_thresholds = {
    "mode": "absolute",
    "steps": [
        {"color": "green", "value": None},
        {"color": "yellow", "value": 2},
        {"color": "red", "value": 5},
    ],
}
bounce_rate_panel = stat_panel(
    32,
    "Postmark Bounce Rate (1d)",
    'POSTMARK_OUTBOUND_BOUNCE_RATE{window="1d"}',
    {"h": 6, "w": 8, "x": 16, "y": 80},
    unit="percent",
    thresholds=bounce_rate_thresholds,
)
panels.append(bounce_rate_panel)
return { return {
"uid": "atlas-overview", "uid": "atlas-overview",

View File

@ -2160,6 +2160,202 @@
} }
} }
] ]
},
{
"id": 30,
"type": "stat",
"title": "Maintenance Sweepers Ready",
"datasource": {
"type": "prometheus",
"uid": "atlas-vm"
},
"gridPos": {
"h": 6,
"w": 8,
"x": 0,
"y": 80
},
"targets": [
{
"expr": "kube_daemonset_status_number_ready{namespace=\"maintenance\",daemonset=\"node-image-sweeper\"} / on(namespace,daemonset) kube_daemonset_status_desired_number_scheduled{namespace=\"maintenance\",daemonset=\"node-image-sweeper\"} * 100",
"refId": "A"
}
],
"fieldConfig": {
"defaults": {
"color": {
"mode": "thresholds"
},
"mappings": [],
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "red",
"value": null
},
{
"color": "orange",
"value": 50
},
{
"color": "yellow",
"value": 75
},
{
"color": "green",
"value": 100
}
]
},
"unit": "percent",
"custom": {
"displayMode": "auto"
}
},
"overrides": []
},
"options": {
"colorMode": "value",
"graphMode": "area",
"justifyMode": "center",
"reduceOptions": {
"calcs": [
"lastNotNull"
],
"fields": "",
"values": false
},
"textMode": "value"
}
},
{
"id": 31,
"type": "stat",
"title": "Maintenance Cron Freshness (s)",
"datasource": {
"type": "prometheus",
"uid": "atlas-vm"
},
"gridPos": {
"h": 6,
"w": 8,
"x": 8,
"y": 80
},
"targets": [
{
"expr": "time() - max by (cronjob) (kube_cronjob_status_last_successful_time{namespace=\"maintenance\",cronjob=~\"image-sweeper|grafana-smtp-sync\"})",
"refId": "A"
}
],
"fieldConfig": {
"defaults": {
"color": {
"mode": "thresholds"
},
"mappings": [],
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "green",
"value": null
},
{
"color": "yellow",
"value": 3600
},
{
"color": "red",
"value": 10800
}
]
},
"unit": "s",
"custom": {
"displayMode": "auto"
}
},
"overrides": []
},
"options": {
"colorMode": "value",
"graphMode": "area",
"justifyMode": "center",
"reduceOptions": {
"calcs": [
"lastNotNull"
],
"fields": "",
"values": false
},
"textMode": "value"
}
},
{
"id": 32,
"type": "stat",
"title": "Postmark Bounce Rate (1d)",
"datasource": {
"type": "prometheus",
"uid": "atlas-vm"
},
"gridPos": {
"h": 6,
"w": 8,
"x": 16,
"y": 80
},
"targets": [
{
"expr": "POSTMARK_OUTBOUND_BOUNCE_RATE{window=\"1d\"}",
"refId": "A"
}
],
"fieldConfig": {
"defaults": {
"color": {
"mode": "thresholds"
},
"mappings": [],
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "green",
"value": null
},
{
"color": "yellow",
"value": 2
},
{
"color": "red",
"value": 5
}
]
},
"unit": "percent",
"custom": {
"displayMode": "auto"
}
},
"overrides": []
},
"options": {
"colorMode": "value",
"graphMode": "area",
"justifyMode": "center",
"reduceOptions": {
"calcs": [
"lastNotNull"
],
"fields": "",
"values": false
},
"textMode": "value"
}
} }
], ],
"schemaVersion": 39, "schemaVersion": 39,

View File

@ -78,6 +78,53 @@ data:
summary: "{{ $labels.node }} rootfs >80% for 10m" summary: "{{ $labels.node }} rootfs >80% for 10m"
labels: labels:
severity: warning severity: warning
# Warns when a node's root filesystem usage grows by more than 1 GiB in an hour.
- uid: disk-growth-1h
  title: "Node rootfs growing fast (>1Gi in 1h)"
  condition: C
  data:
    # A: change in used bytes on "/" (size - free) over the last hour, in GiB.
    - refId: A
      relativeTimeRange:
        from: 3600
        to: 0
      datasourceUid: atlas-vm
      model:
        intervalMs: 60000
        maxDataPoints: 43200
        # Used space is a gauge, so use delta(): increase() is meant for counters
        # and treats every drop in usage as a counter reset, reporting the whole
        # post-drop value as growth. The range is applied to an expression rather
        # than a plain selector, hence the explicit subquery form "[1h:]".
        expr: delta((node_filesystem_size_bytes{mountpoint="/",fstype!~"tmpfs|overlay"} - node_filesystem_free_bytes{mountpoint="/",fstype!~"tmpfs|overlay"})[1h:]) / 1024 / 1024 / 1024
        legendFormat: '{{instance}}'
        datasource:
          type: prometheus
          uid: atlas-vm
    # B: collapse each series from A to its most recent value.
    - refId: B
      datasourceUid: __expr__
      model:
        expression: A
        intervalMs: 60000
        maxDataPoints: 43200
        reducer: last
        type: reduce
    # C: alert condition, true when B is greater than 1 (GiB).
    - refId: C
      datasourceUid: __expr__
      model:
        expression: B
        intervalMs: 60000
        maxDataPoints: 43200
        type: threshold
        conditions:
          - evaluator:
              params: [1]
              type: gt
            operator:
              type: and
            reducer:
              type: last
            type: query
  noDataState: NoData
  execErrState: Error
  annotations:
    summary: "{{ $labels.instance }} rootfs grew >1Gi in the last hour"
  labels:
    severity: warning
- orgId: 1 - orgId: 1
name: maintenance name: maintenance
folder: Alerts folder: Alerts
@ -130,6 +177,53 @@ data:
summary: "node-image-sweeper not fully ready" summary: "node-image-sweeper not fully ready"
labels: labels:
severity: warning severity: warning
# Warns when a watched maintenance CronJob (image-sweeper, grafana-smtp-sync)
# has gone more than three hours without a successful run.
- uid: maint-cron-stale
  title: "Maintenance CronJobs stale (>3h since success)"
  condition: C
  data:
    # A: seconds since the last successful run, one series per cronjob.
    - refId: A
      relativeTimeRange:
        # The query window must be non-empty (from greater than to); a 0/0
        # window is expected to be rejected by Grafana as an invalid relative
        # time range. Five minutes is enough for the "last" reducer below.
        from: 300
        to: 0
      datasourceUid: atlas-vm
      model:
        expr: time() - max by (cronjob) (kube_cronjob_status_last_successful_time{namespace="maintenance",cronjob=~"image-sweeper|grafana-smtp-sync"})
        intervalMs: 60000
        maxDataPoints: 43200
        legendFormat: '{{cronjob}}'
        datasource:
          type: prometheus
          uid: atlas-vm
    # B: collapse each series from A to its most recent value.
    - refId: B
      datasourceUid: __expr__
      model:
        expression: A
        intervalMs: 60000
        maxDataPoints: 43200
        reducer: last
        type: reduce
    # C: alert condition, true when B exceeds 10800 seconds (3 hours).
    - refId: C
      datasourceUid: __expr__
      model:
        expression: B
        intervalMs: 60000
        maxDataPoints: 43200
        type: threshold
        conditions:
          - evaluator:
              params: [10800]
              type: gt
            operator:
              type: and
            reducer:
              type: last
            type: query
  noDataState: NoData
  execErrState: Error
  annotations:
    # Name the offending cronjob: the query is grouped by the cronjob label and
    # two different jobs are watched by this rule.
    summary: "Maintenance cronjob {{ $labels.cronjob }} stale >3h since last success"
  labels:
    severity: warning
- orgId: 1 - orgId: 1
name: postmark name: postmark
folder: Alerts folder: Alerts

View File

@ -2169,6 +2169,202 @@ data:
} }
} }
] ]
},
{
"id": 30,
"type": "stat",
"title": "Maintenance Sweepers Ready",
"datasource": {
"type": "prometheus",
"uid": "atlas-vm"
},
"gridPos": {
"h": 6,
"w": 8,
"x": 0,
"y": 80
},
"targets": [
{
"expr": "kube_daemonset_status_number_ready{namespace=\"maintenance\",daemonset=\"node-image-sweeper\"} / on(namespace,daemonset) kube_daemonset_status_desired_number_scheduled{namespace=\"maintenance\",daemonset=\"node-image-sweeper\"} * 100",
"refId": "A"
}
],
"fieldConfig": {
"defaults": {
"color": {
"mode": "thresholds"
},
"mappings": [],
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "red",
"value": null
},
{
"color": "orange",
"value": 50
},
{
"color": "yellow",
"value": 75
},
{
"color": "green",
"value": 100
}
]
},
"unit": "percent",
"custom": {
"displayMode": "auto"
}
},
"overrides": []
},
"options": {
"colorMode": "value",
"graphMode": "area",
"justifyMode": "center",
"reduceOptions": {
"calcs": [
"lastNotNull"
],
"fields": "",
"values": false
},
"textMode": "value"
}
},
{
"id": 31,
"type": "stat",
"title": "Maintenance Cron Freshness (s)",
"datasource": {
"type": "prometheus",
"uid": "atlas-vm"
},
"gridPos": {
"h": 6,
"w": 8,
"x": 8,
"y": 80
},
"targets": [
{
"expr": "time() - max by (cronjob) (kube_cronjob_status_last_successful_time{namespace=\"maintenance\",cronjob=~\"image-sweeper|grafana-smtp-sync\"})",
"refId": "A"
}
],
"fieldConfig": {
"defaults": {
"color": {
"mode": "thresholds"
},
"mappings": [],
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "green",
"value": null
},
{
"color": "yellow",
"value": 3600
},
{
"color": "red",
"value": 10800
}
]
},
"unit": "s",
"custom": {
"displayMode": "auto"
}
},
"overrides": []
},
"options": {
"colorMode": "value",
"graphMode": "area",
"justifyMode": "center",
"reduceOptions": {
"calcs": [
"lastNotNull"
],
"fields": "",
"values": false
},
"textMode": "value"
}
},
{
"id": 32,
"type": "stat",
"title": "Postmark Bounce Rate (1d)",
"datasource": {
"type": "prometheus",
"uid": "atlas-vm"
},
"gridPos": {
"h": 6,
"w": 8,
"x": 16,
"y": 80
},
"targets": [
{
"expr": "POSTMARK_OUTBOUND_BOUNCE_RATE{window=\"1d\"}",
"refId": "A"
}
],
"fieldConfig": {
"defaults": {
"color": {
"mode": "thresholds"
},
"mappings": [],
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "green",
"value": null
},
{
"color": "yellow",
"value": 2
},
{
"color": "red",
"value": 5
}
]
},
"unit": "percent",
"custom": {
"displayMode": "auto"
}
},
"overrides": []
},
"options": {
"colorMode": "value",
"graphMode": "area",
"justifyMode": "center",
"reduceOptions": {
"calcs": [
"lastNotNull"
],
"fields": "",
"values": false
},
"textMode": "value"
}
} }
], ],
"schemaVersion": 39, "schemaVersion": 39,