Compare commits
6 Commits
e93aa6e33b
...
6c3c1342cd
| Author | SHA1 | Date | |
|---|---|---|---|
| 6c3c1342cd | |||
| 7b43043838 | |||
| af74172b2d | |||
| df5ba74ab7 | |||
| 9e88b3fc88 | |||
| ca273c7337 |
@ -423,16 +423,17 @@ ARIADNE_SCHEDULE_LAST_ERROR_RANGE_HOURS = (
|
||||
"(time() - max_over_time(ariadne_schedule_last_error_timestamp_seconds[$__range])) / 3600"
|
||||
)
|
||||
ARIADNE_ACCESS_REQUESTS = "ariadne_access_requests_total"
|
||||
ARIADNE_CI_COVERAGE = 'ariadne_ci_coverage_percent{repo="ariadne"}'
|
||||
ARIADNE_CI_TESTS = 'ariadne_ci_tests_total{repo="ariadne"}'
|
||||
ARIADNE_TEST_SUCCESS_RATE = (
|
||||
TEST_REPO_SELECTOR = 'repo=~"ariadne|metis"'
|
||||
TEST_CI_COVERAGE = f'ariadne_ci_coverage_percent{{{TEST_REPO_SELECTOR}}}'
|
||||
TEST_CI_TESTS = f'ariadne_ci_tests_total{{{TEST_REPO_SELECTOR}}}'
|
||||
TEST_SUCCESS_RATE = (
|
||||
"100 * "
|
||||
'sum(max_over_time(ariadne_ci_tests_total{repo="ariadne",result="passed"}[30d])) '
|
||||
f'sum(max_over_time(ariadne_ci_tests_total{{{TEST_REPO_SELECTOR},result="passed"}}[30d])) '
|
||||
"/ clamp_min("
|
||||
'sum(max_over_time(ariadne_ci_tests_total{repo="ariadne",result=~"passed|failed|error"}[30d])), 1)'
|
||||
f'sum(max_over_time(ariadne_ci_tests_total{{{TEST_REPO_SELECTOR},result=~"passed|failed|error"}}[30d])), 1)'
|
||||
)
|
||||
ARIADNE_TEST_FAILURES_24H = (
|
||||
'sum by (result) (max_over_time(ariadne_ci_tests_total{repo="ariadne",result=~"failed|error"}[24h]))'
|
||||
TEST_FAILURES_24H = (
|
||||
f'sum by (result) (max_over_time(ariadne_ci_tests_total{{{TEST_REPO_SELECTOR},result=~"failed|error"}}[24h]))'
|
||||
)
|
||||
POSTGRES_CONN_USED = (
|
||||
'label_replace(sum(pg_stat_activity_count), "conn", "used", "__name__", ".*") '
|
||||
@ -1297,8 +1298,8 @@ def build_overview():
|
||||
panels.append(
|
||||
timeseries_panel(
|
||||
42,
|
||||
"Ariadne Test Success Rate",
|
||||
ARIADNE_TEST_SUCCESS_RATE,
|
||||
"Ariadne + Metis Test Success Rate",
|
||||
TEST_SUCCESS_RATE,
|
||||
{"h": 6, "w": 6, "x": 12, "y": 14},
|
||||
unit="percent",
|
||||
max_value=100,
|
||||
@ -1309,8 +1310,8 @@ def build_overview():
|
||||
panels.append(
|
||||
bargauge_panel(
|
||||
43,
|
||||
"Tests with Failures (24h)",
|
||||
ARIADNE_TEST_FAILURES_24H,
|
||||
"Ariadne + Metis Tests with Failures (24h)",
|
||||
TEST_FAILURES_24H,
|
||||
{"h": 6, "w": 6, "x": 18, "y": 14},
|
||||
unit="none",
|
||||
instant=True,
|
||||
@ -2656,8 +2657,8 @@ def build_jobs_dashboard():
|
||||
panels.append(
|
||||
stat_panel(
|
||||
17,
|
||||
"Ariadne CI Coverage (%)",
|
||||
ARIADNE_CI_COVERAGE,
|
||||
"Ariadne + Metis CI Coverage (%)",
|
||||
TEST_CI_COVERAGE,
|
||||
{"h": 6, "w": 4, "x": 8, "y": 11},
|
||||
unit="percent",
|
||||
decimals=1,
|
||||
@ -2668,8 +2669,8 @@ def build_jobs_dashboard():
|
||||
panels.append(
|
||||
table_panel(
|
||||
18,
|
||||
"Ariadne CI Tests (latest)",
|
||||
ARIADNE_CI_TESTS,
|
||||
"Ariadne + Metis CI Tests (latest)",
|
||||
TEST_CI_TESTS,
|
||||
{"h": 6, "w": 12, "x": 12, "y": 11},
|
||||
unit="none",
|
||||
transformations=[{"id": "labelsToFields", "options": {}}, {"id": "sortBy", "options": {"fields": ["Value"], "order": "desc"}}],
|
||||
|
||||
@ -10,6 +10,8 @@ spec:
|
||||
app: node-image-sweeper
|
||||
updateStrategy:
|
||||
type: RollingUpdate
|
||||
rollingUpdate:
|
||||
maxUnavailable: 100%
|
||||
template:
|
||||
metadata:
|
||||
labels:
|
||||
@ -29,6 +31,21 @@ spec:
|
||||
- name: node-image-sweeper
|
||||
image: python:3.12.9-alpine3.20
|
||||
command: ["/bin/sh", "/scripts/node_image_sweeper.sh"]
|
||||
env:
|
||||
- name: SWEEP_INTERVAL_SEC
|
||||
value: "21600"
|
||||
- name: HIGH_USAGE_PERCENT
|
||||
value: "70"
|
||||
- name: EMERGENCY_USAGE_PERCENT
|
||||
value: "80"
|
||||
- name: BASE_THRESHOLD_DAYS
|
||||
value: "14"
|
||||
- name: HIGH_USAGE_THRESHOLD_DAYS
|
||||
value: "3"
|
||||
- name: LOG_RETENTION_DAYS
|
||||
value: "7"
|
||||
- name: JOURNAL_MAX_SIZE
|
||||
value: "200M"
|
||||
securityContext:
|
||||
privileged: true
|
||||
runAsUser: 0
|
||||
|
||||
@ -2,26 +2,39 @@
|
||||
set -eu
|
||||
|
||||
ONE_SHOT=${ONE_SHOT:-false}
|
||||
THRESHOLD_DAYS=14
|
||||
SWEEP_INTERVAL_SEC=${SWEEP_INTERVAL_SEC:-21600}
|
||||
BASE_THRESHOLD_DAYS=${BASE_THRESHOLD_DAYS:-14}
|
||||
HIGH_USAGE_THRESHOLD_DAYS=${HIGH_USAGE_THRESHOLD_DAYS:-3}
|
||||
HIGH_USAGE_PERCENT=${HIGH_USAGE_PERCENT:-70}
|
||||
EMERGENCY_USAGE_PERCENT=${EMERGENCY_USAGE_PERCENT:-85}
|
||||
LOG_RETENTION_DAYS=${LOG_RETENTION_DAYS:-7}
|
||||
JOURNAL_MAX_SIZE=${JOURNAL_MAX_SIZE:-200M}
|
||||
SKIP="registry.k8s.io/pause k8s.gcr.io/pause rancher/mirrored-pause"
|
||||
|
||||
usage=$(df -P /host | awk 'NR==2 {gsub(/%/,"",$5); print $5}') || usage=""
|
||||
if [ -n "${usage}" ] && [ "${usage}" -ge 70 ]; then
|
||||
THRESHOLD_DAYS=3
|
||||
fi
|
||||
sweep_once() {
|
||||
usage=$(df -P /host | awk 'NR==2 {gsub(/%/,"",$5); print $5}') || usage=""
|
||||
threshold_days="${BASE_THRESHOLD_DAYS}"
|
||||
if [ -n "${usage}" ] && [ "${usage}" -ge "${HIGH_USAGE_PERCENT}" ]; then
|
||||
threshold_days="${HIGH_USAGE_THRESHOLD_DAYS}"
|
||||
fi
|
||||
|
||||
cutoff=$(python3 - <<'PY'
|
||||
import time, os
|
||||
print(int(time.time()) - int(os.environ.get("THRESHOLD_DAYS", "14")) * 86400)
|
||||
cutoff=$(THRESHOLD_DAYS="${threshold_days}" python3 - <<'PY'
|
||||
import os
|
||||
import time
|
||||
|
||||
days = int(os.environ.get("THRESHOLD_DAYS", "14"))
|
||||
print(int(time.time()) - days * 86400)
|
||||
PY
|
||||
)
|
||||
|
||||
RUNNING=$(chroot /host /bin/sh -c "crictl ps -a --quiet 2>/dev/null" | tr -s ' ' '\n' | sort -u | tr '\n' ' ')
|
||||
IMAGES_JSON=$(chroot /host /bin/sh -c "crictl images -o json 2>/dev/null" || echo '{}')
|
||||
RUNNING=$(chroot /host /bin/sh -c "crictl ps -a --quiet 2>/dev/null" | tr -s ' ' '\n' | sort -u | tr '\n' ' ')
|
||||
IMAGES_JSON=$(chroot /host /bin/sh -c "crictl images -o json 2>/dev/null" || echo '{}')
|
||||
|
||||
SKIP="registry.k8s.io/pause k8s.gcr.io/pause rancher/mirrored-pause"
|
||||
|
||||
prune_list=$(printf "%s" "${IMAGES_JSON}" | CUTOFF="${cutoff}" RUNNING="${RUNNING}" SKIP="${SKIP}" python3 - <<'PY'
|
||||
import json, os, sys, time
|
||||
prune_list=$(printf "%s" "${IMAGES_JSON}" | CUTOFF="${cutoff}" RUNNING="${RUNNING}" SKIP="${SKIP}" python3 - <<'PY'
|
||||
import json
|
||||
import os
|
||||
import sys
|
||||
import time
|
||||
|
||||
try:
|
||||
data = json.load(sys.stdin)
|
||||
@ -74,19 +87,33 @@ for p in prune:
|
||||
PY
|
||||
)
|
||||
|
||||
if [ -n "${prune_list}" ]; then
|
||||
printf "%s" "${prune_list}" | while read -r image_id; do
|
||||
if [ -n "${image_id}" ]; then
|
||||
chroot /host /bin/sh -c "crictl rmi --prune ${image_id}" || true
|
||||
fi
|
||||
done
|
||||
fi
|
||||
if [ -n "${prune_list}" ]; then
|
||||
printf "%s" "${prune_list}" | while read -r image_id; do
|
||||
if [ -n "${image_id}" ]; then
|
||||
chroot /host /bin/sh -c "crictl rmi --prune ${image_id}" || true
|
||||
fi
|
||||
done
|
||||
fi
|
||||
|
||||
find /host/var/lib/rancher/k3s/agent/images -type f -name "*.tar" -mtime +7 -print -delete 2>/dev/null || true
|
||||
find /host/var/lib/rancher/k3s/agent/containerd -maxdepth 1 -type f -mtime +7 -print -delete 2>/dev/null || true
|
||||
find /host/var/lib/rancher/k3s/agent/images -type f -name "*.tar" -mtime +7 -print -delete 2>/dev/null || true
|
||||
find /host/var/lib/rancher/k3s/agent/containerd -maxdepth 1 -type f -mtime +7 -print -delete 2>/dev/null || true
|
||||
|
||||
if [ -n "${usage}" ] && [ "${usage}" -ge "${EMERGENCY_USAGE_PERCENT}" ]; then
|
||||
# Emergency pass for rootfs pressure on SD-backed nodes.
|
||||
chroot /host /bin/sh -c "journalctl --vacuum-size='${JOURNAL_MAX_SIZE}' >/dev/null 2>&1 || true"
|
||||
find /host/var/log -type f -name "*.gz" -mtime +"${LOG_RETENTION_DAYS}" -print -delete 2>/dev/null || true
|
||||
find /host/var/log/pods -type f -name "*.log" -mtime +"${LOG_RETENTION_DAYS}" -print -delete 2>/dev/null || true
|
||||
chroot /host /bin/sh -c "if command -v apt-get >/dev/null 2>&1; then apt-get clean >/dev/null 2>&1 || true; fi"
|
||||
fi
|
||||
}
|
||||
|
||||
sweep_once
|
||||
|
||||
if [ "${ONE_SHOT}" = "true" ]; then
|
||||
exit 0
|
||||
fi
|
||||
|
||||
sleep infinity
|
||||
while true; do
|
||||
sleep "${SWEEP_INTERVAL_SEC}"
|
||||
sweep_once
|
||||
done
|
||||
|
||||
@ -1125,7 +1125,7 @@
|
||||
{
|
||||
"id": 17,
|
||||
"type": "stat",
|
||||
"title": "Ariadne CI Coverage (%)",
|
||||
"title": "Ariadne + Metis CI Coverage (%)",
|
||||
"datasource": {
|
||||
"type": "prometheus",
|
||||
"uid": "atlas-vm"
|
||||
@ -1138,7 +1138,7 @@
|
||||
},
|
||||
"targets": [
|
||||
{
|
||||
"expr": "ariadne_ci_coverage_percent{repo=\"ariadne\"}",
|
||||
"expr": "ariadne_ci_coverage_percent{repo=~\"ariadne|metis\"}",
|
||||
"refId": "A",
|
||||
"legendFormat": "{{branch}}",
|
||||
"instant": true
|
||||
@ -1188,7 +1188,7 @@
|
||||
{
|
||||
"id": 18,
|
||||
"type": "table",
|
||||
"title": "Ariadne CI Tests (latest)",
|
||||
"title": "Ariadne + Metis CI Tests (latest)",
|
||||
"datasource": {
|
||||
"type": "prometheus",
|
||||
"uid": "atlas-vm"
|
||||
@ -1201,7 +1201,7 @@
|
||||
},
|
||||
"targets": [
|
||||
{
|
||||
"expr": "ariadne_ci_tests_total{repo=\"ariadne\"}",
|
||||
"expr": "ariadne_ci_tests_total{repo=~\"ariadne|metis\"}",
|
||||
"refId": "A",
|
||||
"instant": true
|
||||
}
|
||||
|
||||
@ -1677,7 +1677,7 @@
|
||||
{
|
||||
"id": 42,
|
||||
"type": "timeseries",
|
||||
"title": "Ariadne Test Success Rate",
|
||||
"title": "Ariadne + Metis Test Success Rate",
|
||||
"datasource": {
|
||||
"type": "prometheus",
|
||||
"uid": "atlas-vm"
|
||||
@ -1690,7 +1690,7 @@
|
||||
},
|
||||
"targets": [
|
||||
{
|
||||
"expr": "100 * sum(max_over_time(ariadne_ci_tests_total{repo=\"ariadne\",result=\"passed\"}[30d])) / clamp_min(sum(max_over_time(ariadne_ci_tests_total{repo=\"ariadne\",result=~\"passed|failed|error\"}[30d])), 1)",
|
||||
"expr": "100 * sum(max_over_time(ariadne_ci_tests_total{repo=~\"ariadne|metis\",result=\"passed\"}[30d])) / clamp_min(sum(max_over_time(ariadne_ci_tests_total{repo=~\"ariadne|metis\",result=~\"passed|failed|error\"}[30d])), 1)",
|
||||
"refId": "A"
|
||||
}
|
||||
],
|
||||
@ -1714,7 +1714,7 @@
|
||||
{
|
||||
"id": 43,
|
||||
"type": "bargauge",
|
||||
"title": "Tests with Failures (24h)",
|
||||
"title": "Ariadne + Metis Tests with Failures (24h)",
|
||||
"datasource": {
|
||||
"type": "prometheus",
|
||||
"uid": "atlas-vm"
|
||||
@ -1727,7 +1727,7 @@
|
||||
},
|
||||
"targets": [
|
||||
{
|
||||
"expr": "sort_desc(sum by (result) (max_over_time(ariadne_ci_tests_total{repo=\"ariadne\",result=~\"failed|error\"}[24h])))",
|
||||
"expr": "sort_desc(sum by (result) (max_over_time(ariadne_ci_tests_total{repo=~\"ariadne|metis\",result=~\"failed|error\"}[24h])))",
|
||||
"refId": "A",
|
||||
"legendFormat": "{{result}}",
|
||||
"instant": true
|
||||
|
||||
@ -22,7 +22,24 @@ data:
|
||||
- orgId: 1
|
||||
receiver: email-admins
|
||||
group_by:
|
||||
- grafana_folder
|
||||
- alertname
|
||||
group_wait: 1m
|
||||
group_interval: 30m
|
||||
repeat_interval: 12h
|
||||
routes:
|
||||
- receiver: email-admins
|
||||
object_matchers:
|
||||
- [severity, "=", "critical"]
|
||||
group_wait: 30s
|
||||
group_interval: 5m
|
||||
repeat_interval: 2h
|
||||
- receiver: email-admins
|
||||
object_matchers:
|
||||
- [severity, "=", "warning"]
|
||||
group_wait: 5m
|
||||
group_interval: 2h
|
||||
repeat_interval: 24h
|
||||
rules.yaml: |
|
||||
apiVersion: 1
|
||||
groups:
|
||||
@ -32,7 +49,7 @@ data:
|
||||
interval: 1m
|
||||
rules:
|
||||
- uid: disk-pressure-root
|
||||
title: "Node rootfs high (>80%)"
|
||||
title: "Node rootfs high (>85%)"
|
||||
condition: C
|
||||
for: "10m"
|
||||
data:
|
||||
@ -66,7 +83,7 @@ data:
|
||||
type: threshold
|
||||
conditions:
|
||||
- evaluator:
|
||||
params: [80]
|
||||
params: [85]
|
||||
type: gt
|
||||
operator:
|
||||
type: and
|
||||
@ -76,7 +93,7 @@ data:
|
||||
noDataState: NoData
|
||||
execErrState: Error
|
||||
annotations:
|
||||
summary: "{{ $labels.node }} rootfs >80% for 10m"
|
||||
summary: "{{ $labels.node }} rootfs >85% for 10m"
|
||||
labels:
|
||||
severity: warning
|
||||
- uid: disk-growth-1h
|
||||
@ -501,7 +518,7 @@ data:
|
||||
model:
|
||||
intervalMs: 60000
|
||||
maxDataPoints: 43200
|
||||
expr: postmark_outbound_bounce_rate{window="1d"}
|
||||
expr: max(postmark_outbound_bounce_rate{window="1d"}) or on() vector(0)
|
||||
legendFormat: bounce 1d
|
||||
datasource:
|
||||
type: prometheus
|
||||
@ -530,7 +547,7 @@ data:
|
||||
reducer:
|
||||
type: last
|
||||
type: query
|
||||
noDataState: NoData
|
||||
noDataState: OK
|
||||
execErrState: Error
|
||||
annotations:
|
||||
summary: "Postmark 1d bounce rate >5%"
|
||||
@ -549,7 +566,7 @@ data:
|
||||
model:
|
||||
intervalMs: 60000
|
||||
maxDataPoints: 43200
|
||||
expr: min_over_time(max by (instance) (postmark_api_up)[5m])
|
||||
expr: max(postmark_api_up) or on() vector(0)
|
||||
legendFormat: api up
|
||||
datasource:
|
||||
type: prometheus
|
||||
@ -578,7 +595,7 @@ data:
|
||||
reducer:
|
||||
type: last
|
||||
type: query
|
||||
noDataState: NoData
|
||||
noDataState: OK
|
||||
execErrState: Error
|
||||
annotations:
|
||||
summary: "Postmark exporter reports API down"
|
||||
|
||||
@ -1134,7 +1134,7 @@ data:
|
||||
{
|
||||
"id": 17,
|
||||
"type": "stat",
|
||||
"title": "Ariadne CI Coverage (%)",
|
||||
"title": "Ariadne + Metis CI Coverage (%)",
|
||||
"datasource": {
|
||||
"type": "prometheus",
|
||||
"uid": "atlas-vm"
|
||||
@ -1147,7 +1147,7 @@ data:
|
||||
},
|
||||
"targets": [
|
||||
{
|
||||
"expr": "ariadne_ci_coverage_percent{repo=\"ariadne\"}",
|
||||
"expr": "ariadne_ci_coverage_percent{repo=~\"ariadne|metis\"}",
|
||||
"refId": "A",
|
||||
"legendFormat": "{{branch}}",
|
||||
"instant": true
|
||||
@ -1197,7 +1197,7 @@ data:
|
||||
{
|
||||
"id": 18,
|
||||
"type": "table",
|
||||
"title": "Ariadne CI Tests (latest)",
|
||||
"title": "Ariadne + Metis CI Tests (latest)",
|
||||
"datasource": {
|
||||
"type": "prometheus",
|
||||
"uid": "atlas-vm"
|
||||
@ -1210,7 +1210,7 @@ data:
|
||||
},
|
||||
"targets": [
|
||||
{
|
||||
"expr": "ariadne_ci_tests_total{repo=\"ariadne\"}",
|
||||
"expr": "ariadne_ci_tests_total{repo=~\"ariadne|metis\"}",
|
||||
"refId": "A",
|
||||
"instant": true
|
||||
}
|
||||
|
||||
@ -1686,7 +1686,7 @@ data:
|
||||
{
|
||||
"id": 42,
|
||||
"type": "timeseries",
|
||||
"title": "Ariadne Test Success Rate",
|
||||
"title": "Ariadne + Metis Test Success Rate",
|
||||
"datasource": {
|
||||
"type": "prometheus",
|
||||
"uid": "atlas-vm"
|
||||
@ -1699,7 +1699,7 @@ data:
|
||||
},
|
||||
"targets": [
|
||||
{
|
||||
"expr": "100 * sum(max_over_time(ariadne_ci_tests_total{repo=\"ariadne\",result=\"passed\"}[30d])) / clamp_min(sum(max_over_time(ariadne_ci_tests_total{repo=\"ariadne\",result=~\"passed|failed|error\"}[30d])), 1)",
|
||||
"expr": "100 * sum(max_over_time(ariadne_ci_tests_total{repo=~\"ariadne|metis\",result=\"passed\"}[30d])) / clamp_min(sum(max_over_time(ariadne_ci_tests_total{repo=~\"ariadne|metis\",result=~\"passed|failed|error\"}[30d])), 1)",
|
||||
"refId": "A"
|
||||
}
|
||||
],
|
||||
@ -1723,7 +1723,7 @@ data:
|
||||
{
|
||||
"id": 43,
|
||||
"type": "bargauge",
|
||||
"title": "Tests with Failures (24h)",
|
||||
"title": "Ariadne + Metis Tests with Failures (24h)",
|
||||
"datasource": {
|
||||
"type": "prometheus",
|
||||
"uid": "atlas-vm"
|
||||
@ -1736,7 +1736,7 @@ data:
|
||||
},
|
||||
"targets": [
|
||||
{
|
||||
"expr": "sort_desc(sum by (result) (max_over_time(ariadne_ci_tests_total{repo=\"ariadne\",result=~\"failed|error\"}[24h])))",
|
||||
"expr": "sort_desc(sum by (result) (max_over_time(ariadne_ci_tests_total{repo=~\"ariadne|metis\",result=~\"failed|error\"}[24h])))",
|
||||
"refId": "A",
|
||||
"legendFormat": "{{result}}",
|
||||
"instant": true
|
||||
|
||||
@ -286,7 +286,7 @@ spec:
|
||||
podAnnotations:
|
||||
vault.hashicorp.com/agent-inject: "true"
|
||||
vault.hashicorp.com/role: "monitoring"
|
||||
monitoring.bstein.dev/restart-rev: "4"
|
||||
monitoring.bstein.dev/restart-rev: "6"
|
||||
vault.hashicorp.com/agent-inject-secret-grafana-env.sh: "kv/data/atlas/monitoring/grafana-admin"
|
||||
vault.hashicorp.com/agent-inject-template-grafana-env.sh: |
|
||||
{{ with secret "kv/data/atlas/monitoring/grafana-admin" }}
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user