Compare commits

..

6 Commits

9 changed files with 125 additions and 63 deletions

View File

@ -423,16 +423,17 @@ ARIADNE_SCHEDULE_LAST_ERROR_RANGE_HOURS = (
"(time() - max_over_time(ariadne_schedule_last_error_timestamp_seconds[$__range])) / 3600"
)
ARIADNE_ACCESS_REQUESTS = "ariadne_access_requests_total"
ARIADNE_CI_COVERAGE = 'ariadne_ci_coverage_percent{repo="ariadne"}'
ARIADNE_CI_TESTS = 'ariadne_ci_tests_total{repo="ariadne"}'
ARIADNE_TEST_SUCCESS_RATE = (
TEST_REPO_SELECTOR = 'repo=~"ariadne|metis"'
TEST_CI_COVERAGE = f'ariadne_ci_coverage_percent{{{TEST_REPO_SELECTOR}}}'
TEST_CI_TESTS = f'ariadne_ci_tests_total{{{TEST_REPO_SELECTOR}}}'
TEST_SUCCESS_RATE = (
"100 * "
'sum(max_over_time(ariadne_ci_tests_total{repo="ariadne",result="passed"}[30d])) '
f'sum(max_over_time(ariadne_ci_tests_total{{{TEST_REPO_SELECTOR},result="passed"}}[30d])) '
"/ clamp_min("
'sum(max_over_time(ariadne_ci_tests_total{repo="ariadne",result=~"passed|failed|error"}[30d])), 1)'
f'sum(max_over_time(ariadne_ci_tests_total{{{TEST_REPO_SELECTOR},result=~"passed|failed|error"}}[30d])), 1)'
)
ARIADNE_TEST_FAILURES_24H = (
'sum by (result) (max_over_time(ariadne_ci_tests_total{repo="ariadne",result=~"failed|error"}[24h]))'
TEST_FAILURES_24H = (
f'sum by (result) (max_over_time(ariadne_ci_tests_total{{{TEST_REPO_SELECTOR},result=~"failed|error"}}[24h]))'
)
POSTGRES_CONN_USED = (
'label_replace(sum(pg_stat_activity_count), "conn", "used", "__name__", ".*") '
@ -1297,8 +1298,8 @@ def build_overview():
panels.append(
timeseries_panel(
42,
"Ariadne Test Success Rate",
ARIADNE_TEST_SUCCESS_RATE,
"Ariadne + Metis Test Success Rate",
TEST_SUCCESS_RATE,
{"h": 6, "w": 6, "x": 12, "y": 14},
unit="percent",
max_value=100,
@ -1309,8 +1310,8 @@ def build_overview():
panels.append(
bargauge_panel(
43,
"Tests with Failures (24h)",
ARIADNE_TEST_FAILURES_24H,
"Ariadne + Metis Tests with Failures (24h)",
TEST_FAILURES_24H,
{"h": 6, "w": 6, "x": 18, "y": 14},
unit="none",
instant=True,
@ -2656,8 +2657,8 @@ def build_jobs_dashboard():
panels.append(
stat_panel(
17,
"Ariadne CI Coverage (%)",
ARIADNE_CI_COVERAGE,
"Ariadne + Metis CI Coverage (%)",
TEST_CI_COVERAGE,
{"h": 6, "w": 4, "x": 8, "y": 11},
unit="percent",
decimals=1,
@ -2668,8 +2669,8 @@ def build_jobs_dashboard():
panels.append(
table_panel(
18,
"Ariadne CI Tests (latest)",
ARIADNE_CI_TESTS,
"Ariadne + Metis CI Tests (latest)",
TEST_CI_TESTS,
{"h": 6, "w": 12, "x": 12, "y": 11},
unit="none",
transformations=[{"id": "labelsToFields", "options": {}}, {"id": "sortBy", "options": {"fields": ["Value"], "order": "desc"}}],

View File

@ -10,6 +10,8 @@ spec:
app: node-image-sweeper
updateStrategy:
type: RollingUpdate
rollingUpdate:
maxUnavailable: 100%
template:
metadata:
labels:
@ -29,6 +31,21 @@ spec:
- name: node-image-sweeper
image: python:3.12.9-alpine3.20
command: ["/bin/sh", "/scripts/node_image_sweeper.sh"]
env:
- name: SWEEP_INTERVAL_SEC
value: "21600"
- name: HIGH_USAGE_PERCENT
value: "70"
- name: EMERGENCY_USAGE_PERCENT
value: "80"
- name: BASE_THRESHOLD_DAYS
value: "14"
- name: HIGH_USAGE_THRESHOLD_DAYS
value: "3"
- name: LOG_RETENTION_DAYS
value: "7"
- name: JOURNAL_MAX_SIZE
value: "200M"
securityContext:
privileged: true
runAsUser: 0

View File

@ -2,26 +2,39 @@
set -eu
ONE_SHOT=${ONE_SHOT:-false}
THRESHOLD_DAYS=14
SWEEP_INTERVAL_SEC=${SWEEP_INTERVAL_SEC:-21600}
BASE_THRESHOLD_DAYS=${BASE_THRESHOLD_DAYS:-14}
HIGH_USAGE_THRESHOLD_DAYS=${HIGH_USAGE_THRESHOLD_DAYS:-3}
HIGH_USAGE_PERCENT=${HIGH_USAGE_PERCENT:-70}
EMERGENCY_USAGE_PERCENT=${EMERGENCY_USAGE_PERCENT:-85}
LOG_RETENTION_DAYS=${LOG_RETENTION_DAYS:-7}
JOURNAL_MAX_SIZE=${JOURNAL_MAX_SIZE:-200M}
SKIP="registry.k8s.io/pause k8s.gcr.io/pause rancher/mirrored-pause"
usage=$(df -P /host | awk 'NR==2 {gsub(/%/,"",$5); print $5}') || usage=""
if [ -n "${usage}" ] && [ "${usage}" -ge 70 ]; then
THRESHOLD_DAYS=3
fi
sweep_once() {
usage=$(df -P /host | awk 'NR==2 {gsub(/%/,"",$5); print $5}') || usage=""
threshold_days="${BASE_THRESHOLD_DAYS}"
if [ -n "${usage}" ] && [ "${usage}" -ge "${HIGH_USAGE_PERCENT}" ]; then
threshold_days="${HIGH_USAGE_THRESHOLD_DAYS}"
fi
cutoff=$(python3 - <<'PY'
import time, os
print(int(time.time()) - int(os.environ.get("THRESHOLD_DAYS", "14")) * 86400)
cutoff=$(THRESHOLD_DAYS="${threshold_days}" python3 - <<'PY'
import os
import time
days = int(os.environ.get("THRESHOLD_DAYS", "14"))
print(int(time.time()) - days * 86400)
PY
)
RUNNING=$(chroot /host /bin/sh -c "crictl ps -a --quiet 2>/dev/null" | tr -s ' ' '\n' | sort -u | tr '\n' ' ')
IMAGES_JSON=$(chroot /host /bin/sh -c "crictl images -o json 2>/dev/null" || echo '{}')
RUNNING=$(chroot /host /bin/sh -c "crictl ps -a --quiet 2>/dev/null" | tr -s ' ' '\n' | sort -u | tr '\n' ' ')
IMAGES_JSON=$(chroot /host /bin/sh -c "crictl images -o json 2>/dev/null" || echo '{}')
SKIP="registry.k8s.io/pause k8s.gcr.io/pause rancher/mirrored-pause"
prune_list=$(printf "%s" "${IMAGES_JSON}" | CUTOFF="${cutoff}" RUNNING="${RUNNING}" SKIP="${SKIP}" python3 - <<'PY'
import json, os, sys, time
prune_list=$(printf "%s" "${IMAGES_JSON}" | CUTOFF="${cutoff}" RUNNING="${RUNNING}" SKIP="${SKIP}" python3 - <<'PY'
import json
import os
import sys
import time
try:
data = json.load(sys.stdin)
@ -74,19 +87,33 @@ for p in prune:
PY
)
if [ -n "${prune_list}" ]; then
printf "%s" "${prune_list}" | while read -r image_id; do
if [ -n "${image_id}" ]; then
chroot /host /bin/sh -c "crictl rmi --prune ${image_id}" || true
fi
done
fi
# Remove every image ID listed in prune_list, reading one ID per line.
if [ -n "${prune_list}" ]; then
  # prune_list comes from a command substitution, which strips the trailing
  # newline. `read` returns non-zero on an unterminated final line, so without
  # restoring the newline here the last (or only) image ID would be skipped.
  printf "%s\n" "${prune_list}" | while read -r image_id; do
    if [ -n "${image_id}" ]; then
      # Best effort: a failed removal must not abort the sweep under `set -e`.
      # NOTE(review): crictl's --prune flag appears to select all unused
      # images regardless of the ID passed; confirm that combination is intended.
      chroot /host /bin/sh -c "crictl rmi --prune ${image_id}" || true
    fi
  done
fi
find /host/var/lib/rancher/k3s/agent/images -type f -name "*.tar" -mtime +7 -print -delete 2>/dev/null || true
find /host/var/lib/rancher/k3s/agent/containerd -maxdepth 1 -type f -mtime +7 -print -delete 2>/dev/null || true
find /host/var/lib/rancher/k3s/agent/images -type f -name "*.tar" -mtime +7 -print -delete 2>/dev/null || true
find /host/var/lib/rancher/k3s/agent/containerd -maxdepth 1 -type f -mtime +7 -print -delete 2>/dev/null || true
# Extra cleanup that runs only when the usage value read earlier in this sweep
# is non-empty and at or above EMERGENCY_USAGE_PERCENT. Each step is best
# effort: failures are silenced so they cannot abort the sweep.
if [ -n "${usage}" ] && [ "${usage}" -ge "${EMERGENCY_USAGE_PERCENT}" ]; then
# Emergency pass for rootfs pressure on SD-backed nodes.
# Ask journald on the host to shrink its archived journals to JOURNAL_MAX_SIZE.
chroot /host /bin/sh -c "journalctl --vacuum-size='${JOURNAL_MAX_SIZE}' >/dev/null 2>&1 || true"
# Delete compressed logs and pod logs whose modification time is older than
# LOG_RETENTION_DAYS days.
find /host/var/log -type f -name "*.gz" -mtime +"${LOG_RETENTION_DAYS}" -print -delete 2>/dev/null || true
find /host/var/log/pods -type f -name "*.log" -mtime +"${LOG_RETENTION_DAYS}" -print -delete 2>/dev/null || true
# Run `apt-get clean` on the host, but only where apt-get exists.
chroot /host /bin/sh -c "if command -v apt-get >/dev/null 2>&1; then apt-get clean >/dev/null 2>&1 || true; fi"
fi
}
sweep_once
if [ "${ONE_SHOT}" = "true" ]; then
exit 0
fi
sleep infinity
while true; do
sleep "${SWEEP_INTERVAL_SEC}"
sweep_once
done

View File

@ -1125,7 +1125,7 @@
{
"id": 17,
"type": "stat",
"title": "Ariadne CI Coverage (%)",
"title": "Ariadne + Metis CI Coverage (%)",
"datasource": {
"type": "prometheus",
"uid": "atlas-vm"
@ -1138,7 +1138,7 @@
},
"targets": [
{
"expr": "ariadne_ci_coverage_percent{repo=\"ariadne\"}",
"expr": "ariadne_ci_coverage_percent{repo=~\"ariadne|metis\"}",
"refId": "A",
"legendFormat": "{{branch}}",
"instant": true
@ -1188,7 +1188,7 @@
{
"id": 18,
"type": "table",
"title": "Ariadne CI Tests (latest)",
"title": "Ariadne + Metis CI Tests (latest)",
"datasource": {
"type": "prometheus",
"uid": "atlas-vm"
@ -1201,7 +1201,7 @@
},
"targets": [
{
"expr": "ariadne_ci_tests_total{repo=\"ariadne\"}",
"expr": "ariadne_ci_tests_total{repo=~\"ariadne|metis\"}",
"refId": "A",
"instant": true
}

View File

@ -1677,7 +1677,7 @@
{
"id": 42,
"type": "timeseries",
"title": "Ariadne Test Success Rate",
"title": "Ariadne + Metis Test Success Rate",
"datasource": {
"type": "prometheus",
"uid": "atlas-vm"
@ -1690,7 +1690,7 @@
},
"targets": [
{
"expr": "100 * sum(max_over_time(ariadne_ci_tests_total{repo=\"ariadne\",result=\"passed\"}[30d])) / clamp_min(sum(max_over_time(ariadne_ci_tests_total{repo=\"ariadne\",result=~\"passed|failed|error\"}[30d])), 1)",
"expr": "100 * sum(max_over_time(ariadne_ci_tests_total{repo=~\"ariadne|metis\",result=\"passed\"}[30d])) / clamp_min(sum(max_over_time(ariadne_ci_tests_total{repo=~\"ariadne|metis\",result=~\"passed|failed|error\"}[30d])), 1)",
"refId": "A"
}
],
@ -1714,7 +1714,7 @@
{
"id": 43,
"type": "bargauge",
"title": "Tests with Failures (24h)",
"title": "Ariadne + Metis Tests with Failures (24h)",
"datasource": {
"type": "prometheus",
"uid": "atlas-vm"
@ -1727,7 +1727,7 @@
},
"targets": [
{
"expr": "sort_desc(sum by (result) (max_over_time(ariadne_ci_tests_total{repo=\"ariadne\",result=~\"failed|error\"}[24h])))",
"expr": "sort_desc(sum by (result) (max_over_time(ariadne_ci_tests_total{repo=~\"ariadne|metis\",result=~\"failed|error\"}[24h])))",
"refId": "A",
"legendFormat": "{{result}}",
"instant": true

View File

@ -22,7 +22,24 @@ data:
- orgId: 1
receiver: email-admins
group_by:
- grafana_folder
- alertname
group_wait: 1m
group_interval: 30m
repeat_interval: 12h
routes:
- receiver: email-admins
object_matchers:
- [severity, "=", "critical"]
group_wait: 30s
group_interval: 5m
repeat_interval: 2h
- receiver: email-admins
object_matchers:
- [severity, "=", "warning"]
group_wait: 5m
group_interval: 2h
repeat_interval: 24h
rules.yaml: |
apiVersion: 1
groups:
@ -32,7 +49,7 @@ data:
interval: 1m
rules:
- uid: disk-pressure-root
title: "Node rootfs high (>80%)"
title: "Node rootfs high (>85%)"
condition: C
for: "10m"
data:
@ -66,7 +83,7 @@ data:
type: threshold
conditions:
- evaluator:
params: [80]
params: [85]
type: gt
operator:
type: and
@ -76,7 +93,7 @@ data:
noDataState: NoData
execErrState: Error
annotations:
summary: "{{ $labels.node }} rootfs >80% for 10m"
summary: "{{ $labels.node }} rootfs >85% for 10m"
labels:
severity: warning
- uid: disk-growth-1h
@ -501,7 +518,7 @@ data:
model:
intervalMs: 60000
maxDataPoints: 43200
expr: postmark_outbound_bounce_rate{window="1d"}
expr: max(postmark_outbound_bounce_rate{window="1d"}) or on() vector(0)
legendFormat: bounce 1d
datasource:
type: prometheus
@ -530,7 +547,7 @@ data:
reducer:
type: last
type: query
noDataState: NoData
noDataState: OK
execErrState: Error
annotations:
summary: "Postmark 1d bounce rate >5%"
@ -549,7 +566,7 @@ data:
model:
intervalMs: 60000
maxDataPoints: 43200
expr: min_over_time(max by (instance) (postmark_api_up)[5m])
expr: max(postmark_api_up) or on() vector(0)
legendFormat: api up
datasource:
type: prometheus
@ -578,7 +595,7 @@ data:
reducer:
type: last
type: query
noDataState: NoData
noDataState: OK
execErrState: Error
annotations:
summary: "Postmark exporter reports API down"

View File

@ -1134,7 +1134,7 @@ data:
{
"id": 17,
"type": "stat",
"title": "Ariadne CI Coverage (%)",
"title": "Ariadne + Metis CI Coverage (%)",
"datasource": {
"type": "prometheus",
"uid": "atlas-vm"
@ -1147,7 +1147,7 @@ data:
},
"targets": [
{
"expr": "ariadne_ci_coverage_percent{repo=\"ariadne\"}",
"expr": "ariadne_ci_coverage_percent{repo=~\"ariadne|metis\"}",
"refId": "A",
"legendFormat": "{{branch}}",
"instant": true
@ -1197,7 +1197,7 @@ data:
{
"id": 18,
"type": "table",
"title": "Ariadne CI Tests (latest)",
"title": "Ariadne + Metis CI Tests (latest)",
"datasource": {
"type": "prometheus",
"uid": "atlas-vm"
@ -1210,7 +1210,7 @@ data:
},
"targets": [
{
"expr": "ariadne_ci_tests_total{repo=\"ariadne\"}",
"expr": "ariadne_ci_tests_total{repo=~\"ariadne|metis\"}",
"refId": "A",
"instant": true
}

View File

@ -1686,7 +1686,7 @@ data:
{
"id": 42,
"type": "timeseries",
"title": "Ariadne Test Success Rate",
"title": "Ariadne + Metis Test Success Rate",
"datasource": {
"type": "prometheus",
"uid": "atlas-vm"
@ -1699,7 +1699,7 @@ data:
},
"targets": [
{
"expr": "100 * sum(max_over_time(ariadne_ci_tests_total{repo=\"ariadne\",result=\"passed\"}[30d])) / clamp_min(sum(max_over_time(ariadne_ci_tests_total{repo=\"ariadne\",result=~\"passed|failed|error\"}[30d])), 1)",
"expr": "100 * sum(max_over_time(ariadne_ci_tests_total{repo=~\"ariadne|metis\",result=\"passed\"}[30d])) / clamp_min(sum(max_over_time(ariadne_ci_tests_total{repo=~\"ariadne|metis\",result=~\"passed|failed|error\"}[30d])), 1)",
"refId": "A"
}
],
@ -1723,7 +1723,7 @@ data:
{
"id": 43,
"type": "bargauge",
"title": "Tests with Failures (24h)",
"title": "Ariadne + Metis Tests with Failures (24h)",
"datasource": {
"type": "prometheus",
"uid": "atlas-vm"
@ -1736,7 +1736,7 @@ data:
},
"targets": [
{
"expr": "sort_desc(sum by (result) (max_over_time(ariadne_ci_tests_total{repo=\"ariadne\",result=~\"failed|error\"}[24h])))",
"expr": "sort_desc(sum by (result) (max_over_time(ariadne_ci_tests_total{repo=~\"ariadne|metis\",result=~\"failed|error\"}[24h])))",
"refId": "A",
"legendFormat": "{{result}}",
"instant": true

View File

@ -286,7 +286,7 @@ spec:
podAnnotations:
vault.hashicorp.com/agent-inject: "true"
vault.hashicorp.com/role: "monitoring"
monitoring.bstein.dev/restart-rev: "4"
monitoring.bstein.dev/restart-rev: "6"
vault.hashicorp.com/agent-inject-secret-grafana-env.sh: "kv/data/atlas/monitoring/grafana-admin"
vault.hashicorp.com/agent-inject-template-grafana-env.sh: |
{{ with secret "kv/data/atlas/monitoring/grafana-admin" }}