Compare commits

..

No commits in common. "6c3c1342cdc09e1ef7c9e3609022372e7e22972f" and "e93aa6e33b7544dd71eec1e7ad92ca1ce0f36161" have entirely different histories.

9 changed files with 63 additions and 125 deletions

View File

@ -423,17 +423,16 @@ ARIADNE_SCHEDULE_LAST_ERROR_RANGE_HOURS = (
"(time() - max_over_time(ariadne_schedule_last_error_timestamp_seconds[$__range])) / 3600"
)
ARIADNE_ACCESS_REQUESTS = "ariadne_access_requests_total"
TEST_REPO_SELECTOR = 'repo=~"ariadne|metis"'
TEST_CI_COVERAGE = f'ariadne_ci_coverage_percent{{{TEST_REPO_SELECTOR}}}'
TEST_CI_TESTS = f'ariadne_ci_tests_total{{{TEST_REPO_SELECTOR}}}'
TEST_SUCCESS_RATE = (
ARIADNE_CI_COVERAGE = 'ariadne_ci_coverage_percent{repo="ariadne"}'
ARIADNE_CI_TESTS = 'ariadne_ci_tests_total{repo="ariadne"}'
ARIADNE_TEST_SUCCESS_RATE = (
"100 * "
f'sum(max_over_time(ariadne_ci_tests_total{{{TEST_REPO_SELECTOR},result="passed"}}[30d])) '
'sum(max_over_time(ariadne_ci_tests_total{repo="ariadne",result="passed"}[30d])) '
"/ clamp_min("
f'sum(max_over_time(ariadne_ci_tests_total{{{TEST_REPO_SELECTOR},result=~"passed|failed|error"}}[30d])), 1)'
'sum(max_over_time(ariadne_ci_tests_total{repo="ariadne",result=~"passed|failed|error"}[30d])), 1)'
)
TEST_FAILURES_24H = (
f'sum by (result) (max_over_time(ariadne_ci_tests_total{{{TEST_REPO_SELECTOR},result=~"failed|error"}}[24h]))'
ARIADNE_TEST_FAILURES_24H = (
'sum by (result) (max_over_time(ariadne_ci_tests_total{repo="ariadne",result=~"failed|error"}[24h]))'
)
POSTGRES_CONN_USED = (
'label_replace(sum(pg_stat_activity_count), "conn", "used", "__name__", ".*") '
@ -1298,8 +1297,8 @@ def build_overview():
panels.append(
timeseries_panel(
42,
"Ariadne + Metis Test Success Rate",
TEST_SUCCESS_RATE,
"Ariadne Test Success Rate",
ARIADNE_TEST_SUCCESS_RATE,
{"h": 6, "w": 6, "x": 12, "y": 14},
unit="percent",
max_value=100,
@ -1310,8 +1309,8 @@ def build_overview():
panels.append(
bargauge_panel(
43,
"Ariadne + Metis Tests with Failures (24h)",
TEST_FAILURES_24H,
"Tests with Failures (24h)",
ARIADNE_TEST_FAILURES_24H,
{"h": 6, "w": 6, "x": 18, "y": 14},
unit="none",
instant=True,
@ -2657,8 +2656,8 @@ def build_jobs_dashboard():
panels.append(
stat_panel(
17,
"Ariadne + Metis CI Coverage (%)",
TEST_CI_COVERAGE,
"Ariadne CI Coverage (%)",
ARIADNE_CI_COVERAGE,
{"h": 6, "w": 4, "x": 8, "y": 11},
unit="percent",
decimals=1,
@ -2669,8 +2668,8 @@ def build_jobs_dashboard():
panels.append(
table_panel(
18,
"Ariadne + Metis CI Tests (latest)",
TEST_CI_TESTS,
"Ariadne CI Tests (latest)",
ARIADNE_CI_TESTS,
{"h": 6, "w": 12, "x": 12, "y": 11},
unit="none",
transformations=[{"id": "labelsToFields", "options": {}}, {"id": "sortBy", "options": {"fields": ["Value"], "order": "desc"}}],

View File

@ -10,8 +10,6 @@ spec:
app: node-image-sweeper
updateStrategy:
type: RollingUpdate
rollingUpdate:
maxUnavailable: 100%
template:
metadata:
labels:
@ -31,21 +29,6 @@ spec:
- name: node-image-sweeper
image: python:3.12.9-alpine3.20
command: ["/bin/sh", "/scripts/node_image_sweeper.sh"]
env:
- name: SWEEP_INTERVAL_SEC
value: "21600"
- name: HIGH_USAGE_PERCENT
value: "70"
- name: EMERGENCY_USAGE_PERCENT
value: "80"
- name: BASE_THRESHOLD_DAYS
value: "14"
- name: HIGH_USAGE_THRESHOLD_DAYS
value: "3"
- name: LOG_RETENTION_DAYS
value: "7"
- name: JOURNAL_MAX_SIZE
value: "200M"
securityContext:
privileged: true
runAsUser: 0

View File

@ -2,39 +2,26 @@
set -eu
ONE_SHOT=${ONE_SHOT:-false}
SWEEP_INTERVAL_SEC=${SWEEP_INTERVAL_SEC:-21600}
BASE_THRESHOLD_DAYS=${BASE_THRESHOLD_DAYS:-14}
HIGH_USAGE_THRESHOLD_DAYS=${HIGH_USAGE_THRESHOLD_DAYS:-3}
HIGH_USAGE_PERCENT=${HIGH_USAGE_PERCENT:-70}
EMERGENCY_USAGE_PERCENT=${EMERGENCY_USAGE_PERCENT:-85}
LOG_RETENTION_DAYS=${LOG_RETENTION_DAYS:-7}
JOURNAL_MAX_SIZE=${JOURNAL_MAX_SIZE:-200M}
SKIP="registry.k8s.io/pause k8s.gcr.io/pause rancher/mirrored-pause"
THRESHOLD_DAYS=14
sweep_once() {
usage=$(df -P /host | awk 'NR==2 {gsub(/%/,"",$5); print $5}') || usage=""
threshold_days="${BASE_THRESHOLD_DAYS}"
if [ -n "${usage}" ] && [ "${usage}" -ge "${HIGH_USAGE_PERCENT}" ]; then
threshold_days="${HIGH_USAGE_THRESHOLD_DAYS}"
fi
usage=$(df -P /host | awk 'NR==2 {gsub(/%/,"",$5); print $5}') || usage=""
if [ -n "${usage}" ] && [ "${usage}" -ge 70 ]; then
THRESHOLD_DAYS=3
fi
cutoff=$(THRESHOLD_DAYS="${threshold_days}" python3 - <<'PY'
import os
import time
days = int(os.environ.get("THRESHOLD_DAYS", "14"))
print(int(time.time()) - days * 86400)
cutoff=$(python3 - <<'PY'
import time, os
print(int(time.time()) - int(os.environ.get("THRESHOLD_DAYS", "14")) * 86400)
PY
)
RUNNING=$(chroot /host /bin/sh -c "crictl ps -a --quiet 2>/dev/null" | tr -s ' ' '\n' | sort -u | tr '\n' ' ')
IMAGES_JSON=$(chroot /host /bin/sh -c "crictl images -o json 2>/dev/null" || echo '{}')
RUNNING=$(chroot /host /bin/sh -c "crictl ps -a --quiet 2>/dev/null" | tr -s ' ' '\n' | sort -u | tr '\n' ' ')
IMAGES_JSON=$(chroot /host /bin/sh -c "crictl images -o json 2>/dev/null" || echo '{}')
prune_list=$(printf "%s" "${IMAGES_JSON}" | CUTOFF="${cutoff}" RUNNING="${RUNNING}" SKIP="${SKIP}" python3 - <<'PY'
import json
import os
import sys
import time
SKIP="registry.k8s.io/pause k8s.gcr.io/pause rancher/mirrored-pause"
prune_list=$(printf "%s" "${IMAGES_JSON}" | CUTOFF="${cutoff}" RUNNING="${RUNNING}" SKIP="${SKIP}" python3 - <<'PY'
import json, os, sys, time
try:
data = json.load(sys.stdin)
@ -87,33 +74,19 @@ for p in prune:
PY
)
if [ -n "${prune_list}" ]; then
if [ -n "${prune_list}" ]; then
printf "%s" "${prune_list}" | while read -r image_id; do
if [ -n "${image_id}" ]; then
chroot /host /bin/sh -c "crictl rmi --prune ${image_id}" || true
fi
done
fi
fi
find /host/var/lib/rancher/k3s/agent/images -type f -name "*.tar" -mtime +7 -print -delete 2>/dev/null || true
find /host/var/lib/rancher/k3s/agent/containerd -maxdepth 1 -type f -mtime +7 -print -delete 2>/dev/null || true
if [ -n "${usage}" ] && [ "${usage}" -ge "${EMERGENCY_USAGE_PERCENT}" ]; then
# Emergency pass for rootfs pressure on SD-backed nodes.
chroot /host /bin/sh -c "journalctl --vacuum-size='${JOURNAL_MAX_SIZE}' >/dev/null 2>&1 || true"
find /host/var/log -type f -name "*.gz" -mtime +"${LOG_RETENTION_DAYS}" -print -delete 2>/dev/null || true
find /host/var/log/pods -type f -name "*.log" -mtime +"${LOG_RETENTION_DAYS}" -print -delete 2>/dev/null || true
chroot /host /bin/sh -c "if command -v apt-get >/dev/null 2>&1; then apt-get clean >/dev/null 2>&1 || true; fi"
fi
}
sweep_once
find /host/var/lib/rancher/k3s/agent/images -type f -name "*.tar" -mtime +7 -print -delete 2>/dev/null || true
find /host/var/lib/rancher/k3s/agent/containerd -maxdepth 1 -type f -mtime +7 -print -delete 2>/dev/null || true
if [ "${ONE_SHOT}" = "true" ]; then
exit 0
fi
while true; do
sleep "${SWEEP_INTERVAL_SEC}"
sweep_once
done
sleep infinity

View File

@ -1125,7 +1125,7 @@
{
"id": 17,
"type": "stat",
"title": "Ariadne + Metis CI Coverage (%)",
"title": "Ariadne CI Coverage (%)",
"datasource": {
"type": "prometheus",
"uid": "atlas-vm"
@ -1138,7 +1138,7 @@
},
"targets": [
{
"expr": "ariadne_ci_coverage_percent{repo=~\"ariadne|metis\"}",
"expr": "ariadne_ci_coverage_percent{repo=\"ariadne\"}",
"refId": "A",
"legendFormat": "{{branch}}",
"instant": true
@ -1188,7 +1188,7 @@
{
"id": 18,
"type": "table",
"title": "Ariadne + Metis CI Tests (latest)",
"title": "Ariadne CI Tests (latest)",
"datasource": {
"type": "prometheus",
"uid": "atlas-vm"
@ -1201,7 +1201,7 @@
},
"targets": [
{
"expr": "ariadne_ci_tests_total{repo=~\"ariadne|metis\"}",
"expr": "ariadne_ci_tests_total{repo=\"ariadne\"}",
"refId": "A",
"instant": true
}

View File

@ -1677,7 +1677,7 @@
{
"id": 42,
"type": "timeseries",
"title": "Ariadne + Metis Test Success Rate",
"title": "Ariadne Test Success Rate",
"datasource": {
"type": "prometheus",
"uid": "atlas-vm"
@ -1690,7 +1690,7 @@
},
"targets": [
{
"expr": "100 * sum(max_over_time(ariadne_ci_tests_total{repo=~\"ariadne|metis\",result=\"passed\"}[30d])) / clamp_min(sum(max_over_time(ariadne_ci_tests_total{repo=~\"ariadne|metis\",result=~\"passed|failed|error\"}[30d])), 1)",
"expr": "100 * sum(max_over_time(ariadne_ci_tests_total{repo=\"ariadne\",result=\"passed\"}[30d])) / clamp_min(sum(max_over_time(ariadne_ci_tests_total{repo=\"ariadne\",result=~\"passed|failed|error\"}[30d])), 1)",
"refId": "A"
}
],
@ -1714,7 +1714,7 @@
{
"id": 43,
"type": "bargauge",
"title": "Ariadne + Metis Tests with Failures (24h)",
"title": "Tests with Failures (24h)",
"datasource": {
"type": "prometheus",
"uid": "atlas-vm"
@ -1727,7 +1727,7 @@
},
"targets": [
{
"expr": "sort_desc(sum by (result) (max_over_time(ariadne_ci_tests_total{repo=~\"ariadne|metis\",result=~\"failed|error\"}[24h])))",
"expr": "sort_desc(sum by (result) (max_over_time(ariadne_ci_tests_total{repo=\"ariadne\",result=~\"failed|error\"}[24h])))",
"refId": "A",
"legendFormat": "{{result}}",
"instant": true

View File

@ -22,24 +22,7 @@ data:
- orgId: 1
receiver: email-admins
group_by:
- grafana_folder
- alertname
group_wait: 1m
group_interval: 30m
repeat_interval: 12h
routes:
- receiver: email-admins
object_matchers:
- [severity, "=", "critical"]
group_wait: 30s
group_interval: 5m
repeat_interval: 2h
- receiver: email-admins
object_matchers:
- [severity, "=", "warning"]
group_wait: 5m
group_interval: 2h
repeat_interval: 24h
rules.yaml: |
apiVersion: 1
groups:
@ -49,7 +32,7 @@ data:
interval: 1m
rules:
- uid: disk-pressure-root
title: "Node rootfs high (>85%)"
title: "Node rootfs high (>80%)"
condition: C
for: "10m"
data:
@ -83,7 +66,7 @@ data:
type: threshold
conditions:
- evaluator:
params: [85]
params: [80]
type: gt
operator:
type: and
@ -93,7 +76,7 @@ data:
noDataState: NoData
execErrState: Error
annotations:
summary: "{{ $labels.node }} rootfs >85% for 10m"
summary: "{{ $labels.node }} rootfs >80% for 10m"
labels:
severity: warning
- uid: disk-growth-1h
@ -518,7 +501,7 @@ data:
model:
intervalMs: 60000
maxDataPoints: 43200
expr: max(postmark_outbound_bounce_rate{window="1d"}) or on() vector(0)
expr: postmark_outbound_bounce_rate{window="1d"}
legendFormat: bounce 1d
datasource:
type: prometheus
@ -547,7 +530,7 @@ data:
reducer:
type: last
type: query
noDataState: OK
noDataState: NoData
execErrState: Error
annotations:
summary: "Postmark 1d bounce rate >5%"
@ -566,7 +549,7 @@ data:
model:
intervalMs: 60000
maxDataPoints: 43200
expr: max(postmark_api_up) or on() vector(0)
expr: min_over_time(max by (instance) (postmark_api_up)[5m])
legendFormat: api up
datasource:
type: prometheus
@ -595,7 +578,7 @@ data:
reducer:
type: last
type: query
noDataState: OK
noDataState: NoData
execErrState: Error
annotations:
summary: "Postmark exporter reports API down"

View File

@ -1134,7 +1134,7 @@ data:
{
"id": 17,
"type": "stat",
"title": "Ariadne + Metis CI Coverage (%)",
"title": "Ariadne CI Coverage (%)",
"datasource": {
"type": "prometheus",
"uid": "atlas-vm"
@ -1147,7 +1147,7 @@ data:
},
"targets": [
{
"expr": "ariadne_ci_coverage_percent{repo=~\"ariadne|metis\"}",
"expr": "ariadne_ci_coverage_percent{repo=\"ariadne\"}",
"refId": "A",
"legendFormat": "{{branch}}",
"instant": true
@ -1197,7 +1197,7 @@ data:
{
"id": 18,
"type": "table",
"title": "Ariadne + Metis CI Tests (latest)",
"title": "Ariadne CI Tests (latest)",
"datasource": {
"type": "prometheus",
"uid": "atlas-vm"
@ -1210,7 +1210,7 @@ data:
},
"targets": [
{
"expr": "ariadne_ci_tests_total{repo=~\"ariadne|metis\"}",
"expr": "ariadne_ci_tests_total{repo=\"ariadne\"}",
"refId": "A",
"instant": true
}

View File

@ -1686,7 +1686,7 @@ data:
{
"id": 42,
"type": "timeseries",
"title": "Ariadne + Metis Test Success Rate",
"title": "Ariadne Test Success Rate",
"datasource": {
"type": "prometheus",
"uid": "atlas-vm"
@ -1699,7 +1699,7 @@ data:
},
"targets": [
{
"expr": "100 * sum(max_over_time(ariadne_ci_tests_total{repo=~\"ariadne|metis\",result=\"passed\"}[30d])) / clamp_min(sum(max_over_time(ariadne_ci_tests_total{repo=~\"ariadne|metis\",result=~\"passed|failed|error\"}[30d])), 1)",
"expr": "100 * sum(max_over_time(ariadne_ci_tests_total{repo=\"ariadne\",result=\"passed\"}[30d])) / clamp_min(sum(max_over_time(ariadne_ci_tests_total{repo=\"ariadne\",result=~\"passed|failed|error\"}[30d])), 1)",
"refId": "A"
}
],
@ -1723,7 +1723,7 @@ data:
{
"id": 43,
"type": "bargauge",
"title": "Ariadne + Metis Tests with Failures (24h)",
"title": "Tests with Failures (24h)",
"datasource": {
"type": "prometheus",
"uid": "atlas-vm"
@ -1736,7 +1736,7 @@ data:
},
"targets": [
{
"expr": "sort_desc(sum by (result) (max_over_time(ariadne_ci_tests_total{repo=~\"ariadne|metis\",result=~\"failed|error\"}[24h])))",
"expr": "sort_desc(sum by (result) (max_over_time(ariadne_ci_tests_total{repo=\"ariadne\",result=~\"failed|error\"}[24h])))",
"refId": "A",
"legendFormat": "{{result}}",
"instant": true

View File

@ -286,7 +286,7 @@ spec:
podAnnotations:
vault.hashicorp.com/agent-inject: "true"
vault.hashicorp.com/role: "monitoring"
monitoring.bstein.dev/restart-rev: "6"
monitoring.bstein.dev/restart-rev: "4"
vault.hashicorp.com/agent-inject-secret-grafana-env.sh: "kv/data/atlas/monitoring/grafana-admin"
vault.hashicorp.com/agent-inject-template-grafana-env.sh: |
{{ with secret "kv/data/atlas/monitoring/grafana-admin" }}