From 88f0cd7f1dad8a5e4f6de3dcf805ba5c72bbe456 Mon Sep 17 00:00:00 2001
From: Brad Stein
Date: Mon, 30 Mar 2026 18:36:53 -0300
Subject: [PATCH] maintenance: run node image sweeper continuously

---
Review notes (text between the "---" marker and the first "diff --git" line is
dropped by `git am`):

- Line breaks and indentation were restored from a whitespace-collapsed copy.
  Hunk line counts agree with the hunk headers and the diffstat below, but the
  YAML nesting depth and the shell indent width are assumed -- confirm against
  the tree before applying.
- Changed versus the original submission: the prune loop is now fed by
  `printf '%s\n'`. With `printf "%s"` the final image ID had no trailing
  newline, `read` returned non-zero at end of input, and the last (or only)
  entry of the prune list was silently skipped.
- The post-image blob id on the script's `index` line predates that one-line
  change.
- NOTE(review): `crictl rmi --prune` is described as removing all unused
  images; confirm that passing an explicit image ID alongside it still honours
  the age cutoff computed by the script.

 .../node-image-sweeper-daemonset.yaml         | 15 ++++
 .../maintenance/scripts/node_image_sweeper.sh | 75 +++++++++++++------
 2 files changed, 66 insertions(+), 24 deletions(-)

diff --git a/services/maintenance/node-image-sweeper-daemonset.yaml b/services/maintenance/node-image-sweeper-daemonset.yaml
index c3cb24d7..4451432b 100644
--- a/services/maintenance/node-image-sweeper-daemonset.yaml
+++ b/services/maintenance/node-image-sweeper-daemonset.yaml
@@ -29,6 +29,21 @@ spec:
         - name: node-image-sweeper
           image: python:3.12.9-alpine3.20
           command: ["/bin/sh", "/scripts/node_image_sweeper.sh"]
+          env:
+            - name: SWEEP_INTERVAL_SEC
+              value: "21600"
+            - name: HIGH_USAGE_PERCENT
+              value: "70"
+            - name: EMERGENCY_USAGE_PERCENT
+              value: "80"
+            - name: BASE_THRESHOLD_DAYS
+              value: "14"
+            - name: HIGH_USAGE_THRESHOLD_DAYS
+              value: "3"
+            - name: LOG_RETENTION_DAYS
+              value: "7"
+            - name: JOURNAL_MAX_SIZE
+              value: "200M"
           securityContext:
             privileged: true
             runAsUser: 0
diff --git a/services/maintenance/scripts/node_image_sweeper.sh b/services/maintenance/scripts/node_image_sweeper.sh
index 2ad7b47d..c2fb6da1 100644
--- a/services/maintenance/scripts/node_image_sweeper.sh
+++ b/services/maintenance/scripts/node_image_sweeper.sh
@@ -2,26 +2,39 @@
 set -eu
 
 ONE_SHOT=${ONE_SHOT:-false}
-THRESHOLD_DAYS=14
+SWEEP_INTERVAL_SEC=${SWEEP_INTERVAL_SEC:-21600}
+BASE_THRESHOLD_DAYS=${BASE_THRESHOLD_DAYS:-14}
+HIGH_USAGE_THRESHOLD_DAYS=${HIGH_USAGE_THRESHOLD_DAYS:-3}
+HIGH_USAGE_PERCENT=${HIGH_USAGE_PERCENT:-70}
+EMERGENCY_USAGE_PERCENT=${EMERGENCY_USAGE_PERCENT:-85}
+LOG_RETENTION_DAYS=${LOG_RETENTION_DAYS:-7}
+JOURNAL_MAX_SIZE=${JOURNAL_MAX_SIZE:-200M}
+SKIP="registry.k8s.io/pause k8s.gcr.io/pause rancher/mirrored-pause"
 
-usage=$(df -P /host | awk 'NR==2 {gsub(/%/,"",$5); print $5}') || usage=""
-if [ -n "${usage}" ] && [ "${usage}" -ge 70 ]; then
-  THRESHOLD_DAYS=3
-fi
+sweep_once() {
+  usage=$(df -P /host | awk 'NR==2 {gsub(/%/,"",$5); print $5}') || usage=""
+  threshold_days="${BASE_THRESHOLD_DAYS}"
+  if [ -n "${usage}" ] && [ "${usage}" -ge "${HIGH_USAGE_PERCENT}" ]; then
+    threshold_days="${HIGH_USAGE_THRESHOLD_DAYS}"
+  fi
 
-cutoff=$(python3 - <<'PY'
-import time, os
-print(int(time.time()) - int(os.environ.get("THRESHOLD_DAYS", "14")) * 86400)
+  cutoff=$(THRESHOLD_DAYS="${threshold_days}" python3 - <<'PY'
+import os
+import time
+
+days = int(os.environ.get("THRESHOLD_DAYS", "14"))
+print(int(time.time()) - days * 86400)
 PY
 )
 
-RUNNING=$(chroot /host /bin/sh -c "crictl ps -a --quiet 2>/dev/null" | tr -s ' ' '\n' | sort -u | tr '\n' ' ')
-IMAGES_JSON=$(chroot /host /bin/sh -c "crictl images -o json 2>/dev/null" || echo '{}')
+  RUNNING=$(chroot /host /bin/sh -c "crictl ps -a --quiet 2>/dev/null" | tr -s ' ' '\n' | sort -u | tr '\n' ' ')
+  IMAGES_JSON=$(chroot /host /bin/sh -c "crictl images -o json 2>/dev/null" || echo '{}')
 
-SKIP="registry.k8s.io/pause k8s.gcr.io/pause rancher/mirrored-pause"
-
-prune_list=$(printf "%s" "${IMAGES_JSON}" | CUTOFF="${cutoff}" RUNNING="${RUNNING}" SKIP="${SKIP}" python3 - <<'PY'
-import json, os, sys, time
+  prune_list=$(printf "%s" "${IMAGES_JSON}" | CUTOFF="${cutoff}" RUNNING="${RUNNING}" SKIP="${SKIP}" python3 - <<'PY'
+import json
+import os
+import sys
+import time
 
 try:
     data = json.load(sys.stdin)
@@ -74,19 +87,33 @@ for p in prune:
 PY
 )
 
-if [ -n "${prune_list}" ]; then
-  printf "%s" "${prune_list}" | while read -r image_id; do
-    if [ -n "${image_id}" ]; then
-      chroot /host /bin/sh -c "crictl rmi --prune ${image_id}" || true
-    fi
-  done
-fi
+  if [ -n "${prune_list}" ]; then
+    printf '%s\n' "${prune_list}" | while read -r image_id; do
+      if [ -n "${image_id}" ]; then
+        chroot /host /bin/sh -c "crictl rmi --prune ${image_id}" || true
+      fi
+    done
+  fi
 
-find /host/var/lib/rancher/k3s/agent/images -type f -name "*.tar" -mtime +7 -print -delete 2>/dev/null || true
-find /host/var/lib/rancher/k3s/agent/containerd -maxdepth 1 -type f -mtime +7 -print -delete 2>/dev/null || true
+  find /host/var/lib/rancher/k3s/agent/images -type f -name "*.tar" -mtime +7 -print -delete 2>/dev/null || true
+  find /host/var/lib/rancher/k3s/agent/containerd -maxdepth 1 -type f -mtime +7 -print -delete 2>/dev/null || true
+
+  if [ -n "${usage}" ] && [ "${usage}" -ge "${EMERGENCY_USAGE_PERCENT}" ]; then
+    # Emergency pass for rootfs pressure on SD-backed nodes.
+    chroot /host /bin/sh -c "journalctl --vacuum-size='${JOURNAL_MAX_SIZE}' >/dev/null 2>&1 || true"
+    find /host/var/log -type f -name "*.gz" -mtime +"${LOG_RETENTION_DAYS}" -print -delete 2>/dev/null || true
+    find /host/var/log/pods -type f -name "*.log" -mtime +"${LOG_RETENTION_DAYS}" -print -delete 2>/dev/null || true
+    chroot /host /bin/sh -c "if command -v apt-get >/dev/null 2>&1; then apt-get clean >/dev/null 2>&1 || true; fi"
+  fi
+}
+
+sweep_once
 
 if [ "${ONE_SHOT}" = "true" ]; then
   exit 0
 fi
 
-sleep infinity
+while true; do
+  sleep "${SWEEP_INTERVAL_SEC}"
+  sweep_once
+done