maintenance: run node image sweeper continuously
This commit is contained in:
parent
0aeb08d375
commit
a047942b8e
@ -29,6 +29,21 @@ spec:
|
||||
- name: node-image-sweeper
|
||||
image: python:3.12.9-alpine3.20
|
||||
command: ["/bin/sh", "/scripts/node_image_sweeper.sh"]
|
||||
env:
|
||||
- name: SWEEP_INTERVAL_SEC
|
||||
value: "21600"
|
||||
- name: HIGH_USAGE_PERCENT
|
||||
value: "70"
|
||||
- name: EMERGENCY_USAGE_PERCENT
|
||||
value: "80"
|
||||
- name: BASE_THRESHOLD_DAYS
|
||||
value: "14"
|
||||
- name: HIGH_USAGE_THRESHOLD_DAYS
|
||||
value: "3"
|
||||
- name: LOG_RETENTION_DAYS
|
||||
value: "7"
|
||||
- name: JOURNAL_MAX_SIZE
|
||||
value: "200M"
|
||||
securityContext:
|
||||
privileged: true
|
||||
runAsUser: 0
|
||||
|
||||
@ -2,26 +2,39 @@
|
||||
set -eu
|
||||
|
||||
ONE_SHOT=${ONE_SHOT:-false}
|
||||
THRESHOLD_DAYS=14
|
||||
SWEEP_INTERVAL_SEC=${SWEEP_INTERVAL_SEC:-21600}
|
||||
BASE_THRESHOLD_DAYS=${BASE_THRESHOLD_DAYS:-14}
|
||||
HIGH_USAGE_THRESHOLD_DAYS=${HIGH_USAGE_THRESHOLD_DAYS:-3}
|
||||
HIGH_USAGE_PERCENT=${HIGH_USAGE_PERCENT:-70}
|
||||
EMERGENCY_USAGE_PERCENT=${EMERGENCY_USAGE_PERCENT:-85}
|
||||
LOG_RETENTION_DAYS=${LOG_RETENTION_DAYS:-7}
|
||||
JOURNAL_MAX_SIZE=${JOURNAL_MAX_SIZE:-200M}
|
||||
SKIP="registry.k8s.io/pause k8s.gcr.io/pause rancher/mirrored-pause"
|
||||
|
||||
usage=$(df -P /host | awk 'NR==2 {gsub(/%/,"",$5); print $5}') || usage=""
|
||||
if [ -n "${usage}" ] && [ "${usage}" -ge 70 ]; then
|
||||
THRESHOLD_DAYS=3
|
||||
fi
|
||||
sweep_once() {
|
||||
usage=$(df -P /host | awk 'NR==2 {gsub(/%/,"",$5); print $5}') || usage=""
|
||||
threshold_days="${BASE_THRESHOLD_DAYS}"
|
||||
if [ -n "${usage}" ] && [ "${usage}" -ge "${HIGH_USAGE_PERCENT}" ]; then
|
||||
threshold_days="${HIGH_USAGE_THRESHOLD_DAYS}"
|
||||
fi
|
||||
|
||||
cutoff=$(python3 - <<'PY'
|
||||
import time, os
|
||||
print(int(time.time()) - int(os.environ.get("THRESHOLD_DAYS", "14")) * 86400)
|
||||
cutoff=$(THRESHOLD_DAYS="${threshold_days}" python3 - <<'PY'
|
||||
import os
|
||||
import time
|
||||
|
||||
days = int(os.environ.get("THRESHOLD_DAYS", "14"))
|
||||
print(int(time.time()) - days * 86400)
|
||||
PY
|
||||
)
|
||||
|
||||
RUNNING=$(chroot /host /bin/sh -c "crictl ps -a --quiet 2>/dev/null" | tr -s ' ' '\n' | sort -u | tr '\n' ' ')
|
||||
IMAGES_JSON=$(chroot /host /bin/sh -c "crictl images -o json 2>/dev/null" || echo '{}')
|
||||
RUNNING=$(chroot /host /bin/sh -c "crictl ps -a --quiet 2>/dev/null" | tr -s ' ' '\n' | sort -u | tr '\n' ' ')
|
||||
IMAGES_JSON=$(chroot /host /bin/sh -c "crictl images -o json 2>/dev/null" || echo '{}')
|
||||
|
||||
SKIP="registry.k8s.io/pause k8s.gcr.io/pause rancher/mirrored-pause"
|
||||
|
||||
prune_list=$(printf "%s" "${IMAGES_JSON}" | CUTOFF="${cutoff}" RUNNING="${RUNNING}" SKIP="${SKIP}" python3 - <<'PY'
|
||||
import json, os, sys, time
|
||||
prune_list=$(printf "%s" "${IMAGES_JSON}" | CUTOFF="${cutoff}" RUNNING="${RUNNING}" SKIP="${SKIP}" python3 - <<'PY'
|
||||
import json
|
||||
import os
|
||||
import sys
|
||||
import time
|
||||
|
||||
try:
|
||||
data = json.load(sys.stdin)
|
||||
@ -74,19 +87,33 @@ for p in prune:
|
||||
PY
|
||||
)
|
||||
|
||||
# Remove every image ID listed in prune_list (one ID per line).
if [ -n "${prune_list}" ]; then
  # prune_list comes from a command substitution, which strips the trailing
  # newline. Emit one explicitly: without it `read` hits EOF on the final
  # line, returns non-zero, and the last image ID is silently skipped.
  printf '%s\n' "${prune_list}" | while read -r image_id; do
    if [ -n "${image_id}" ]; then
      # Pass the ID to the host shell as a positional parameter rather than
      # splicing it into the command string.
      # NOTE(review): `crictl rmi --prune` appears to remove all unused
      # images, not only the ID given -- confirm the flag is intended.
      # Best effort: one failed removal must not abort the sweep.
      chroot /host /bin/sh -c 'crictl rmi --prune "$1"' sh "${image_id}" || true
    fi
  done
fi
|
||||
# Remove every image ID listed in prune_list (one ID per line).
if [ -n "${prune_list}" ]; then
  # prune_list comes from a command substitution, which strips the trailing
  # newline. Emit one explicitly: without it `read` hits EOF on the final
  # line, returns non-zero, and the last image ID is silently skipped.
  printf '%s\n' "${prune_list}" | while read -r image_id; do
    if [ -n "${image_id}" ]; then
      # Pass the ID to the host shell as a positional parameter rather than
      # splicing it into the command string.
      # NOTE(review): `crictl rmi --prune` appears to remove all unused
      # images, not only the ID given -- confirm the flag is intended.
      # Best effort: one failed removal must not abort the sweep.
      chroot /host /bin/sh -c 'crictl rmi --prune "$1"' sh "${image_id}" || true
    fi
  done
fi
|
||||
|
||||
find /host/var/lib/rancher/k3s/agent/images -type f -name "*.tar" -mtime +7 -print -delete 2>/dev/null || true
|
||||
find /host/var/lib/rancher/k3s/agent/containerd -maxdepth 1 -type f -mtime +7 -print -delete 2>/dev/null || true
|
||||
find /host/var/lib/rancher/k3s/agent/images -type f -name "*.tar" -mtime +7 -print -delete 2>/dev/null || true
|
||||
find /host/var/lib/rancher/k3s/agent/containerd -maxdepth 1 -type f -mtime +7 -print -delete 2>/dev/null || true
|
||||
|
||||
if [ -n "${usage}" ] && [ "${usage}" -ge "${EMERGENCY_USAGE_PERCENT}" ]; then
|
||||
# Emergency pass for rootfs pressure on SD-backed nodes.
|
||||
chroot /host /bin/sh -c "journalctl --vacuum-size='${JOURNAL_MAX_SIZE}' >/dev/null 2>&1 || true"
|
||||
find /host/var/log -type f -name "*.gz" -mtime +"${LOG_RETENTION_DAYS}" -print -delete 2>/dev/null || true
|
||||
find /host/var/log/pods -type f -name "*.log" -mtime +"${LOG_RETENTION_DAYS}" -print -delete 2>/dev/null || true
|
||||
chroot /host /bin/sh -c "if command -v apt-get >/dev/null 2>&1; then apt-get clean >/dev/null 2>&1 || true; fi"
|
||||
fi
|
||||
}
|
||||
|
||||
# Entry point: sweep once immediately, then either exit or keep sweeping.
sweep_once

# ONE_SHOT=true turns the script into a run-once job.
if [ "${ONE_SHOT}" = "true" ]; then
  exit 0
fi

# Continuous mode: repeat the sweep every SWEEP_INTERVAL_SEC seconds.
# The earlier `sleep infinity` is dropped; it blocked forever ahead of this
# loop, so the periodic sweep could never run.
while true; do
  sleep "${SWEEP_INTERVAL_SEC}"
  sweep_once
done
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user