#!/bin/sh
#
# Host disk-pressure sweeper.
#
# Operates on a host filesystem mounted at /host: prunes container images via
# crictl, removes aged pod logs and image tarballs, and, above an emergency
# threshold, additionally vacuums the journal and clears rotated logs.
#
# Configuration (environment variables; all optional):
#   ONE_SHOT                   "true" runs a single sweep and exits;
#                              any other value keeps sweeping in a loop.
#   SWEEP_INTERVAL_SEC         Seconds to sleep between sweeps.
#   HIGH_USAGE_PERCENT         /host usage (percent) at which images are pruned.
#   EMERGENCY_USAGE_PERCENT    /host usage (percent) that triggers the
#                              emergency pass.
#   LOG_RETENTION_DAYS         Age (days) after which log files are deleted.
#   ORPHAN_POD_RETENTION_DAYS  Age (days) after which orphaned pod log
#                              directories are deleted.
#   JOURNAL_MAX_SIZE           Value handed to `journalctl --vacuum-size`.

# -e: stop on unhandled failures; -u: treat unset variables as errors.
set -eu

ONE_SHOT=${ONE_SHOT:-false}
SWEEP_INTERVAL_SEC=${SWEEP_INTERVAL_SEC:-21600}
HIGH_USAGE_PERCENT=${HIGH_USAGE_PERCENT:-70}
EMERGENCY_USAGE_PERCENT=${EMERGENCY_USAGE_PERCENT:-85}
LOG_RETENTION_DAYS=${LOG_RETENTION_DAYS:-7}
ORPHAN_POD_RETENTION_DAYS=${ORPHAN_POD_RETENTION_DAYS:-3}
JOURNAL_MAX_SIZE=${JOURNAL_MAX_SIZE:-200M}

#######################################
# Remove pod log directories that no longer have a matching entry in the
# active pod log directory and whose mtime is older than the retention window.
# Globals:
#   ORPHAN_POD_RETENTION_DAYS (read; falls back to 3 when unset or empty)
# Arguments:
#   $1 - directory holding the offloaded pod logs
#        (optional, default /host/var/log.hdd/pods)
#   $2 - directory holding the active pod logs
#        (optional, default /host/var/log/pods)
# Outputs:
#   One line on stdout per directory selected for removal; warnings on stderr.
# Returns:
#   0 always; the cleanup is best-effort so a failure here cannot abort the
#   calling script under `set -e`.
#######################################
cleanup_orphaned_hdd_pod_logs() {
  if [ ! -d "${1:-/host/var/log.hdd/pods}" ]; then
    return 0
  fi

  # Settings reach the embedded Python through the environment; the heredoc
  # delimiter is quoted so the shell expands nothing inside the program.
  ORPHAN_POD_RETENTION_DAYS="${ORPHAN_POD_RETENTION_DAYS:-3}" \
  SWEEP_HDD_PODS_DIR="${1:-/host/var/log.hdd/pods}" \
  SWEEP_ACTIVE_PODS_DIR="${2:-/host/var/log/pods}" \
    python3 - <<'PY' || printf '%s\n' "warning: orphaned pod log cleanup failed; continuing" >&2
import os
import shutil
import sys
import time

hdd_pods = os.environ["SWEEP_HDD_PODS_DIR"]
active_pods = os.environ["SWEEP_ACTIVE_PODS_DIR"]
retention_days = int(os.environ.get("ORPHAN_POD_RETENTION_DAYS", "3"))
cutoff = time.time() - (retention_days * 86400)

try:
    active_names = set(os.listdir(active_pods))
except FileNotFoundError:
    # No active pod log directory exists, so no pod name counts as active.
    active_names = set()
except OSError as exc:
    # The active set is unknown (permissions, I/O error, ...). Deleting now
    # could remove logs of pods that are still running, so skip this round.
    sys.stderr.write(
        "skipping orphan cleanup: cannot list %s: %s\n" % (active_pods, exc)
    )
    sys.exit(0)

try:
    hdd_names = os.listdir(hdd_pods)
except OSError:
    hdd_names = []

for name in hdd_names:
    path = os.path.join(hdd_pods, name)
    # shutil.rmtree refuses symlinks, so a symlinked directory would only be
    # reported, never removed; leave symlinks alone entirely.
    if os.path.islink(path) or not os.path.isdir(path):
        continue
    if name in active_names:
        continue
    try:
        mtime = os.path.getmtime(path)
    except OSError:
        continue
    if mtime > cutoff:
        continue
    print(path)
    shutil.rmtree(path, ignore_errors=True)
PY

  return 0
}

#######################################
# Run one cleanup pass against the host filesystem mounted at /host.
# Every step is best-effort: a failing step never aborts the sweep.
# Globals:
#   HIGH_USAGE_PERCENT, EMERGENCY_USAGE_PERCENT, LOG_RETENTION_DAYS,
#   JOURNAL_MAX_SIZE (read); usage (written)
# Arguments:
#   None
# Outputs:
#   Paths of deleted files on stdout (from find -print and the orphan cleanup).
# Returns:
#   0
#######################################
sweep_once() {
  # Row 2 of `df -P` is the filesystem line; field 5 is the capacity ("42%").
  usage=$(df -P /host | awk 'NR==2 {gsub(/%/,"",$5); print $5}') || usage=""

  # Anything that is not a plain non-negative integer (empty, "-", ...) would
  # make the numeric tests below error out; treat it as "usage unknown".
  case "${usage}" in
    '' | *[!0-9]*) usage="" ;;
  esac

  # crictl image metadata frequently omits createdAt on this cluster; prune by
  # runtime reachability whenever rootfs crosses pressure thresholds.
  if [ -n "${usage}" ] && [ "${usage}" -ge "${HIGH_USAGE_PERCENT}" ]; then
    # Outer `|| true`: a failing chroot itself must not trip `set -e`.
    chroot /host /bin/sh -c "crictl rmi --prune >/dev/null 2>&1 || true" || true
  fi

  cleanup_orphaned_hdd_pod_logs

  if [ -d /host/var/log.hdd/pods ]; then
    find /host/var/log.hdd/pods -type f -name "*.log" \
      -mtime +"${LOG_RETENTION_DAYS}" -print -delete 2>/dev/null || true
  fi

  if [ -d /host/var/log.hdd/containers ]; then
    # -xtype l matches symlinks whose target no longer exists.
    find /host/var/log.hdd/containers -xtype l -print -delete 2>/dev/null || true
  fi

  find /host/var/lib/rancher/k3s/agent/images -type f -name "*.tar" \
    -mtime +7 -print -delete 2>/dev/null || true
  find /host/var/lib/rancher/k3s/agent/containerd -maxdepth 1 -type f \
    -mtime +7 -print -delete 2>/dev/null || true

  # The emergency decision uses the usage sampled at the start of the sweep.
  if [ -n "${usage}" ] && [ "${usage}" -ge "${EMERGENCY_USAGE_PERCENT}" ]; then
    # Emergency pass for rootfs pressure on SD-backed nodes.
    chroot /host /bin/sh -c "crictl rmi --prune >/dev/null 2>&1 || true" || true
    # The size limit travels as a positional argument so its value is never
    # re-parsed as shell syntax inside the chroot.
    chroot /host /bin/sh -c 'journalctl --vacuum-size="$1" >/dev/null 2>&1 || true' \
      sh "${JOURNAL_MAX_SIZE}" || true
    find /host/var/log -type f -name "*.gz" \
      -mtime +"${LOG_RETENTION_DAYS}" -print -delete 2>/dev/null || true
    find /host/var/log/pods -type f -name "*.log" \
      -mtime +"${LOG_RETENTION_DAYS}" -print -delete 2>/dev/null || true
    find /host/var/log.hdd -type f -name "*.gz" \
      -mtime +"${LOG_RETENTION_DAYS}" -print -delete 2>/dev/null || true
    chroot /host /bin/sh -c "if command -v apt-get >/dev/null 2>&1; then apt-get clean >/dev/null 2>&1 || true; fi" || true
  fi
}

# Entry point: always sweep once right away, then either exit (ONE_SHOT=true)
# or keep sweeping every SWEEP_INTERVAL_SEC seconds.
sweep_once

if [ "${ONE_SHOT}" = "true" ]; then
  exit 0
fi

while true; do
  # A sleep that cannot run (e.g. a malformed interval) would otherwise turn
  # this loop into a busy spin; stop with an explicit message instead.
  sleep "${SWEEP_INTERVAL_SEC}" || {
    printf '%s\n' "error: sleep failed; check SWEEP_INTERVAL_SEC" >&2
    exit 1
  }
  sweep_once
done