#!/bin/sh
# node_image_sweeper.sh — keeps a k3s node's rootfs usage in check via the
# /host bind mount: prunes unreferenced container images when usage crosses
# pressure thresholds, removes orphaned pod log directories, and ages out
# rotated logs, image tarballs and the journal. Runs once, then either exits
# (ONE_SHOT=true) or loops forever.
set -eu

# All tunables can be overridden from the environment.
ONE_SHOT=${ONE_SHOT:-false}                               # "true": single pass, then exit
SWEEP_INTERVAL_SEC=${SWEEP_INTERVAL_SEC:-21600}           # seconds between sweeps (default 6h)
HIGH_USAGE_PERCENT=${HIGH_USAGE_PERCENT:-70}              # rootfs % that triggers image prune
EMERGENCY_USAGE_PERCENT=${EMERGENCY_USAGE_PERCENT:-85}    # rootfs % that triggers emergency pass
LOG_RETENTION_DAYS=${LOG_RETENTION_DAYS:-7}               # max age (days) for log files
ORPHAN_POD_RETENTION_DAYS=${ORPHAN_POD_RETENTION_DAYS:-3} # grace period for orphaned pod dirs
JOURNAL_MAX_SIZE=${JOURNAL_MAX_SIZE:-200M}                # journalctl --vacuum-size target
# Remove per-pod log directories on the HDD log volume whose pod name no
# longer appears under the active log root (/host/var/log/pods) and whose
# mtime is older than ORPHAN_POD_RETENTION_DAYS. Prints each removed path.
# Best-effort: listing/stat failures are swallowed inside the Python helper.
cleanup_orphaned_hdd_pod_logs() {
  if [ ! -d /host/var/log.hdd/pods ]; then
    return 0
  fi

  # Quoted heredoc delimiter: the Python source is passed verbatim; the
  # retention value travels via the environment instead.
  ORPHAN_POD_RETENTION_DAYS="${ORPHAN_POD_RETENTION_DAYS}" python3 - <<'PY'
import os
import shutil
import time

hdd_pods = "/host/var/log.hdd/pods"
active_pods = "/host/var/log/pods"
retention_days = int(os.environ.get("ORPHAN_POD_RETENTION_DAYS", "3"))
cutoff = time.time() - (retention_days * 86400)

# Pod directories still present under the active log root are in use.
try:
    active_names = set(os.listdir(active_pods))
except Exception:
    active_names = set()

try:
    hdd_names = os.listdir(hdd_pods)
except Exception:
    hdd_names = []

for name in hdd_names:
    path = os.path.join(hdd_pods, name)
    if not os.path.isdir(path):
        continue
    if name in active_names:
        continue
    try:
        mtime = os.path.getmtime(path)
    except Exception:
        continue
    if mtime > cutoff:
        # Still inside the grace period; keep it.
        continue
    print(path)
    shutil.rmtree(path, ignore_errors=True)
PY
}
# Delete pod log directories under /host/var/log/pods that have no matching
# kubelet pod directory and are older than ORPHAN_POD_RETENTION_DAYS.
# Echoes every removed path; all filesystem errors are tolerated.
cleanup_orphaned_root_pod_logs() {
  if [ ! -d /host/var/log/pods ] || [ ! -d /host/var/lib/kubelet/pods ]; then
    return 0
  fi
  ORPHAN_POD_RETENTION_DAYS="${ORPHAN_POD_RETENTION_DAYS}" python3 - <<'PY'
import os
import shutil
import time

LOG_ROOT = "/host/var/log/pods"
KUBELET_ROOT = "/host/var/lib/kubelet/pods"

days = int(os.environ.get("ORPHAN_POD_RETENTION_DAYS", "3"))
deadline = time.time() - days * 86400


def entries_of(path):
    """List a directory, treating any failure as 'empty'."""
    try:
        return os.listdir(path)
    except Exception:
        return []


# A pod is live while kubelet still keeps a directory for it.
live = set(entries_of(KUBELET_ROOT))

for entry in entries_of(LOG_ROOT):
    candidate = os.path.join(LOG_ROOT, entry)
    if not os.path.isdir(candidate) or entry in live:
        continue
    try:
        if os.path.getmtime(candidate) > deadline:
            continue
    except Exception:
        continue
    print(candidate)
    shutil.rmtree(candidate, ignore_errors=True)
PY
}
# Perform one maintenance sweep of the node through the /host bind mount.
# Reads HIGH_USAGE_PERCENT, EMERGENCY_USAGE_PERCENT, LOG_RETENTION_DAYS and
# JOURNAL_MAX_SIZE from the environment. Every step is best-effort
# (|| true / 2>/dev/null) so a single failure never aborts the sweep.
sweep_once() {
# Rootfs usage of /host as a bare integer percentage (df -P column 5 with
# the '%' stripped); empty string when df/awk fail.
usage=$(df -P /host | awk 'NR==2 {gsub(/%/,"",$5); print $5}') || usage=""
# crictl image metadata frequently omits createdAt on this cluster; prune by
# runtime reachability whenever rootfs crosses pressure thresholds.
if [ -n "${usage}" ] && [ "${usage}" -ge "${HIGH_USAGE_PERCENT}" ]; then
# chroot into the host so crictl reaches the node's runtime socket.
chroot /host /bin/sh -c "crictl rmi --prune >/dev/null 2>&1 || true"
fi
# Drop pod log directories that no longer belong to a live pod (both the
# HDD log volume and the root log tree).
cleanup_orphaned_hdd_pod_logs
cleanup_orphaned_root_pod_logs
# Age out individual pod log files on the HDD log volume.
if [ -d /host/var/log.hdd/pods ]; then
find /host/var/log.hdd/pods -type f -name "*.log" -mtime +"${LOG_RETENTION_DAYS}" -print -delete 2>/dev/null || true
fi
# -xtype l matches symlinks whose target is gone: dangling container links.
if [ -d /host/var/log.hdd/containers ]; then
find /host/var/log.hdd/containers -xtype l -print -delete 2>/dev/null || true
fi
# Stale k3s image tarballs and loose containerd files older than a week.
find /host/var/lib/rancher/k3s/agent/images -type f -name "*.tar" -mtime +7 -print -delete 2>/dev/null || true
find /host/var/lib/rancher/k3s/agent/containerd -maxdepth 1 -type f -mtime +7 -print -delete 2>/dev/null || true
if [ -n "${usage}" ] && [ "${usage}" -ge "${EMERGENCY_USAGE_PERCENT}" ]; then
# Emergency pass for rootfs pressure on SD-backed nodes.
chroot /host /bin/sh -c "crictl rmi --prune >/dev/null 2>&1 || true"
# Shrink systemd journal to JOURNAL_MAX_SIZE (value expanded here; the
# single quotes are re-parsed by the inner /bin/sh).
chroot /host /bin/sh -c "journalctl --vacuum-size='${JOURNAL_MAX_SIZE}' >/dev/null 2>&1 || true"
# Rotated (.gz) and pod logs past retention, on both log trees.
find /host/var/log -type f -name "*.gz" -mtime +"${LOG_RETENTION_DAYS}" -print -delete 2>/dev/null || true
find /host/var/log/pods -type f -name "*.log" -mtime +"${LOG_RETENTION_DAYS}" -print -delete 2>/dev/null || true
find /host/var/log.hdd -type f -name "*.gz" -mtime +"${LOG_RETENTION_DAYS}" -print -delete 2>/dev/null || true
# apt package cache can be large on Debian-based node images; skipped
# cleanly where apt-get is absent.
chroot /host /bin/sh -c "if command -v apt-get >/dev/null 2>&1; then apt-get clean >/dev/null 2>&1 || true; fi"
fi
}
# Always run one sweep immediately at startup, then either exit (one-shot
# mode, e.g. when run as a Job) or keep sweeping every SWEEP_INTERVAL_SEC
# seconds (long-running DaemonSet mode).
sweep_once

if [ "${ONE_SHOT}" = "true" ]; then
  exit 0
fi

while true; do
  sleep "${SWEEP_INTERVAL_SEC}"
  sweep_once
done