#!/bin/sh
# Node disk-pressure sweeper for K3s hosts (runs with the node rootfs mounted
# at /host). Reclaims space from orphaned pod log directories, aged logs,
# stale k3s image tarballs, and — under rootfs pressure — unreferenced
# container images, the journal, and the apt cache.
#
# Tunables (environment):
#   ONE_SHOT                  "true" = single sweep then exit (default: false)
#   SWEEP_INTERVAL_SEC        seconds between sweeps (default: 21600 = 6h)
#   HIGH_USAGE_PERCENT        rootfs %use that triggers image pruning (default: 70)
#   EMERGENCY_USAGE_PERCENT   rootfs %use that triggers the emergency pass (default: 85)
#   LOG_RETENTION_DAYS        age cutoff for rotated/old logs (default: 7)
#   ORPHAN_POD_RETENTION_DAYS age cutoff for orphaned pod log dirs (default: 3)
#   JOURNAL_MAX_SIZE          journalctl --vacuum-size target (default: 200M)
set -eu

ONE_SHOT=${ONE_SHOT:-false}
SWEEP_INTERVAL_SEC=${SWEEP_INTERVAL_SEC:-21600}
HIGH_USAGE_PERCENT=${HIGH_USAGE_PERCENT:-70}
EMERGENCY_USAGE_PERCENT=${EMERGENCY_USAGE_PERCENT:-85}
LOG_RETENTION_DAYS=${LOG_RETENTION_DAYS:-7}
ORPHAN_POD_RETENTION_DAYS=${ORPHAN_POD_RETENTION_DAYS:-3}
JOURNAL_MAX_SIZE=${JOURNAL_MAX_SIZE:-200M}

# prune_orphaned_pod_dirs TARGET_DIR ACTIVE_DIR
#   Remove each directory under TARGET_DIR whose name does not appear in
#   ACTIVE_DIR and whose mtime is older than ORPHAN_POD_RETENTION_DAYS.
#   No-op when TARGET_DIR is absent. If ACTIVE_DIR cannot be listed, every
#   old-enough entry is treated as orphaned (matches historical behavior for
#   the log.hdd sweep; callers that need stronger guarantees must pre-check).
#   Paths are passed via the environment so the heredoc stays literal.
prune_orphaned_pod_dirs() {
  [ -d "$1" ] || return 0
  TARGET_DIR="$1" ACTIVE_DIR="$2" \
    ORPHAN_POD_RETENTION_DAYS="${ORPHAN_POD_RETENTION_DAYS}" python3 - <<'PY'
import os
import shutil
import time

target = os.environ["TARGET_DIR"]
active = os.environ["ACTIVE_DIR"]
retention_days = int(os.environ.get("ORPHAN_POD_RETENTION_DAYS", "3"))
cutoff = time.time() - retention_days * 86400

try:
    active_names = set(os.listdir(active))
except OSError:
    active_names = set()

try:
    candidates = os.listdir(target)
except OSError:
    candidates = []

for name in candidates:
    path = os.path.join(target, name)
    if not os.path.isdir(path):
        continue
    if name in active_names:
        continue
    try:
        mtime = os.path.getmtime(path)
    except OSError:
        continue
    if mtime > cutoff:
        continue
    print(path)
    shutil.rmtree(path, ignore_errors=True)
PY
}

# Pod log dirs relocated to the HDD that no longer have a live counterpart
# under /host/var/log/pods.
cleanup_orphaned_hdd_pod_logs() {
  prune_orphaned_pod_dirs /host/var/log.hdd/pods /host/var/log/pods
}

# Pod log dirs on the rootfs whose pod UID is no longer known to the kubelet.
# The kubelet dir must exist — otherwise an empty listing would look like
# "no active pods" and delete logs for every running pod.
cleanup_orphaned_root_pod_logs() {
  [ -d /host/var/lib/kubelet/pods ] || return 0
  prune_orphaned_pod_dirs /host/var/log/pods /host/var/lib/kubelet/pods
}

sweep_once() {
  # %use of the host rootfs; empty string when df fails (sweeps still run,
  # but the pressure-gated prune steps are skipped).
  usage=$(df -P /host | awk 'NR==2 {gsub(/%/,"",$5); print $5}') || usage=""

  # crictl image metadata frequently omits createdAt on this cluster; prune by
  # runtime reachability whenever rootfs crosses pressure thresholds.
  if [ -n "${usage}" ] && [ "${usage}" -ge "${HIGH_USAGE_PERCENT}" ]; then
    chroot /host /bin/sh -c "crictl rmi --prune >/dev/null 2>&1 || true"
  fi

  cleanup_orphaned_hdd_pod_logs
  cleanup_orphaned_root_pod_logs

  if [ -d /host/var/log.hdd/pods ]; then
    find /host/var/log.hdd/pods -type f -name "*.log" \
      -mtime +"${LOG_RETENTION_DAYS}" -print -delete 2>/dev/null || true
  fi
  if [ -d /host/var/log.hdd/containers ]; then
    # -xtype l: symlinks whose targets are gone (dangling container log links).
    find /host/var/log.hdd/containers -xtype l -print -delete 2>/dev/null || true
  fi

  # Stale airgap image tarballs and leftover containerd top-level files.
  find /host/var/lib/rancher/k3s/agent/images -type f -name "*.tar" \
    -mtime +7 -print -delete 2>/dev/null || true
  find /host/var/lib/rancher/k3s/agent/containerd -maxdepth 1 -type f \
    -mtime +7 -print -delete 2>/dev/null || true

  if [ -n "${usage}" ] && [ "${usage}" -ge "${EMERGENCY_USAGE_PERCENT}" ]; then
    # Emergency pass for rootfs pressure on SD-backed nodes.
    chroot /host /bin/sh -c "crictl rmi --prune >/dev/null 2>&1 || true"
    chroot /host /bin/sh -c "journalctl --vacuum-size='${JOURNAL_MAX_SIZE}' >/dev/null 2>&1 || true"
    find /host/var/log -type f -name "*.gz" \
      -mtime +"${LOG_RETENTION_DAYS}" -print -delete 2>/dev/null || true
    find /host/var/log/pods -type f -name "*.log" \
      -mtime +"${LOG_RETENTION_DAYS}" -print -delete 2>/dev/null || true
    find /host/var/log.hdd -type f -name "*.gz" \
      -mtime +"${LOG_RETENTION_DAYS}" -print -delete 2>/dev/null || true
    chroot /host /bin/sh -c "if command -v apt-get >/dev/null 2>&1; then apt-get clean >/dev/null 2>&1 || true; fi"
  fi
}

sweep_once
if [ "${ONE_SHOT}" = "true" ]; then
  exit 0
fi
while true; do
  sleep "${SWEEP_INTERVAL_SEC}"
  sweep_once
done