"""Trigger one-shot Kubernetes Jobs that sweep stale container images from a node."""

from __future__ import annotations

import time
from typing import Any

from ..k8s.client import get_json, post_json
from ..utils.logging import get_logger
from ..settings import settings

logger = get_logger(__name__)

# Seconds to pause between two polls of the Job status.
_POLL_INTERVAL_SEC = 2

# Shell script executed inside the sweeper container (host root mounted at /host).
#
# Steps, in order:
#   1. Pick an age threshold: 14 days, or 3 days when `df` reports /host >= 70% used.
#   2. Turn the threshold into an epoch-seconds cutoff.
#   3. Collect the IDs printed by `crictl ps -a --quiet` (RUNNING) and the image
#      list as JSON (IMAGES_JSON).
#   4. Let an embedded Python program print the IDs of images that are not in
#      RUNNING, carry no tag starting with a SKIP prefix, and whose `createdAt`
#      (integer divided by 1e9 to get seconds) is older than the cutoff.
#   5. Run `crictl rmi --prune <id>` for every printed ID (failures are ignored).
#   6. Delete *.tar files older than 7 days under the k3s agent image directory
#      and top-level files older than 7 days in the k3s containerd directory.
#   7. Exit when ONE_SHOT=true, otherwise sleep forever.
#
# Plumbing details that are easy to break:
#   * THRESHOLD_DAYS is handed to the cutoff helper through its environment; a
#     plain shell variable is not visible to a child process.
#   * The selection program is run with `python3 -c` so that stdin stays
#     attached to the pipe carrying IMAGES_JSON. With `python3 - <<HEREDOC` the
#     here-document replaces the pipe and the JSON is never read.
#   * IDs are fed to the `while read` loop with a trailing newline; without it
#     `read` reports EOF on the last line and that ID is skipped.
#
# NOTE(review): RUNNING holds what `crictl ps --quiet` prints (container IDs),
# yet it is compared against image IDs - presumably that filter never matches;
# confirm and switch to image IDs if so.
# NOTE(review): `crictl rmi --prune` presumably removes every unused image, not
# only the ID passed on the command line - confirm this is the intended scope.
_IMAGE_SWEEPER_SCRIPT = """
set -eu

ONE_SHOT=${ONE_SHOT:-false}
THRESHOLD_DAYS=14

usage=$(df -P /host | awk 'NR==2 {gsub(/%/,"",$5); print $5}') || usage=""
if [ -n "${usage}" ] && [ "${usage}" -ge 70 ]; then
  THRESHOLD_DAYS=3
fi

cutoff=$(THRESHOLD_DAYS="${THRESHOLD_DAYS}" python3 - <<'PY'
import time, os
print(int(time.time()) - int(os.environ.get("THRESHOLD_DAYS", "14")) * 86400)
PY
)

RUNNING=$(chroot /host /bin/sh -c "crictl ps -a --quiet 2>/dev/null" | tr -s ' ' '\n' | sort -u | tr '\n' ' ')
IMAGES_JSON=$(chroot /host /bin/sh -c "crictl images -o json 2>/dev/null" || echo '{}')

SKIP="registry.k8s.io/pause k8s.gcr.io/pause rancher/mirrored-pause"

prune_program=$(cat <<'PY'
import json, os, sys, time

try:
    data = json.load(sys.stdin)
except Exception:
    print("", end="")
    sys.exit(0)

cutoff = int(os.environ.get("CUTOFF", "0"))
running = set(os.environ.get("RUNNING", "").split())
skip = os.environ.get("SKIP", "").split()
now = int(time.time())
prune = []


def is_skip(tags):
    if not tags:
        return False
    for t in tags:
        for prefix in skip:
            if prefix and t.startswith(prefix):
                return True
    return False


for img in data.get("images", []):
    image_id = img.get("id", "")
    if not image_id:
        continue
    if image_id in running:
        continue
    tags = img.get("repoTags") or []
    if is_skip(tags):
        continue
    created = img.get("createdAt") or 0
    try:
        created = int(str(created)) // 1000000000
    except Exception:
        created = 0
    if created and created > now:
        created = now
    if cutoff and created and created < cutoff:
        prune.append(image_id)

seen = set()
for p in prune:
    if p in seen:
        continue
    seen.add(p)
    print(p)
PY
)

prune_list=$(printf "%s" "${IMAGES_JSON}" | CUTOFF="${cutoff}" RUNNING="${RUNNING}" SKIP="${SKIP}" python3 -c "${prune_program}")

if [ -n "${prune_list}" ]; then
  printf '%s\\n' "${prune_list}" | while read -r image_id; do
    if [ -n "${image_id}" ]; then
      chroot /host /bin/sh -c "crictl rmi --prune ${image_id}" || true
    fi
  done
fi

find /host/var/lib/rancher/k3s/agent/images -type f -name "*.tar" -mtime +7 -print -delete 2>/dev/null || true
find /host/var/lib/rancher/k3s/agent/containerd -maxdepth 1 -type f -mtime +7 -print -delete 2>/dev/null || true

if [ "${ONE_SHOT}" = "true" ]; then
  exit 0
fi

sleep infinity
""".strip()


class ImageSweeperService:
    """Create Kubernetes cleanup jobs that prune stale node images."""

    def _job_payload(self, job_name: str) -> dict[str, Any]:
        """Build the ``batch/v1`` Job manifest that runs the sweeper script.

        Args:
            job_name: Value used for ``metadata.name`` of the Job.

        Returns:
            The Job manifest as a plain dict. Namespace, service account and
            TTL come from ``settings``; the single container is privileged,
            runs as UID 0, mounts the host root at ``/host`` and executes
            ``_IMAGE_SWEEPER_SCRIPT`` with ``ONE_SHOT=true``.
        """
        container: dict[str, Any] = {
            "name": "image-sweeper",
            "image": "python:3.12.9-alpine3.20",
            "command": ["/bin/sh", "-c"],
            "args": [_IMAGE_SWEEPER_SCRIPT],
            "env": [
                {"name": "ONE_SHOT", "value": "true"},
            ],
            "securityContext": {"privileged": True, "runAsUser": 0},
            "volumeMounts": [
                {"name": "host-root", "mountPath": "/host"},
            ],
        }
        pod_spec: dict[str, Any] = {
            "serviceAccountName": settings.image_sweeper_service_account,
            "restartPolicy": "OnFailure",
            "nodeSelector": {
                "kubernetes.io/os": "linux",
                "kubernetes.io/arch": "arm64",
                "node-role.kubernetes.io/worker": "true",
            },
            "tolerations": [
                {
                    "key": "node-role.kubernetes.io/control-plane",
                    "operator": "Exists",
                    "effect": "NoSchedule",
                },
                {
                    "key": "node-role.kubernetes.io/master",
                    "operator": "Exists",
                    "effect": "NoSchedule",
                },
            ],
            "containers": [container],
            "volumes": [
                {"name": "host-root", "hostPath": {"path": "/"}},
            ],
        }
        return {
            "apiVersion": "batch/v1",
            "kind": "Job",
            "metadata": {
                "name": job_name,
                "namespace": settings.image_sweeper_namespace,
                "labels": {
                    "app": "image-sweeper",
                    "atlas.bstein.dev/trigger": "ariadne",
                },
            },
            "spec": {
                "backoffLimit": 0,
                "ttlSecondsAfterFinished": settings.image_sweeper_job_ttl_sec,
                "template": {"spec": pod_spec},
            },
        }

    def _wait_for_completion(self, job_name: str, timeout_sec: float) -> dict[str, Any]:
        """Poll the Job until it succeeds, fails, or the timeout elapses.

        Args:
            job_name: Name of the Job inside ``settings.image_sweeper_namespace``.
            timeout_sec: Maximum number of seconds to keep polling.

        Returns:
            ``{"job": job_name, "status": s}`` where ``s`` is ``"ok"`` once
            ``status.succeeded`` is positive, ``"error"`` once ``status.failed``
            is positive, and ``"running"`` when the deadline passes first.
        """
        job_path = f"/apis/batch/v1/namespaces/{settings.image_sweeper_namespace}/jobs/{job_name}"
        # A monotonic clock keeps the deadline immune to wall-clock adjustments.
        deadline = time.monotonic() + timeout_sec
        while time.monotonic() < deadline:
            job = get_json(job_path)
            status = job.get("status") if isinstance(job, dict) else None
            if not isinstance(status, dict):
                # A missing or malformed status is treated as "no result yet".
                status = {}
            if int(status.get("succeeded") or 0) > 0:
                return {"job": job_name, "status": "ok"}
            if int(status.get("failed") or 0) > 0:
                return {"job": job_name, "status": "error"}
            time.sleep(_POLL_INTERVAL_SEC)
        return {"job": job_name, "status": "running"}

    def run(self, wait: bool = True) -> dict[str, Any]:
        """Create a sweeper Job and optionally wait for its outcome.

        Args:
            wait: When true, poll the Job for up to
                ``settings.image_sweeper_wait_timeout_sec`` seconds.

        Returns:
            ``{"job": name, "status": "ok"}`` after a successful wait, or
            ``{"job": name, "status": "queued"}`` when ``wait`` is false.

        Raises:
            RuntimeError: If ``wait`` is true and the Job ends in any state
                other than ``"ok"`` (failed, or still running at the timeout).
        """
        # The epoch-second suffix gives each trigger its own Job name.
        job_name = f"image-sweeper-{int(time.time())}"
        created = post_json(
            f"/apis/batch/v1/namespaces/{settings.image_sweeper_namespace}/jobs",
            self._job_payload(job_name),
        )

        # Use the name echoed back by the API; fall back to the requested one
        # when the response is not shaped as expected.
        metadata = created.get("metadata") if isinstance(created, dict) else None
        name = (metadata.get("name") if isinstance(metadata, dict) else None) or job_name

        logger.info(
            "image sweeper job triggered",
            extra={"event": "image_sweeper_trigger", "job": name},
        )

        if not wait:
            return {"job": name, "status": "queued"}

        result = self._wait_for_completion(name, settings.image_sweeper_wait_timeout_sec)
        status = result.get("status")
        if status != "ok":
            logger.error(
                "image sweeper job incomplete",
                extra={"event": "image_sweeper_incomplete", "job": name, "status": status},
            )
            raise RuntimeError(f"image sweeper job {name} {status}")
        return result


image_sweeper = ImageSweeperService()