212 lines
6.9 KiB
Python
212 lines
6.9 KiB
Python
from __future__ import annotations
|
|
|
|
import time
|
|
from typing import Any
|
|
|
|
from ..k8s.client import get_json, post_json
|
|
from ..utils.logging import get_logger
|
|
from ..settings import settings
|
|
|
|
|
|
# Module-level logger, named after this module via the project's logging helper.
logger = get_logger(__name__)
|
|
|
|
|
|
# POSIX shell program executed inside the sweeper container (host root is
# expected at /host).  Steps:
#   1. Pick an age threshold: 14 days, or 3 days when `df` reports /host at
#      70% usage or more.
#   2. Convert the threshold into an epoch-seconds cutoff.
#   3. Feed `crictl images -o json` to an embedded Python filter that prints
#      the IDs of images older than the cutoff (skipping pause images).
#   4. Call `crictl rmi --prune <id>` for each printed ID.
#   5. Delete *.tar files older than 7 days from the k3s agent images dir and
#      stale top-level files from the k3s containerd dir.
#   6. Exit when ONE_SHOT=true, otherwise sleep forever.
#
# Implementation notes:
#   * THRESHOLD_DAYS is passed to the cutoff helper as an environment prefix;
#     a plain shell assignment is not exported, so the helper would otherwise
#     always fall back to its default of 14.
#   * The filter program is captured with `cat <<'PY'` and run via
#     `python3 -c` so that stdin stays connected to the piped JSON.  Using
#     `python3 - <<'PY'` on the right side of a pipe replaces stdin with the
#     heredoc and the JSON is never read.
#   * The prune list is printed with a trailing newline; `read` reports
#     failure on a final line without one and the loop would skip that ID.
#   * This literal is not a raw string: `\n` in the `tr` arguments becomes a
#     real newline (still valid inside shell single quotes), while `\\n`
#     reaches printf as backslash-n.
#
# NOTE(review): `crictl ps --quiet` prints container IDs, but RUNNING is
#   compared against image IDs in the filter - presumably never matches; confirm.
# NOTE(review): `crictl rmi --prune` appears to act on every unused image
#   regardless of the ID argument - confirm against the crictl documentation.
# NOTE(review): the filter relies on a `createdAt` field in the image list;
#   verify that `crictl images -o json` actually emits it.
_IMAGE_SWEEPER_SCRIPT = """
set -eu

ONE_SHOT=${ONE_SHOT:-false}
THRESHOLD_DAYS=14

usage=$(df -P /host | awk 'NR==2 {gsub(/%/,"",$5); print $5}') || usage=""
if [ -n "${usage}" ] && [ "${usage}" -ge 70 ]; then
  THRESHOLD_DAYS=3
fi

cutoff=$(THRESHOLD_DAYS="${THRESHOLD_DAYS}" python3 - <<'PY'
import time, os
print(int(time.time()) - int(os.environ.get("THRESHOLD_DAYS", "14")) * 86400)
PY
)

RUNNING=$(chroot /host /bin/sh -c "crictl ps -a --quiet 2>/dev/null" | tr -s ' ' '\n' | sort -u | tr '\n' ' ')
IMAGES_JSON=$(chroot /host /bin/sh -c "crictl images -o json 2>/dev/null" || echo '{}')

SKIP="registry.k8s.io/pause k8s.gcr.io/pause rancher/mirrored-pause"

PRUNE_FILTER=$(cat <<'PY'
import json, os, sys, time

try:
    data = json.load(sys.stdin)
except Exception:
    print("", end="")
    sys.exit(0)

cutoff = int(os.environ.get("CUTOFF", "0"))
running = set(os.environ.get("RUNNING", "").split())
skip = os.environ.get("SKIP", "").split()
now = int(time.time())
prune = []


def is_skip(tags):
    if not tags:
        return False
    for t in tags:
        for prefix in skip:
            if prefix and t.startswith(prefix):
                return True
    return False


for img in data.get("images", []):
    image_id = img.get("id", "")
    if not image_id:
        continue
    if image_id in running:
        continue
    tags = img.get("repoTags") or []
    if is_skip(tags):
        continue
    created = img.get("createdAt") or 0
    try:
        created = int(str(created)) // 1000000000
    except Exception:
        created = 0
    if created and created > now:
        created = now
    if cutoff and created and created < cutoff:
        prune.append(image_id)

seen = set()
for p in prune:
    if p in seen:
        continue
    seen.add(p)
    print(p)
PY
)

prune_list=$(printf "%s" "${IMAGES_JSON}" | CUTOFF="${cutoff}" RUNNING="${RUNNING}" SKIP="${SKIP}" python3 -c "${PRUNE_FILTER}")

if [ -n "${prune_list}" ]; then
  printf "%s\\n" "${prune_list}" | while read -r image_id; do
    if [ -n "${image_id}" ]; then
      chroot /host /bin/sh -c "crictl rmi --prune ${image_id}" || true
    fi
  done
fi

find /host/var/lib/rancher/k3s/agent/images -type f -name "*.tar" -mtime +7 -print -delete 2>/dev/null || true
find /host/var/lib/rancher/k3s/agent/containerd -maxdepth 1 -type f -mtime +7 -print -delete 2>/dev/null || true

if [ "${ONE_SHOT}" = "true" ]; then
  exit 0
fi

sleep infinity
""".strip()
|
|
|
|
|
|
class ImageSweeperService:
    """Create Kubernetes cleanup jobs that prune stale node images."""

    def _job_payload(self, job_name: str) -> dict[str, Any]:
        """Build the ``batch/v1`` Job manifest for one sweeper run.

        The single container runs ``_IMAGE_SWEEPER_SCRIPT`` through
        ``/bin/sh -c`` with ``ONE_SHOT=true``, privileged and as UID 0, with
        the node's root filesystem mounted at ``/host``.  Namespace, service
        account and TTL come from ``settings``.

        Args:
            job_name: Value for ``metadata.name`` of the Job.

        Returns:
            The Job manifest as a plain dictionary.
        """
        job: dict[str, Any] = {
            "apiVersion": "batch/v1",
            "kind": "Job",
            "metadata": {
                "name": job_name,
                "namespace": settings.image_sweeper_namespace,
                "labels": {
                    "app": "image-sweeper",
                    "atlas.bstein.dev/trigger": "ariadne",
                },
            },
            "spec": {
                "backoffLimit": 0,
                "ttlSecondsAfterFinished": settings.image_sweeper_job_ttl_sec,
                "template": {
                    "spec": {
                        "serviceAccountName": settings.image_sweeper_service_account,
                        "restartPolicy": "OnFailure",
                        "nodeSelector": {
                            "kubernetes.io/os": "linux",
                            "kubernetes.io/arch": "arm64",
                            "node-role.kubernetes.io/worker": "true",
                        },
                        "tolerations": [
                            {
                                "key": "node-role.kubernetes.io/control-plane",
                                "operator": "Exists",
                                "effect": "NoSchedule",
                            },
                            {
                                "key": "node-role.kubernetes.io/master",
                                "operator": "Exists",
                                "effect": "NoSchedule",
                            },
                        ],
                        "containers": [
                            {
                                "name": "image-sweeper",
                                "image": "python:3.12.9-alpine3.20",
                                "command": ["/bin/sh", "-c"],
                                "args": [_IMAGE_SWEEPER_SCRIPT],
                                "env": [
                                    {"name": "ONE_SHOT", "value": "true"},
                                ],
                                "securityContext": {"privileged": True, "runAsUser": 0},
                                "volumeMounts": [
                                    {"name": "host-root", "mountPath": "/host"},
                                ],
                            }
                        ],
                        "volumes": [
                            {"name": "host-root", "hostPath": {"path": "/"}},
                        ],
                    }
                },
            },
        }
        return job

    def _wait_for_completion(self, job_name: str, timeout_sec: float) -> dict[str, Any]:
        """Poll the Job every two seconds until it finishes or time runs out.

        Args:
            job_name: Name of the Job to poll in the sweeper namespace.
            timeout_sec: Maximum number of seconds to keep polling.

        Returns:
            ``{"job": job_name, "status": s}`` where ``s`` is ``"ok"`` once
            ``status.succeeded`` is positive, ``"error"`` once
            ``status.failed`` is positive, or ``"running"`` if neither
            happened before the deadline.
        """
        path = f"/apis/batch/v1/namespaces/{settings.image_sweeper_namespace}/jobs/{job_name}"
        # Monotonic clock: wall-clock adjustments must not stretch or cut the wait.
        deadline = time.monotonic() + timeout_sec
        while time.monotonic() < deadline:
            job = get_json(path)
            status = job.get("status") if isinstance(job, dict) else None
            if not isinstance(status, dict):
                # Status may be absent or malformed; treat as "nothing reported yet".
                status = {}
            if int(status.get("succeeded") or 0) > 0:
                return {"job": job_name, "status": "ok"}
            if int(status.get("failed") or 0) > 0:
                return {"job": job_name, "status": "error"}
            time.sleep(2)
        return {"job": job_name, "status": "running"}

    def run(self, wait: bool = True) -> dict[str, Any]:
        """Create a sweeper Job and optionally wait for its outcome.

        Args:
            wait: When true, poll the Job for up to
                ``settings.image_sweeper_wait_timeout_sec`` seconds.

        Returns:
            ``{"job": name, "status": "ok"}`` after a successful wait, or
            ``{"job": name, "status": "queued"}`` when ``wait`` is false.

        Raises:
            RuntimeError: If waiting ends with any status other than ``"ok"``
                (the Job failed or was still running at the deadline).
        """
        job_name = f"image-sweeper-{int(time.time())}"
        payload = self._job_payload(job_name)
        created = post_json(
            f"/apis/batch/v1/namespaces/{settings.image_sweeper_namespace}/jobs",
            payload,
        )
        # Prefer the name echoed back by the API; fall back to the requested
        # name when the response is not a dict or its metadata is missing/null.
        metadata = created.get("metadata") if isinstance(created, dict) else None
        name = (metadata.get("name") if isinstance(metadata, dict) else None) or job_name
        logger.info(
            "image sweeper job triggered",
            extra={"event": "image_sweeper_trigger", "job": name},
        )
        if not wait:
            return {"job": name, "status": "queued"}

        result = self._wait_for_completion(name, settings.image_sweeper_wait_timeout_sec)
        status = result.get("status")
        if status != "ok":
            logger.error(
                "image sweeper job incomplete",
                extra={"event": "image_sweeper_incomplete", "job": name, "status": status},
            )
            raise RuntimeError(f"image sweeper job {name} {status}")
        return result
|
|
|
|
|
|
# Module-level service instance, created once at import time.
image_sweeper = ImageSweeperService()
|