ariadne/ariadne/services/image_sweeper.py

212 lines
6.9 KiB
Python

from __future__ import annotations
import time
from typing import Any
from ..k8s.client import get_json, post_json
from ..utils.logging import get_logger
from ..settings import settings
# Module-scoped logger obtained from the project's logging helper.
logger = get_logger(__name__)
# Shell program run inside the sweeper container (it is handed to `/bin/sh -c`
# by the Job built in this module). It expects the node's root filesystem at
# /host and does the following:
#
#   1. Picks an age threshold: 14 days, or 3 days when `df` reports /host at
#      70% usage or more.
#   2. Lists containers and images through `crictl` (chrooted into /host) and
#      lets an embedded Python filter print the image IDs created before the
#      cutoff, skipping pause images and any ID present in RUNNING.
#   3. Removes every selected image with `crictl rmi`, best-effort.
#   4. Deletes *.tar files older than 7 days under the k3s agent image
#      directory and top-level files older than 7 days under the k3s agent
#      containerd directory.
#   5. Exits when ONE_SHOT=true, otherwise sleeps forever.
#
# Plumbing details that are easy to break:
#   * THRESHOLD_DAYS is passed explicitly into the environment of the cutoff
#     helper; an unexported shell variable is invisible to child processes, so
#     the helper would otherwise always fall back to 14.
#   * The filter program is stored in PRUNE_PY and run with `python3 -c` so
#     that stdin stays attached to the pipe carrying the image JSON. Feeding
#     the program through a here-document on the same command replaces that
#     pipe and the filter reads an empty stdin.
#   * The prune list is printed with a trailing newline; without it `read`
#     reports EOF on the final entry and the loop body never runs for it.
#
# NOTE(review): `crictl ps -a --quiet` appears to print container IDs, while
#   the filter compares RUNNING against image IDs - confirm whether the
#   in-use check can ever match.
# NOTE(review): `crictl rmi --prune <id>` may remove every unused image rather
#   than only <id> - verify against the crictl version on the nodes.
# NOTE(review): the filter relies on a `createdAt` field (nanoseconds) in
#   `crictl images -o json`; images without it are never selected - confirm
#   the field is present in the runtime's output.
_IMAGE_SWEEPER_SCRIPT = """
set -eu

ONE_SHOT=${ONE_SHOT:-false}
THRESHOLD_DAYS=14

usage=$(df -P /host | awk 'NR==2 {gsub(/%/,"",$5); print $5}') || usage=""
if [ -n "${usage}" ] && [ "${usage}" -ge 70 ]; then
  THRESHOLD_DAYS=3
fi

cutoff=$(THRESHOLD_DAYS="${THRESHOLD_DAYS}" python3 - <<'PY'
import time, os
print(int(time.time()) - int(os.environ.get("THRESHOLD_DAYS", "14")) * 86400)
PY
)

RUNNING=$(chroot /host /bin/sh -c "crictl ps -a --quiet 2>/dev/null" | tr -s ' ' '\n' | sort -u | tr '\n' ' ')
IMAGES_JSON=$(chroot /host /bin/sh -c "crictl images -o json 2>/dev/null" || echo '{}')
SKIP="registry.k8s.io/pause k8s.gcr.io/pause rancher/mirrored-pause"

PRUNE_PY=$(cat <<'PY'
import json, os, sys, time

try:
    data = json.load(sys.stdin)
except Exception:
    print("", end="")
    sys.exit(0)

cutoff = int(os.environ.get("CUTOFF", "0"))
running = set(os.environ.get("RUNNING", "").split())
skip = os.environ.get("SKIP", "").split()
now = int(time.time())
prune = []


def is_skip(tags):
    if not tags:
        return False
    for t in tags:
        for prefix in skip:
            if prefix and t.startswith(prefix):
                return True
    return False


for img in data.get("images", []):
    image_id = img.get("id", "")
    if not image_id:
        continue
    if image_id in running:
        continue
    tags = img.get("repoTags") or []
    if is_skip(tags):
        continue
    created = img.get("createdAt") or 0
    try:
        created = int(str(created)) // 1000000000
    except Exception:
        created = 0
    if created and created > now:
        created = now
    if cutoff and created and created < cutoff:
        prune.append(image_id)

seen = set()
for p in prune:
    if p in seen:
        continue
    seen.add(p)
    print(p)
PY
)

prune_list=$(printf "%s" "${IMAGES_JSON}" | CUTOFF="${cutoff}" RUNNING="${RUNNING}" SKIP="${SKIP}" python3 -c "${PRUNE_PY}")

if [ -n "${prune_list}" ]; then
  printf '%s\n' "${prune_list}" | while read -r image_id; do
    if [ -n "${image_id}" ]; then
      chroot /host /bin/sh -c "crictl rmi --prune ${image_id}" || true
    fi
  done
fi

find /host/var/lib/rancher/k3s/agent/images -type f -name "*.tar" -mtime +7 -print -delete 2>/dev/null || true
find /host/var/lib/rancher/k3s/agent/containerd -maxdepth 1 -type f -mtime +7 -print -delete 2>/dev/null || true

if [ "${ONE_SHOT}" = "true" ]; then
  exit 0
fi

sleep infinity
""".strip()
class ImageSweeperService:
    """Create Kubernetes cleanup jobs that prune stale node images.

    ``run`` posts a ``batch/v1`` Job that executes ``_IMAGE_SWEEPER_SCRIPT``
    in a privileged container with the node's root filesystem mounted at
    ``/host``, and can optionally block until the Job reports an outcome.
    """

    # Seconds to wait between two Job status polls.
    _POLL_INTERVAL_SEC = 2.0

    def _job_payload(self, job_name: str) -> dict[str, Any]:
        """Build the Job manifest for a single sweep.

        Args:
            job_name: Value used for ``metadata.name``.

        Returns:
            A ``batch/v1`` Job manifest as a plain dictionary.
        """
        job: dict[str, Any] = {
            "apiVersion": "batch/v1",
            "kind": "Job",
            "metadata": {
                "name": job_name,
                "namespace": settings.image_sweeper_namespace,
                "labels": {
                    "app": "image-sweeper",
                    "atlas.bstein.dev/trigger": "ariadne",
                },
            },
            "spec": {
                "backoffLimit": 0,
                "ttlSecondsAfterFinished": settings.image_sweeper_job_ttl_sec,
                "template": {
                    "spec": {
                        "serviceAccountName": settings.image_sweeper_service_account,
                        # NOTE(review): combined with backoffLimit 0 this may
                        # cause the pod to be removed on the first failure;
                        # confirm "OnFailure" (rather than "Never") is wanted.
                        "restartPolicy": "OnFailure",
                        # Only linux/arm64 nodes labelled as workers qualify.
                        "nodeSelector": {
                            "kubernetes.io/os": "linux",
                            "kubernetes.io/arch": "arm64",
                            "node-role.kubernetes.io/worker": "true",
                        },
                        # Allow scheduling onto nodes carrying the
                        # control-plane / master NoSchedule taints.
                        "tolerations": [
                            {
                                "key": "node-role.kubernetes.io/control-plane",
                                "operator": "Exists",
                                "effect": "NoSchedule",
                            },
                            {
                                "key": "node-role.kubernetes.io/master",
                                "operator": "Exists",
                                "effect": "NoSchedule",
                            },
                        ],
                        "containers": [
                            {
                                "name": "image-sweeper",
                                "image": "python:3.12.9-alpine3.20",
                                "command": ["/bin/sh", "-c"],
                                "args": [_IMAGE_SWEEPER_SCRIPT],
                                # ONE_SHOT makes the script exit instead of
                                # sleeping forever once the sweep is done.
                                "env": [
                                    {"name": "ONE_SHOT", "value": "true"},
                                ],
                                # The script chroots into the host and
                                # deletes host files, hence root + privileged.
                                "securityContext": {"privileged": True, "runAsUser": 0},
                                "volumeMounts": [
                                    {"name": "host-root", "mountPath": "/host"},
                                ],
                            }
                        ],
                        "volumes": [
                            {"name": "host-root", "hostPath": {"path": "/"}},
                        ],
                    }
                },
            },
        }
        return job

    def _wait_for_completion(self, job_name: str, timeout_sec: float) -> dict[str, Any]:
        """Poll the Job until it succeeds, fails, or the timeout elapses.

        Args:
            job_name: Name of the Job to poll in the sweeper namespace.
            timeout_sec: Maximum time to keep polling. A value of zero or
                less returns immediately without querying the API.

        Returns:
            ``{"job": job_name, "status": s}`` where ``s`` is ``"ok"`` when
            the Job reports a succeeded pod, ``"error"`` when it reports a
            failed pod, and ``"running"`` when the timeout expired first.
        """
        # A monotonic clock keeps the deadline immune to wall-clock jumps.
        deadline = time.monotonic() + timeout_sec
        while time.monotonic() < deadline:
            job = get_json(
                f"/apis/batch/v1/namespaces/{settings.image_sweeper_namespace}/jobs/{job_name}"
            )
            status = job.get("status") if isinstance(job, dict) else None
            if not isinstance(status, dict):
                status = {}
            if int(status.get("succeeded") or 0) > 0:
                return {"job": job_name, "status": "ok"}
            if int(status.get("failed") or 0) > 0:
                return {"job": job_name, "status": "error"}
            # Never sleep past the deadline.
            remaining = deadline - time.monotonic()
            if remaining > 0:
                time.sleep(min(self._POLL_INTERVAL_SEC, remaining))
        return {"job": job_name, "status": "running"}

    def run(self, wait: bool = True) -> dict[str, Any]:
        """Create a sweeper Job and optionally wait for its outcome.

        Args:
            wait: When true, poll the Job for up to
                ``settings.image_sweeper_wait_timeout_sec`` seconds.

        Returns:
            ``{"job": name, "status": "ok"}`` after a successful wait, or
            ``{"job": name, "status": "queued"}`` when ``wait`` is false.

        Raises:
            RuntimeError: When ``wait`` is true and the Job failed or did
                not finish before the timeout.
        """
        # The name carries a whole-second timestamp.
        job_name = f"image-sweeper-{int(time.time())}"
        payload = self._job_payload(job_name)
        created = post_json(
            f"/apis/batch/v1/namespaces/{settings.image_sweeper_namespace}/jobs",
            payload,
        )
        # Prefer the name echoed by the API; fall back to the requested one
        # when the response is missing, malformed, or carries an empty name.
        metadata = created.get("metadata") if isinstance(created, dict) else None
        echoed_name = metadata.get("name") if isinstance(metadata, dict) else None
        name = echoed_name or job_name
        logger.info(
            "image sweeper job triggered",
            extra={"event": "image_sweeper_trigger", "job": name},
        )
        if not wait:
            return {"job": name, "status": "queued"}

        result = self._wait_for_completion(name, settings.image_sweeper_wait_timeout_sec)
        status = result.get("status")
        if status != "ok":
            logger.error(
                "image sweeper job incomplete",
                extra={"event": "image_sweeper_incomplete", "job": name, "status": status},
            )
            raise RuntimeError(f"image sweeper job {name} {status}")
        return result
# Module-level service instance, constructed when this module is imported.
image_sweeper = ImageSweeperService()