diff --git a/services/maintenance/kustomization.yaml b/services/maintenance/kustomization.yaml
index c37b2d85..2df777ab 100644
--- a/services/maintenance/kustomization.yaml
+++ b/services/maintenance/kustomization.yaml
@@ -26,6 +26,9 @@ resources:
   - metis-deployment.yaml
   - soteria-deployment.yaml
   - oneoffs/ariadne-migrate-job.yaml
+  - oneoffs/titan-24-rootfs-sweep-job.yaml
+  - oneoffs/titan-24-lesavka-desktop-helper-job.yaml
+  - oneoffs/titan-24-lesavka-desktop-helper-cleanup-job.yaml
   - ariadne-service.yaml
   - soteria-service.yaml
   - disable-k3s-traefik-daemonset.yaml
diff --git a/services/maintenance/oneoffs/titan-24-lesavka-desktop-helper-cleanup-job.yaml b/services/maintenance/oneoffs/titan-24-lesavka-desktop-helper-cleanup-job.yaml
new file mode 100644
index 00000000..fdabd210
--- /dev/null
+++ b/services/maintenance/oneoffs/titan-24-lesavka-desktop-helper-cleanup-job.yaml
@@ -0,0 +1,61 @@
+# services/maintenance/oneoffs/titan-24-lesavka-desktop-helper-cleanup-job.yaml
+# One-off cleanup for the temporary Lesavka paste-test desktop on titan-24.
+# To run it: set spec.suspend to false, reconcile, then set spec.suspend back to true.
+apiVersion: batch/v1
+kind: Job
+metadata:
+  name: titan-24-lesavka-desktop-helper-cleanup
+  namespace: maintenance
+  annotations:
+    kustomize.toolkit.fluxcd.io/force: "true"
+spec:
+  suspend: true  # parked by default; flip to false to actually run the cleanup
+  backoffLimit: 1
+  ttlSecondsAfterFinished: 3600
+  template:
+    metadata:
+      labels:
+        app: titan-24-lesavka-desktop-helper-cleanup
+    spec:
+      restartPolicy: Never
+      nodeSelector:
+        kubernetes.io/hostname: titan-24
+      tolerations:
+        - key: node.kubernetes.io/not-ready
+          operator: Exists
+          effect: NoExecute
+          tolerationSeconds: 300
+        - key: node.kubernetes.io/unreachable
+          operator: Exists
+          effect: NoExecute
+          tolerationSeconds: 300
+      hostPID: true  # needed so nsenter can target the host's PID 1
+      containers:
+        - name: cleanup
+          image: debian:13-slim
+          securityContext:
+            privileged: true
+            runAsUser: 0
+          command: ["/bin/bash", "-c"]  # bash, not sh: the script relies on `set -o pipefail`
+          args:
+            - |
+              set -euo pipefail
+              chroot /host /usr/bin/env bash <<'EOS'
+              set -euo pipefail
+              username="lesavka-test"
+              rm -f /etc/sddm.conf.d/60-lesavka-test-autologin.conf  # drop autologin before the display manager restarts
+              if id "${username}" >/dev/null 2>&1; then
+                loginctl terminate-user "${username}" >/dev/null 2>&1 || true
+                userdel -r "${username}" >/dev/null 2>&1 || true  # best effort: a failure here does not fail the Job
+              fi
+              EOS
+
+              nsenter -t 1 -m -u -i -n -p -- systemctl restart sddm || \
+                nsenter -t 1 -m -u -i -n -p -- systemctl restart display-manager
+          volumeMounts:
+            - name: host-root
+              mountPath: /host
+      volumes:
+        - name: host-root
+          hostPath:
+            path: /
diff --git a/services/maintenance/oneoffs/titan-24-lesavka-desktop-helper-job.yaml b/services/maintenance/oneoffs/titan-24-lesavka-desktop-helper-job.yaml
new file mode 100644
index 00000000..a8c78f2f
--- /dev/null
+++ b/services/maintenance/oneoffs/titan-24-lesavka-desktop-helper-job.yaml
@@ -0,0 +1,118 @@
+# services/maintenance/oneoffs/titan-24-lesavka-desktop-helper-job.yaml
+# One-off job to create a temporary autologin desktop for Lesavka paste testing on titan-24.
+# Safe to delete the finished Job/pod after it succeeds.
+apiVersion: batch/v1
+kind: Job
+metadata:
+  name: titan-24-lesavka-desktop-helper
+  namespace: maintenance
+  annotations:
+    kustomize.toolkit.fluxcd.io/force: "true"
+spec:
+  backoffLimit: 1
+  ttlSecondsAfterFinished: 3600  # NOTE(review): once the TTL deletes the Job, Flux may re-create and re-run it (restarting sddm again) - confirm
+  template:
+    metadata:
+      labels:
+        app: titan-24-lesavka-desktop-helper
+    spec:
+      restartPolicy: OnFailure
+      nodeSelector:
+        kubernetes.io/hostname: titan-24
+      tolerations:
+        - key: node.kubernetes.io/not-ready
+          operator: Exists
+          effect: NoExecute
+          tolerationSeconds: 300
+        - key: node.kubernetes.io/unreachable
+          operator: Exists
+          effect: NoExecute
+          tolerationSeconds: 300
+      hostPID: true  # needed so nsenter can target the host's PID 1
+      containers:
+        - name: setup
+          image: debian:13-slim
+          securityContext:
+            privileged: true
+            runAsUser: 0
+          command: ["/bin/bash", "-c"]  # bash, not sh: the script relies on `set -o pipefail`
+          args:
+            - |
+              set -euo pipefail
+              chroot /host /usr/bin/env bash <<'EOS'
+              set -euo pipefail
+              username="lesavka-test"
+              home="/home/${username}"
+              session_name="plasmax11.desktop"
+
+              if ! id "${username}" >/dev/null 2>&1; then
+                useradd -m -s /bin/bash "${username}"
+              fi
+              passwd -l "${username}" >/dev/null 2>&1 || true  # lock the password; the account is reached via autologin only
+
+              for group in audio video render input netdev plugdev; do
+                if getent group "${group}" >/dev/null 2>&1; then
+                  usermod -a -G "${group}" "${username}"
+                fi
+              done
+
+              install -d -m 755 /etc/sddm.conf.d
+              printf '%s\n' \
+                '[Autologin]' \
+                "User=${username}" \
+                "Session=${session_name}" \
+                'Relogin=false' \
+                >/etc/sddm.conf.d/60-lesavka-test-autologin.conf
+
+              install -d -o "${username}" -g "${username}" -m 700 \
+                "${home}/.config/autostart" \
+                "${home}/.local/bin"
+              install -o "${username}" -g "${username}" -m 644 /dev/null "${home}/lesavka-paste-test.txt"
+
+              printf '%s\n' \
+                '#!/usr/bin/env bash' \
+                'set -euo pipefail' \
+                'cd "${HOME}"' \
+                'touch "${HOME}/lesavka-paste-test.txt"' \
+                'if command -v kate >/dev/null 2>&1; then' \
+                '  exec kate "${HOME}/lesavka-paste-test.txt"' \
+                'fi' \
+                'if command -v kwrite >/dev/null 2>&1; then' \
+                '  exec kwrite "${HOME}/lesavka-paste-test.txt"' \
+                'fi' \
+                'if command -v gedit >/dev/null 2>&1; then' \
+                '  exec gedit "${HOME}/lesavka-paste-test.txt"' \
+                'fi' \
+                'if command -v mousepad >/dev/null 2>&1; then' \
+                '  exec mousepad "${HOME}/lesavka-paste-test.txt"' \
+                'fi' \
+                'if command -v xterm >/dev/null 2>&1; then' \
+                '  exec xterm -fa Monospace -fs 14 -e sh -lc "exec ${EDITOR:-vi} '\''${HOME}/lesavka-paste-test.txt'\''"' \
+                'fi' \
+                'exit 0' \
+                >"${home}/.local/bin/lesavka-test-launch.sh"
+              chmod 755 "${home}/.local/bin/lesavka-test-launch.sh"
+
+              printf '%s\n' \
+                '[Desktop Entry]' \
+                'Type=Application' \
+                'Version=1.0' \
+                'Name=Lesavka Paste Test' \
+                'Comment=Open a visible editor for Lesavka clipboard testing' \
+                "Exec=${home}/.local/bin/lesavka-test-launch.sh" \
+                'Terminal=false' \
+                'X-GNOME-Autostart-enabled=true' \
+                >"${home}/.config/autostart/lesavka-test.desktop"
+
+              chown -R "${username}:${username}" "${home}/.config" "${home}/.local" "${home}/lesavka-paste-test.txt"
+              EOS
+
+              nsenter -t 1 -m -u -i -n -p -- systemctl restart sddm || \
+                nsenter -t 1 -m -u -i -n -p -- systemctl restart display-manager
+          volumeMounts:
+            - name: host-root
+              mountPath: /host
+      volumes:
+        - name: host-root
+          hostPath:
+            path: /
diff --git a/services/maintenance/oneoffs/titan-24-rootfs-sweep-job.yaml b/services/maintenance/oneoffs/titan-24-rootfs-sweep-job.yaml
new file mode 100644
index 00000000..9552f110
--- /dev/null
+++ b/services/maintenance/oneoffs/titan-24-rootfs-sweep-job.yaml
@@ -0,0 +1,64 @@
+# services/maintenance/oneoffs/titan-24-rootfs-sweep-job.yaml
+# One-off emergency cleanup for titan-24 rootfs pressure.
+# Safe to delete the finished Job/pod after it succeeds.
+apiVersion: batch/v1
+kind: Job
+metadata:
+  name: titan-24-rootfs-sweep
+  namespace: maintenance
+  annotations:
+    kustomize.toolkit.fluxcd.io/force: "true"
+spec:
+  backoffLimit: 1
+  ttlSecondsAfterFinished: 3600  # NOTE(review): once the TTL deletes the Job, Flux may re-create and re-run it - confirm
+  template:
+    metadata:
+      labels:
+        app: titan-24-rootfs-sweep
+    spec:
+      restartPolicy: OnFailure
+      nodeSelector:
+        kubernetes.io/hostname: titan-24
+      tolerations:
+        - key: node.kubernetes.io/not-ready
+          operator: Exists
+          effect: NoExecute
+          tolerationSeconds: 300
+        - key: node.kubernetes.io/unreachable
+          operator: Exists
+          effect: NoExecute
+          tolerationSeconds: 300
+      containers:
+        - name: sweep
+          image: python:3.12.9-alpine3.20
+          command: ["/bin/sh", "/scripts/node_image_sweeper.sh"]
+          env:
+            - name: ONE_SHOT
+              value: "true"
+            - name: HIGH_USAGE_PERCENT
+              value: "0"
+            - name: EMERGENCY_USAGE_PERCENT
+              value: "0"
+            - name: LOG_RETENTION_DAYS
+              value: "1"
+            - name: ORPHAN_POD_RETENTION_DAYS
+              value: "0"  # no grace period: orphaned pod log dirs are removed regardless of age
+            - name: JOURNAL_MAX_SIZE
+              value: "100M"
+          securityContext:
+            privileged: true
+            runAsUser: 0
+          volumeMounts:
+            - name: host-root
+              mountPath: /host
+            - name: script
+              mountPath: /scripts
+              readOnly: true
+      volumes:
+        - name: host-root
+          hostPath:
+            path: /
+        - name: script
+          configMap:
+            name: node-image-sweeper-script
+            defaultMode: 0555
diff --git a/services/maintenance/scripts/node_image_sweeper.sh b/services/maintenance/scripts/node_image_sweeper.sh
index 98eedd8f..38aa3a48 100644
--- a/services/maintenance/scripts/node_image_sweeper.sh
+++ b/services/maintenance/scripts/node_image_sweeper.sh
@@ -51,6 +51,58 @@ for name in hdd_names:
 PY
 }
 
+cleanup_orphaned_root_pod_logs() {
+  # Delete /host/var/log/pods/<namespace>_<pod-name>_<pod-uid> directories whose
+  # pod UID has no entry under /host/var/lib/kubelet/pods and whose mtime is
+  # older than ORPHAN_POD_RETENTION_DAYS (default 3). Prints each removed path.
+  if [ ! -d /host/var/log/pods ] || [ ! -d /host/var/lib/kubelet/pods ]; then
+    return 0
+  fi
+
+  ORPHAN_POD_RETENTION_DAYS="${ORPHAN_POD_RETENTION_DAYS:-3}" python3 - <<'PY'
+import os
+import shutil
+import sys
+import time
+
+root_pods = "/host/var/log/pods"
+active_pods = "/host/var/lib/kubelet/pods"
+retention_days = int(os.environ.get("ORPHAN_POD_RETENTION_DAYS") or "3")
+cutoff = time.time() - (retention_days * 86400)
+
+try:
+    active_uids = set(os.listdir(active_pods))
+except OSError as exc:
+    # Without the active pod list every log directory would look orphaned,
+    # so skip the cleanup instead of deleting logs of running pods.
+    print(f"skipping orphaned pod log cleanup: {exc}", file=sys.stderr)
+    sys.exit(0)
+
+try:
+    root_names = os.listdir(root_pods)
+except OSError:
+    root_names = []
+
+for name in root_names:
+    path = os.path.join(root_pods, name)
+    if not os.path.isdir(path):
+        continue
+    # Log directories are named <namespace>_<pod-name>_<pod-uid>, while
+    # /var/lib/kubelet/pods is keyed by pod UID only: compare the UID part.
+    pod_uid = name.rsplit("_", 1)[-1]
+    if pod_uid in active_uids:
+        continue
+    try:
+        mtime = os.path.getmtime(path)
+    except OSError:
+        continue
+    if mtime > cutoff:
+        continue
+    print(path)
+    shutil.rmtree(path, ignore_errors=True)
+PY
+}
+
 sweep_once() {
   usage=$(df -P /host | awk 'NR==2 {gsub(/%/,"",$5); print $5}') || usage=""
 
@@ -61,6 +113,7 @@ sweep_once() {
   fi
 
   cleanup_orphaned_hdd_pod_logs
+  cleanup_orphaned_root_pod_logs
 
   if [ -d /host/var/log.hdd/pods ]; then
     find /host/var/log.hdd/pods -type f -name "*.log" -mtime +"${LOG_RETENTION_DAYS}" -print -delete 2>/dev/null || true