maintenance(titan-24): add desktop helper and rootfs sweep

This commit is contained in:
Brad Stein 2026-04-15 22:24:24 -03:00
parent fb43b02b2a
commit c55d5ac3b5
5 changed files with 289 additions and 0 deletions

View File

@ -26,6 +26,9 @@ resources:
- metis-deployment.yaml
- soteria-deployment.yaml
- oneoffs/ariadne-migrate-job.yaml
- oneoffs/titan-24-rootfs-sweep-job.yaml
- oneoffs/titan-24-lesavka-desktop-helper-job.yaml
- oneoffs/titan-24-lesavka-desktop-helper-cleanup-job.yaml
- ariadne-service.yaml
- soteria-service.yaml
- disable-k3s-traefik-daemonset.yaml

View File

@ -0,0 +1,61 @@
# services/maintenance/oneoffs/titan-24-lesavka-desktop-helper-cleanup-job.yaml
# One-off cleanup for the temporary Lesavka paste-test desktop on titan-24.
# To run it: set spec.suspend to false, let the change reconcile, then set it
# back to true so the Job does not fire again on the next re-apply.
apiVersion: batch/v1
kind: Job
metadata:
  name: titan-24-lesavka-desktop-helper-cleanup
  namespace: maintenance
  annotations:
    # Jobs are immutable; this tells Flux to delete-and-recreate on re-apply.
    kustomize.toolkit.fluxcd.io/force: "true"
spec:
  # Suspended by default: the cleanup only runs when explicitly enabled.
  suspend: true
  backoffLimit: 1
  # Garbage-collect the finished Job and its pod after one hour.
  ttlSecondsAfterFinished: 3600
  template:
    metadata:
      labels:
        app: titan-24-lesavka-desktop-helper-cleanup
    spec:
      restartPolicy: Never
      # Must run on the node that hosts the throwaway desktop session.
      nodeSelector:
        kubernetes.io/hostname: titan-24
      # Allow the pod to stay schedulable for 5 min even if the node flaps.
      tolerations:
        - key: node.kubernetes.io/not-ready
          operator: Exists
          effect: NoExecute
          tolerationSeconds: 300
        - key: node.kubernetes.io/unreachable
          operator: Exists
          effect: NoExecute
          tolerationSeconds: 300
      # hostPID lets nsenter target host PID 1 (systemd) for the DM restart.
      hostPID: true
      containers:
        - name: cleanup
          image: debian:13-slim
          # Privileged root is required to chroot into the host rootfs and to
          # enter the host namespaces via nsenter.
          securityContext:
            privileged: true
            runAsUser: 0
          command: ["/bin/sh", "-c"]
          # The script below: removes the SDDM autologin drop-in, then (best
          # effort) terminates and deletes the test user, then restarts the
          # display manager on the host so the greeter comes back.
          args:
            - |
              set -euo pipefail
              chroot /host /usr/bin/env bash <<'EOS'
              set -euo pipefail
              username="lesavka-test"
              rm -f /etc/sddm.conf.d/60-lesavka-test-autologin.conf
              if id "${username}" >/dev/null 2>&1; then
                loginctl terminate-user "${username}" >/dev/null 2>&1 || true
                userdel -r "${username}" >/dev/null 2>&1 || true
              fi
              EOS
              nsenter -t 1 -m -u -i -n -p -- systemctl restart sddm || \
                nsenter -t 1 -m -u -i -n -p -- systemctl restart display-manager
          volumeMounts:
            - name: host-root
              mountPath: /host
      volumes:
        - name: host-root
          hostPath:
            path: /

View File

@ -0,0 +1,118 @@
# services/maintenance/oneoffs/titan-24-lesavka-desktop-helper-job.yaml
# One-off job to create a temporary autologin desktop for Lesavka paste testing on titan-24.
# Safe to delete the finished Job/pod after it succeeds.
# Counterpart: titan-24-lesavka-desktop-helper-cleanup-job.yaml undoes all of this.
apiVersion: batch/v1
kind: Job
metadata:
  name: titan-24-lesavka-desktop-helper
  namespace: maintenance
  annotations:
    # Jobs are immutable; this tells Flux to delete-and-recreate on re-apply.
    kustomize.toolkit.fluxcd.io/force: "true"
spec:
  backoffLimit: 1
  # Garbage-collect the finished Job and its pod after one hour.
  ttlSecondsAfterFinished: 3600
  template:
    metadata:
      labels:
        app: titan-24-lesavka-desktop-helper
    spec:
      restartPolicy: OnFailure
      # Must run on the node whose desktop session is being set up.
      nodeSelector:
        kubernetes.io/hostname: titan-24
      # Allow the pod to stay schedulable for 5 min even if the node flaps.
      tolerations:
        - key: node.kubernetes.io/not-ready
          operator: Exists
          effect: NoExecute
          tolerationSeconds: 300
        - key: node.kubernetes.io/unreachable
          operator: Exists
          effect: NoExecute
          tolerationSeconds: 300
      # hostPID lets nsenter target host PID 1 (systemd) for the DM restart.
      hostPID: true
      containers:
        - name: setup
          image: debian:13-slim
          # Privileged root is required to chroot into the host rootfs and to
          # enter the host namespaces via nsenter.
          securityContext:
            privileged: true
            runAsUser: 0
          command: ["/bin/sh", "-c"]
          # The script below (run in a chroot of the host rootfs): creates a
          # locked-password test user, grants it common desktop groups, drops
          # an SDDM autologin config for it, installs an autostart launcher
          # that opens the first available GUI editor on a scratch file, then
          # restarts the display manager so autologin takes effect.
          args:
            - |
              set -euo pipefail
              chroot /host /usr/bin/env bash <<'EOS'
              set -euo pipefail
              username="lesavka-test"
              home="/home/${username}"
              session_name="plasmax11.desktop"
              if ! id "${username}" >/dev/null 2>&1; then
                useradd -m -s /bin/bash "${username}"
              fi
              passwd -l "${username}" >/dev/null 2>&1 || true
              for group in audio video render input netdev plugdev; do
                if getent group "${group}" >/dev/null 2>&1; then
                  usermod -a -G "${group}" "${username}"
                fi
              done
              install -d -m 755 /etc/sddm.conf.d
              printf '%s\n' \
                '[Autologin]' \
                "User=${username}" \
                "Session=${session_name}" \
                'Relogin=false' \
                >/etc/sddm.conf.d/60-lesavka-test-autologin.conf
              install -d -o "${username}" -g "${username}" -m 700 \
                "${home}/.config/autostart" \
                "${home}/.local/bin"
              install -o "${username}" -g "${username}" -m 644 /dev/null "${home}/lesavka-paste-test.txt"
              printf '%s\n' \
                '#!/usr/bin/env bash' \
                'set -euo pipefail' \
                'cd "${HOME}"' \
                'touch "${HOME}/lesavka-paste-test.txt"' \
                'if command -v kate >/dev/null 2>&1; then' \
                ' exec kate "${HOME}/lesavka-paste-test.txt"' \
                'fi' \
                'if command -v kwrite >/dev/null 2>&1; then' \
                ' exec kwrite "${HOME}/lesavka-paste-test.txt"' \
                'fi' \
                'if command -v gedit >/dev/null 2>&1; then' \
                ' exec gedit "${HOME}/lesavka-paste-test.txt"' \
                'fi' \
                'if command -v mousepad >/dev/null 2>&1; then' \
                ' exec mousepad "${HOME}/lesavka-paste-test.txt"' \
                'fi' \
                'if command -v xterm >/dev/null 2>&1; then' \
                ' exec xterm -fa Monospace -fs 14 -e sh -lc "exec ${EDITOR:-vi} '\''${HOME}/lesavka-paste-test.txt'\''"' \
                'fi' \
                'exit 0' \
                >"${home}/.local/bin/lesavka-test-launch.sh"
              chmod 755 "${home}/.local/bin/lesavka-test-launch.sh"
              printf '%s\n' \
                '[Desktop Entry]' \
                'Type=Application' \
                'Version=1.0' \
                'Name=Lesavka Paste Test' \
                'Comment=Open a visible editor for Lesavka clipboard testing' \
                'Exec=/home/lesavka-test/.local/bin/lesavka-test-launch.sh' \
                'Terminal=false' \
                'X-GNOME-Autostart-enabled=true' \
                >"${home}/.config/autostart/lesavka-test.desktop"
              chown -R "${username}:${username}" "${home}/.config" "${home}/.local" "${home}/lesavka-paste-test.txt"
              EOS
              nsenter -t 1 -m -u -i -n -p -- systemctl restart sddm || \
                nsenter -t 1 -m -u -i -n -p -- systemctl restart display-manager
          volumeMounts:
            - name: host-root
              mountPath: /host
      volumes:
        - name: host-root
          hostPath:
            path: /

View File

@ -0,0 +1,64 @@
# services/maintenance/oneoffs/titan-24-rootfs-sweep-job.yaml
# One-off emergency cleanup for titan-24 rootfs pressure.
# Safe to delete the finished Job/pod after it succeeds.
apiVersion: batch/v1
kind: Job
metadata:
  name: titan-24-rootfs-sweep
  namespace: maintenance
  annotations:
    # Jobs are immutable; this tells Flux to delete-and-recreate on re-apply.
    kustomize.toolkit.fluxcd.io/force: "true"
spec:
  backoffLimit: 1
  # Garbage-collect the finished Job and its pod after one hour.
  ttlSecondsAfterFinished: 3600
  template:
    metadata:
      labels:
        app: titan-24-rootfs-sweep
    spec:
      restartPolicy: OnFailure
      # Must run on the node whose rootfs is under pressure.
      nodeSelector:
        kubernetes.io/hostname: titan-24
      # Allow the pod to stay schedulable for 5 min even if the node flaps.
      tolerations:
        - key: node.kubernetes.io/not-ready
          operator: Exists
          effect: NoExecute
          tolerationSeconds: 300
        - key: node.kubernetes.io/unreachable
          operator: Exists
          effect: NoExecute
          tolerationSeconds: 300
      containers:
        - name: sweep
          image: python:3.12.9-alpine3.20
          # Runs the shared sweeper script mounted from the ConfigMap below.
          command: ["/bin/sh", "/scripts/node_image_sweeper.sh"]
          # All values are quoted strings on purpose: env vars must be strings.
          env:
            # NOTE(review): presumably makes the script run a single pass and
            # exit instead of looping — confirm against node_image_sweeper.sh.
            - name: ONE_SHOT
              value: "true"
            # Thresholds set to 0 so the sweep triggers regardless of the
            # current df usage (this is an emergency cleanup).
            - name: HIGH_USAGE_PERCENT
              value: "0"
            - name: EMERGENCY_USAGE_PERCENT
              value: "0"
            - name: LOG_RETENTION_DAYS
              value: "1"
            # 0 days: delete orphaned pod-log dirs regardless of their age.
            - name: ORPHAN_POD_RETENTION_DAYS
              value: "0"
            - name: JOURNAL_MAX_SIZE
              value: "100M"
          # Privileged root is required to touch the host rootfs mounted at /host.
          securityContext:
            privileged: true
            runAsUser: 0
          volumeMounts:
            - name: host-root
              mountPath: /host
            - name: script
              mountPath: /scripts
              readOnly: true
      volumes:
        - name: host-root
          hostPath:
            path: /
        - name: script
          configMap:
            name: node-image-sweeper-script
            # Octal 0555 (r-xr-xr-x) so the mounted script is executable.
            defaultMode: 0555

View File

@ -51,6 +51,48 @@ for name in hdd_names:
PY
}
cleanup_orphaned_root_pod_logs() {
  # Delete pod-log directories under /host/var/log/pods whose pod no longer
  # exists under /host/var/lib/kubelet/pods and whose directory mtime is older
  # than ORPHAN_POD_RETENTION_DAYS (default 3) days. Best effort: listing and
  # deletion failures are swallowed so the sweep never aborts here.
  if [ ! -d /host/var/log/pods ] || [ ! -d /host/var/lib/kubelet/pods ]; then
    return 0
  fi
  # Re-assign on the command line so python3 sees the var even if unexported.
  ORPHAN_POD_RETENTION_DAYS="${ORPHAN_POD_RETENTION_DAYS}" python3 - <<'PY'
import os
import shutil
import time

# Kubelet names log dirs "<namespace>_<podname>_<pod-uid>" but names its
# state dirs under /var/lib/kubelet/pods by the bare pod UID, so liveness
# must be checked on the trailing UID token, not the whole dir name.
root_pods = "/host/var/log/pods"
active_pods = "/host/var/lib/kubelet/pods"

# Tolerate an unset OR empty env var: int("") would raise ValueError and,
# under the caller's `set -e`, abort the whole sweep.
try:
    retention_days = int(os.environ.get("ORPHAN_POD_RETENTION_DAYS") or "3")
except ValueError:
    retention_days = 3
cutoff = time.time() - (retention_days * 86400)

try:
    active_names = set(os.listdir(active_pods))
except Exception:
    active_names = set()
try:
    root_names = os.listdir(root_pods)
except Exception:
    root_names = []

for name in root_names:
    path = os.path.join(root_pods, name)
    if not os.path.isdir(path):
        continue
    # Compare on the trailing UID; also keep the raw-name check as a
    # conservative guard for any non-standard directory naming.
    uid = name.rsplit("_", 1)[-1]
    if name in active_names or uid in active_names:
        continue
    try:
        mtime = os.path.getmtime(path)
    except Exception:
        continue
    if mtime > cutoff:
        continue
    print(path)  # log each directory before removing it
    shutil.rmtree(path, ignore_errors=True)
PY
}
sweep_once() {
usage=$(df -P /host | awk 'NR==2 {gsub(/%/,"",$5); print $5}') || usage=""
@ -61,6 +103,7 @@ sweep_once() {
fi
cleanup_orphaned_hdd_pod_logs
cleanup_orphaned_root_pod_logs
if [ -d /host/var/log.hdd/pods ]; then
find /host/var/log.hdd/pods -type f -name "*.log" -mtime +"${LOG_RETENTION_DAYS}" -print -delete 2>/dev/null || true