maintenance(titan-24): add desktop helper and rootfs sweep
This commit is contained in:
parent
fb43b02b2a
commit
c55d5ac3b5
@ -26,6 +26,9 @@ resources:
|
||||
- metis-deployment.yaml
|
||||
- soteria-deployment.yaml
|
||||
- oneoffs/ariadne-migrate-job.yaml
|
||||
- oneoffs/titan-24-rootfs-sweep-job.yaml
|
||||
- oneoffs/titan-24-lesavka-desktop-helper-job.yaml
|
||||
- oneoffs/titan-24-lesavka-desktop-helper-cleanup-job.yaml
|
||||
- ariadne-service.yaml
|
||||
- soteria-service.yaml
|
||||
- disable-k3s-traefik-daemonset.yaml
|
||||
|
||||
@ -0,0 +1,61 @@
|
||||
# services/maintenance/oneoffs/titan-24-lesavka-desktop-helper-cleanup-job.yaml
# One-off cleanup for the temporary Lesavka paste-test desktop on titan-24.
# Run by setting spec.suspend to false, reconcile, then set it back to true.
---
apiVersion: batch/v1
kind: Job
metadata:
  name: titan-24-lesavka-desktop-helper-cleanup
  namespace: maintenance
  annotations:
    # Force Flux to recreate the (immutable) Job on spec changes.
    kustomize.toolkit.fluxcd.io/force: "true"
spec:
  # Parked by default; flip suspend to false to run it once.
  suspend: true
  backoffLimit: 1
  ttlSecondsAfterFinished: 3600
  template:
    metadata:
      labels:
        app: titan-24-lesavka-desktop-helper-cleanup
    spec:
      restartPolicy: Never
      nodeSelector:
        kubernetes.io/hostname: titan-24
      tolerations:
        - key: node.kubernetes.io/not-ready
          operator: Exists
          effect: NoExecute
          tolerationSeconds: 300
        - key: node.kubernetes.io/unreachable
          operator: Exists
          effect: NoExecute
          tolerationSeconds: 300
      # hostPID lets the container nsenter into host PID 1 (systemd) below.
      hostPID: true
      containers:
        - name: cleanup
          image: debian:13-slim
          securityContext:
            privileged: true
            runAsUser: 0
          command: ["/bin/sh", "-c"]
          args:
            - |
              # FIX: /bin/sh in debian-slim is dash; "pipefail" is not a POSIX
              # `set -o` option and aborts dash at startup, so the outer script
              # must stick to `set -eu`. The chroot'ed heredoc runs under bash
              # and keeps the full `-euo pipefail`.
              set -eu
              chroot /host /usr/bin/env bash <<'EOS'
              set -euo pipefail
              username="lesavka-test"
              rm -f /etc/sddm.conf.d/60-lesavka-test-autologin.conf
              if id "${username}" >/dev/null 2>&1; then
                loginctl terminate-user "${username}" >/dev/null 2>&1 || true
                userdel -r "${username}" >/dev/null 2>&1 || true
              fi
              EOS

              nsenter -t 1 -m -u -i -n -p -- systemctl restart sddm || \
                nsenter -t 1 -m -u -i -n -p -- systemctl restart display-manager
          volumeMounts:
            - name: host-root
              mountPath: /host
      volumes:
        - name: host-root
          hostPath:
            path: /
|
||||
@ -0,0 +1,118 @@
|
||||
# services/maintenance/oneoffs/titan-24-lesavka-desktop-helper-job.yaml
# One-off job to create a temporary autologin desktop for Lesavka paste testing on titan-24.
# Safe to delete the finished Job/pod after it succeeds.
---
apiVersion: batch/v1
kind: Job
metadata:
  name: titan-24-lesavka-desktop-helper
  namespace: maintenance
  annotations:
    # Force Flux to recreate the (immutable) Job on spec changes.
    kustomize.toolkit.fluxcd.io/force: "true"
spec:
  backoffLimit: 1
  ttlSecondsAfterFinished: 3600
  template:
    metadata:
      labels:
        app: titan-24-lesavka-desktop-helper
    spec:
      restartPolicy: OnFailure
      nodeSelector:
        kubernetes.io/hostname: titan-24
      tolerations:
        - key: node.kubernetes.io/not-ready
          operator: Exists
          effect: NoExecute
          tolerationSeconds: 300
        - key: node.kubernetes.io/unreachable
          operator: Exists
          effect: NoExecute
          tolerationSeconds: 300
      # hostPID lets the container nsenter into host PID 1 (systemd) below.
      hostPID: true
      containers:
        - name: setup
          image: debian:13-slim
          securityContext:
            privileged: true
            runAsUser: 0
          command: ["/bin/sh", "-c"]
          args:
            - |
              # FIX: /bin/sh in debian-slim is dash; "pipefail" is not a POSIX
              # `set -o` option and aborts dash at startup, so the outer script
              # must stick to `set -eu`. The chroot'ed heredoc runs under bash
              # and keeps the full `-euo pipefail`.
              set -eu
              chroot /host /usr/bin/env bash <<'EOS'
              set -euo pipefail
              username="lesavka-test"
              home="/home/${username}"
              session_name="plasmax11.desktop"

              if ! id "${username}" >/dev/null 2>&1; then
                useradd -m -s /bin/bash "${username}"
              fi
              passwd -l "${username}" >/dev/null 2>&1 || true

              for group in audio video render input netdev plugdev; do
                if getent group "${group}" >/dev/null 2>&1; then
                  usermod -a -G "${group}" "${username}"
                fi
              done

              install -d -m 755 /etc/sddm.conf.d
              printf '%s\n' \
                '[Autologin]' \
                "User=${username}" \
                "Session=${session_name}" \
                'Relogin=false' \
                >/etc/sddm.conf.d/60-lesavka-test-autologin.conf

              install -d -o "${username}" -g "${username}" -m 700 \
                "${home}/.config/autostart" \
                "${home}/.local/bin"
              install -o "${username}" -g "${username}" -m 644 /dev/null "${home}/lesavka-paste-test.txt"

              printf '%s\n' \
                '#!/usr/bin/env bash' \
                'set -euo pipefail' \
                'cd "${HOME}"' \
                'touch "${HOME}/lesavka-paste-test.txt"' \
                'if command -v kate >/dev/null 2>&1; then' \
                ' exec kate "${HOME}/lesavka-paste-test.txt"' \
                'fi' \
                'if command -v kwrite >/dev/null 2>&1; then' \
                ' exec kwrite "${HOME}/lesavka-paste-test.txt"' \
                'fi' \
                'if command -v gedit >/dev/null 2>&1; then' \
                ' exec gedit "${HOME}/lesavka-paste-test.txt"' \
                'fi' \
                'if command -v mousepad >/dev/null 2>&1; then' \
                ' exec mousepad "${HOME}/lesavka-paste-test.txt"' \
                'fi' \
                'if command -v xterm >/dev/null 2>&1; then' \
                ' exec xterm -fa Monospace -fs 14 -e sh -lc "exec ${EDITOR:-vi} '\''${HOME}/lesavka-paste-test.txt'\''"' \
                'fi' \
                'exit 0' \
                >"${home}/.local/bin/lesavka-test-launch.sh"
              chmod 755 "${home}/.local/bin/lesavka-test-launch.sh"

              printf '%s\n' \
                '[Desktop Entry]' \
                'Type=Application' \
                'Version=1.0' \
                'Name=Lesavka Paste Test' \
                'Comment=Open a visible editor for Lesavka clipboard testing' \
                'Exec=/home/lesavka-test/.local/bin/lesavka-test-launch.sh' \
                'Terminal=false' \
                'X-GNOME-Autostart-enabled=true' \
                >"${home}/.config/autostart/lesavka-test.desktop"

              chown -R "${username}:${username}" "${home}/.config" "${home}/.local" "${home}/lesavka-paste-test.txt"
              EOS

              nsenter -t 1 -m -u -i -n -p -- systemctl restart sddm || \
                nsenter -t 1 -m -u -i -n -p -- systemctl restart display-manager
          volumeMounts:
            - name: host-root
              mountPath: /host
      volumes:
        - name: host-root
          hostPath:
            path: /
|
||||
64
services/maintenance/oneoffs/titan-24-rootfs-sweep-job.yaml
Normal file
64
services/maintenance/oneoffs/titan-24-rootfs-sweep-job.yaml
Normal file
@ -0,0 +1,64 @@
|
||||
# services/maintenance/oneoffs/titan-24-rootfs-sweep-job.yaml
# One-off emergency cleanup for titan-24 rootfs pressure.
# Safe to delete the finished Job/pod after it succeeds.
---
apiVersion: batch/v1
kind: Job
metadata:
  name: titan-24-rootfs-sweep
  namespace: maintenance
  annotations:
    # Force Flux to recreate the (immutable) Job on spec changes.
    kustomize.toolkit.fluxcd.io/force: "true"
spec:
  backoffLimit: 1
  ttlSecondsAfterFinished: 3600
  template:
    metadata:
      labels:
        app: titan-24-rootfs-sweep
    spec:
      restartPolicy: OnFailure
      nodeSelector:
        kubernetes.io/hostname: titan-24
      tolerations:
        - key: node.kubernetes.io/not-ready
          operator: Exists
          effect: NoExecute
          tolerationSeconds: 300
        - key: node.kubernetes.io/unreachable
          operator: Exists
          effect: NoExecute
          tolerationSeconds: 300
      containers:
        - name: sweep
          image: python:3.12.9-alpine3.20
          # Runs the shared sweeper script mounted from the ConfigMap below.
          command: ["/bin/sh", "/scripts/node_image_sweeper.sh"]
          env:
            - name: ONE_SHOT
              value: "true"
            # NOTE(review): "0" thresholds presumably disable the usage gate
            # so the sweep always runs — confirm against node_image_sweeper.sh.
            - name: HIGH_USAGE_PERCENT
              value: "0"
            - name: EMERGENCY_USAGE_PERCENT
              value: "0"
            - name: LOG_RETENTION_DAYS
              value: "1"
            - name: ORPHAN_POD_RETENTION_DAYS
              value: "0"
            - name: JOURNAL_MAX_SIZE
              value: "100M"
          securityContext:
            privileged: true
            runAsUser: 0
          volumeMounts:
            - name: host-root
              mountPath: /host
            - name: script
              mountPath: /scripts
              readOnly: true
      volumes:
        - name: host-root
          hostPath:
            path: /
        - name: script
          configMap:
            name: node-image-sweeper-script
            # Unquoted leading-zero literal: YAML 1.1 octal, i.e. 0o555 (r-x).
            defaultMode: 0555
|
||||
@ -51,6 +51,48 @@ for name in hdd_names:
|
||||
PY
|
||||
}
|
||||
|
||||
# Delete /host/var/log/pods entries whose pod no longer exists on this node
# and whose directory mtime is older than ORPHAN_POD_RETENTION_DAYS.
# FIX: /var/log/pods directories are named "<namespace>_<pod-name>_<pod-uid>"
# while /var/lib/kubelet/pods entries are bare pod UIDs, so comparing the
# full directory name against the kubelet set never matched — an active,
# long-running pod's log dir could be swept once past the retention cutoff.
# Match on the trailing UID component instead.
cleanup_orphaned_root_pod_logs() {
  if [ ! -d /host/var/log/pods ] || [ ! -d /host/var/lib/kubelet/pods ]; then
    return 0
  fi

  ORPHAN_POD_RETENTION_DAYS="${ORPHAN_POD_RETENTION_DAYS}" python3 - <<'PY'
import os
import shutil
import time

root_pods = "/host/var/log/pods"
active_pods = "/host/var/lib/kubelet/pods"
retention_days = int(os.environ.get("ORPHAN_POD_RETENTION_DAYS", "3"))
cutoff = time.time() - (retention_days * 86400)

try:
    active_uids = set(os.listdir(active_pods))
except Exception:
    active_uids = set()

try:
    root_names = os.listdir(root_pods)
except Exception:
    root_names = []

for name in root_names:
    path = os.path.join(root_pods, name)
    if not os.path.isdir(path):
        continue
    # Log dir name is "<namespace>_<pod-name>_<pod-uid>"; kubelet tracks
    # live pods by bare UID, so compare only the trailing component.
    uid = name.rsplit("_", 1)[-1]
    if uid in active_uids:
        continue
    try:
        mtime = os.path.getmtime(path)
    except Exception:
        continue
    if mtime > cutoff:
        continue
    print(path)
    shutil.rmtree(path, ignore_errors=True)
PY
}
|
||||
|
||||
sweep_once() {
|
||||
usage=$(df -P /host | awk 'NR==2 {gsub(/%/,"",$5); print $5}') || usage=""
|
||||
|
||||
@ -61,6 +103,7 @@ sweep_once() {
|
||||
fi
|
||||
|
||||
cleanup_orphaned_hdd_pod_logs
|
||||
cleanup_orphaned_root_pod_logs
|
||||
|
||||
if [ -d /host/var/log.hdd/pods ]; then
|
||||
find /host/var/log.hdd/pods -type f -name "*.log" -mtime +"${LOG_RETENTION_DAYS}" -print -delete 2>/dev/null || true
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user