lesavka/scripts/daemon/lesavka-recovery-ladder.sh

212 lines
5.1 KiB
Bash
Executable File

#!/usr/bin/env bash
# Soft recovery ladder for a deployed Lesavka server.
#
# Usage: <this script> {check|snapshot|restore|recover}
# With no argument the action defaults to "recover".
# Every setting below can be overridden through the named LESAVKA_* environment
# variable; the value after ":-" is the default used when it is unset or empty.

# Abort on unhandled command failures, unset variables, and failed pipeline stages.
set -euo pipefail
# Action selected by the first positional argument; dispatched in main().
ACTION=${1:-recover}
# File that log() appends every message to (messages also go to stderr).
LOG_PATH=${LESAVKA_RECOVERY_LOG:-/var/log/lesavka/recovery-ladder.log}
# Directory holding the last-known-good copies written by snapshot_last_good
# and read back by restore_last_good (under its usr-local-bin/ subdirectory).
LAST_GOOD_DIR=${LESAVKA_RECOVERY_LAST_GOOD_DIR:-/var/lib/lesavka/recovery/last-good}
# How long wait_for_health keeps polling before giving up, in seconds.
CHECK_TIMEOUT_SECONDS=${LESAVKA_RECOVERY_TIMEOUT_SECONDS:-60}
# Opt-in switch for step 4 (restart of lesavka-core as well); falsy values
# such as 0/false/no keep that step disabled.
ALLOW_CORE_RESTART=${LESAVKA_RECOVERY_ALLOW_CORE_RESTART:-0}
# Opt-in switch for step 5 (host reboot); falsy values such as 0/false/no
# keep that step disabled.
ALLOW_REBOOT=${LESAVKA_RECOVERY_ALLOW_REBOOT:-0}
# Bind address of the server; only the text after the last ":" is used, as the
# TCP port probed by listener_ready.
SERVER_BIND_ADDR=${LESAVKA_SERVER_BIND_ADDR:-0.0.0.0:50051}
# Lock file used by locked_main (via flock) to keep runs from overlapping.
LOCK_PATH=${LESAVKA_RECOVERY_LOCK:-/run/lesavka-recovery-ladder.lock}
# Live entrypoints that must be non-empty executables for the host to count as
# healthy; these are the files that get snapshotted and restored.
LIVE_FILES=(
/usr/local/bin/lesavka-server
/usr/local/bin/lesavka-uvc
/usr/local/bin/lesavka-core.sh
/usr/local/bin/lesavka-uvc.sh
)
# Make sure the log directory, the last-good directory, and the lock file's
# parent directory exist before anything tries to write there.
mkdir -p "$(dirname "$LOG_PATH")" "$LAST_GOOD_DIR" "$(dirname "$LOCK_PATH")"
# Emit one timestamped message to stderr and append it to LOG_PATH.
# Arguments: the message words (joined with spaces).
# Returns:   0 whenever the stderr write succeeds.
#
# Writing the log file is best-effort: an unwritable or full log location must
# not abort the recovery run (the script runs under `set -e` with pipefail, so
# a failing `tee` stage used to terminate the whole ladder).
log() {
  local stamp line
  # A missing timestamp should never stop logging.
  stamp=$(date -Is 2>/dev/null) || stamp='unknown-time'
  line="$stamp $*"
  printf '%s\n' "$line" >&2
  # Intentional `|| true`: the shell still reports the failed redirection on
  # stderr, but the caller keeps going.
  printf '%s\n' "$line" >>"$LOG_PATH" || true
}
# Run main while making sure only one mutating ladder run is active.
# Globals:   ACTION (read), LOCK_PATH (read); file descriptor 9 holds the lock.
# Arguments: forwarded unchanged to main.
# Returns:   main's status; exits 0 when another run already holds the lock.
#
# The read-only `check` action does not take the lock: if it did, a check
# issued while a recovery run is in progress would be skipped and exit 0,
# which callers would read as "healthy" even though nothing was checked.
locked_main() {
  if [[ $ACTION == check ]]; then
    main "$@"
    return
  fi
  # Keep fd 9 open for the rest of the process so the lock lasts until exit.
  exec 9>"$LOCK_PATH"
  if ! flock -n 9; then
    log "another recovery ladder run is already active; skipping"
    exit 0
  fi
  main "$@"
}
# Succeed only when the path in $1 exists with a size above zero and carries
# execute permission for the current user.
nonempty_executable() {
  local candidate=$1
  [[ -s $candidate ]] || return 1
  [[ -x $candidate ]]
}
# Print the port part of SERVER_BIND_ADDR: everything after the last ":".
# A value without any ":" is printed unchanged.
server_port() {
  local bind_addr port
  bind_addr=$SERVER_BIND_ADDR
  port=${bind_addr##*:}
  printf '%s\n' "$port"
}
# Succeed when `ss` reports a listening TCP socket on the server port.
# If `ss` is not available the probe cannot run, and the listener is treated
# as ready rather than failing the health check.
listener_ready() {
  local listen_port
  listen_port=$(server_port)
  command -v ss >/dev/null 2>&1 || return 0
  ss -ltn "sport = :$listen_port" 2>/dev/null | grep -q ":$listen_port"
}
# Verify every path in LIVE_FILES is a non-empty executable.
# Stops at the first offender, logs it, and returns 1; returns 0 otherwise.
entrypoints_ready() {
  local entry
  for entry in "${LIVE_FILES[@]}"; do
    nonempty_executable "$entry" && continue
    log "unhealthy: $entry is missing, empty, or not executable"
    return 1
  done
}
# Succeed only when the core, UVC, and server units are all active, checked in
# that order. The first inactive unit ends the check and its `systemctl`
# exit status is passed through to the caller.
services_ready() {
  local unit
  for unit in lesavka-core.service lesavka-uvc.service lesavka-server.service; do
    systemctl is-active --quiet "$unit" || return
  done
}
# Overall health probe: entrypoints on disk, then systemd units, then the
# listening socket. The first failing stage short-circuits and its status is
# returned.
health_ready() {
  entrypoints_ready || return
  services_ready || return
  listener_ready
}
# Copy the current live entrypoints into the last-known-good store.
# Globals:   LIVE_FILES (read), LAST_GOOD_DIR (read)
# Returns:   0 when every entrypoint was snapshotted, 1 otherwise.
#
# The entrypoint check is tested explicitly instead of relying on `set -e`,
# which is ignored whenever this function runs inside a condition; without the
# explicit check, broken live files could overwrite the good copies.
# Each file is staged next to its destination and renamed into place, so a
# failed copy (for example a full disk) leaves the previous good copy intact.
snapshot_last_good() {
  local file dest_dir dest staged
  if ! entrypoints_ready; then
    log "snapshot: live entrypoints are not healthy; keeping existing last-known-good copies"
    return 1
  fi
  dest_dir="$LAST_GOOD_DIR/usr-local-bin"
  if ! install -d -m 0755 "$dest_dir"; then
    log "snapshot: cannot create $dest_dir"
    return 1
  fi
  for file in "${LIVE_FILES[@]}"; do
    dest="$dest_dir/$(basename "$file")"
    staged="$dest_dir/.$(basename "$file").tmp.$$"
    if ! install -m 0755 "$file" "$staged" || ! mv -f -- "$staged" "$dest"; then
      rm -f -- "$staged"
      log "snapshot: failed to copy $file to $dest"
      return 1
    fi
  done
  log "snapshot: refreshed last-known-good Lesavka entrypoints"
}
# Put the last-known-good copies back over the live entrypoints.
# Globals:   LIVE_FILES (read), LAST_GOOD_DIR (read)
# Returns:   0 when every entrypoint was restored, 1 otherwise.
#
# Pass 1 verifies that a usable copy exists for every entrypoint before any
# live file is touched. Pass 2 installs them. Install failures are checked
# explicitly: the recovery ladder calls this function as an `if` condition,
# where `set -e` is ignored, so an unchecked failure would be reported as a
# successful restore.
restore_last_good() {
  local file src
  for file in "${LIVE_FILES[@]}"; do
    src="$LAST_GOOD_DIR/usr-local-bin/$(basename "$file")"
    if ! nonempty_executable "$src"; then
      log "restore: missing last-known-good copy for $(basename "$file")"
      return 1
    fi
  done
  for file in "${LIVE_FILES[@]}"; do
    src="$LAST_GOOD_DIR/usr-local-bin/$(basename "$file")"
    if ! install -m 0755 "$src" "$file"; then
      log "restore: failed to install last-known-good copy over $file"
      return 1
    fi
  done
  log "restore: restored last-known-good Lesavka entrypoints"
}
# Poll health_ready until it succeeds or CHECK_TIMEOUT_SECONDS have elapsed.
# Globals:   CHECK_TIMEOUT_SECONDS (read), SECONDS (bash timer, read)
# Returns:   0 as soon as the host is healthy, 1 once the deadline has passed.
#
# Health is always probed at least once. The pause between probes is at most
# 5 seconds and is shortened to the time remaining, so the function never
# blocks past the configured timeout (it previously slept a full interval
# after the final failed probe).
wait_for_health() {
  local deadline remaining
  local -r poll_seconds=5
  deadline=$((SECONDS + CHECK_TIMEOUT_SECONDS))
  while true; do
    if health_ready; then
      return 0
    fi
    remaining=$((deadline - SECONDS))
    if ((remaining <= 0)); then
      return 1
    fi
    sleep "$((remaining < poll_seconds ? remaining : poll_seconds))"
  done
}
# Ladder step 1: restart just the lesavka-server unit.
# Clearing the unit's failed state is best-effort; the function's status is
# that of the restart itself.
restart_server_only() {
  local -r unit=lesavka-server.service
  log "step 1: restarting lesavka-server only"
  systemctl reset-failed "$unit" >/dev/null 2>&1 || true
  systemctl restart "$unit"
}
# Ladder step 2: restart the UVC helper first, then the server.
# Failed states are cleared best-effort beforehand. The units are restarted
# in order and the function's status is that of the last restart.
restart_uvc_and_server() {
  local -a units=(lesavka-uvc.service lesavka-server.service)
  local unit
  log "step 2: restarting UVC helper and server"
  systemctl reset-failed "${units[@]}" >/dev/null 2>&1 || true
  for unit in "${units[@]}"; do
    systemctl restart "$unit"
  done
}
# Ladder step 4: restart core, UVC helper, and server, but only when the
# operator opted in through LESAVKA_RECOVERY_ALLOW_CORE_RESTART.
# Globals:   ALLOW_CORE_RESTART (read)
# Returns:   1 when the step is disabled; otherwise the status of the final
#            restart.
#
# Falsy values are matched case-insensitively and include "off" and "n", so
# a setting such as "False" or "OFF" can no longer enable a core restart by
# accident. Per the log message, skipping the step keeps the attached USB
# gadget in place.
restart_full_stack_if_allowed() {
  case "${ALLOW_CORE_RESTART,,}" in
    '' | 0 | false | no | off | n)
      log "step 4: core restart disabled; preserving attached USB gadget"
      return 1
      ;;
  esac
  log "step 4: restarting full stack because LESAVKA_RECOVERY_ALLOW_CORE_RESTART is enabled"
  # Best-effort: clearing failed state must not block the restarts.
  systemctl reset-failed lesavka-core.service lesavka-uvc.service lesavka-server.service >/dev/null 2>&1 || true
  systemctl restart lesavka-core.service
  systemctl restart lesavka-uvc.service
  systemctl restart lesavka-server.service
}
# Ladder step 5: reboot the host, but only when the operator opted in
# through LESAVKA_RECOVERY_ALLOW_REBOOT.
# Globals:   ALLOW_REBOOT (read)
# Returns:   1 when rebooting is disabled; otherwise the status of
#            `systemctl reboot`.
#
# Falsy values are matched case-insensitively and include "off" and "n", so
# a setting such as "False" or "OFF" can no longer trigger a reboot by
# accident.
reboot_if_allowed() {
  case "${ALLOW_REBOOT,,}" in
    '' | 0 | false | no | off | n)
      log "step 5: reboot disabled; leaving host online for operator inspection"
      return 1
      ;;
  esac
  log "step 5: rebooting because LESAVKA_RECOVERY_ALLOW_REBOOT is enabled"
  systemctl reboot
}
# Walk the recovery ladder, stopping at the first rung that restores health.
#
#   0. Already healthy, or healthy after a grace wait: snapshot and return.
#   1. Restart the server only.
#   2. Restart the UVC helper and the server.
#   3. Restore last-known-good entrypoints, then repeat step 2.
#   4. Restart the full stack (only if enabled).
#   5. Reboot (only if enabled).
#
# Every successful rung refreshes the last-known-good snapshot. Restart
# failures are tolerated so that the following health wait decides the result.
# Returns 0 on recovery (or when a reboot was issued), 1 when the ladder ran
# out of permitted steps.
recover() {
  local step

  if health_ready; then
    snapshot_last_good
    log "healthy: no recovery needed"
    return 0
  fi

  log "unhealthy: waiting up to ${CHECK_TIMEOUT_SECONDS}s before recovery"
  if wait_for_health; then
    snapshot_last_good
    log "healthy after wait: no recovery needed"
    return 0
  fi

  # Steps 1 and 2 share the same shape: try the restart, then wait.
  for step in restart_server_only restart_uvc_and_server; do
    "$step" || true
    if wait_for_health; then
      snapshot_last_good
      return 0
    fi
  done

  log "step 3: restoring last-known-good entrypoints"
  if restore_last_good; then
    restart_uvc_and_server || true
    if wait_for_health; then
      snapshot_last_good
      return 0
    fi
  fi

  restart_full_stack_if_allowed || true
  if wait_for_health; then
    snapshot_last_good
    return 0
  fi

  if ! reboot_if_allowed; then
    return 1
  fi
}
# Dispatch on ACTION: resolve the handler for the requested action, then run
# it as the final command so its status becomes main's status. An unknown
# action prints usage to stderr and exits with status 2.
main() {
  local handler
  case "$ACTION" in
    check) handler=health_ready ;;
    snapshot) handler=snapshot_last_good ;;
    restore) handler=restore_last_good ;;
    recover) handler=recover ;;
    *)
      echo "usage: $0 {check|snapshot|restore|recover}" >&2
      exit 2
      ;;
  esac
  "$handler"
}
# Script entry point: run main through locked_main, which handles the
# LOCK_PATH run lock; the script's arguments are forwarded unchanged.
locked_main "$@"