#!/usr/bin/env bash
# Soft recovery ladder for a deployed Lesavka server.
#
# Usage: $0 {check|snapshot|restore|recover}    (default action: recover)
#
#   check     exit 0 when entrypoints, services and listener all look healthy
#   snapshot  copy the live entrypoints into the last-known-good directory
#   restore   copy the last-known-good entrypoints back over the live ones
#   recover   escalate through the recovery steps until health returns
#
# Environment overrides (default in parentheses):
#   LESAVKA_RECOVERY_LOG                 (/var/log/lesavka/recovery-ladder.log)
#   LESAVKA_RECOVERY_LAST_GOOD_DIR       (/var/lib/lesavka/recovery/last-good)
#   LESAVKA_RECOVERY_TIMEOUT_SECONDS     (60)
#   LESAVKA_RECOVERY_ALLOW_CORE_RESTART  (0)
#   LESAVKA_RECOVERY_ALLOW_REBOOT        (0)
#   LESAVKA_SERVER_BIND_ADDR             (0.0.0.0:50051)
#   LESAVKA_RECOVERY_LOCK                (/run/lesavka-recovery-ladder.lock)
set -euo pipefail

ACTION=${1:-recover}
LOG_PATH=${LESAVKA_RECOVERY_LOG:-/var/log/lesavka/recovery-ladder.log}
LAST_GOOD_DIR=${LESAVKA_RECOVERY_LAST_GOOD_DIR:-/var/lib/lesavka/recovery/last-good}
CHECK_TIMEOUT_SECONDS=${LESAVKA_RECOVERY_TIMEOUT_SECONDS:-60}
ALLOW_CORE_RESTART=${LESAVKA_RECOVERY_ALLOW_CORE_RESTART:-0}
ALLOW_REBOOT=${LESAVKA_RECOVERY_ALLOW_REBOOT:-0}
SERVER_BIND_ADDR=${LESAVKA_SERVER_BIND_ADDR:-0.0.0.0:50051}
LOCK_PATH=${LESAVKA_RECOVERY_LOCK:-/run/lesavka-recovery-ladder.lock}

# Entrypoints that must exist, be non-empty and be executable.
LIVE_FILES=(
  /usr/local/bin/lesavka-server
  /usr/local/bin/lesavka-uvc
  /usr/local/bin/lesavka-core.sh
  /usr/local/bin/lesavka-uvc.sh
)

mkdir -p "$(dirname "$LOG_PATH")" "$LAST_GOOD_DIR" "$(dirname "$LOCK_PATH")"

# Write a timestamped message to stderr and append it to LOG_PATH.
# Logging is best-effort: with pipefail + errexit a tee failure (for example
# an unwritable log file) would otherwise abort the ladder mid-recovery.
log() {
  printf '%s %s\n' "$(date -Is)" "$*" | tee -a "$LOG_PATH" >&2 || true
}

# Return 0 when an opt-in flag value is enabled.
# Empty, 0, false, no and off (in any letter case) mean disabled; every other
# value means enabled. Matching case-insensitively keeps values such as
# "False" or "NO" from silently enabling a restart or reboot.
flag_enabled() {
  case ${1:-} in
    '' | 0 | [Ff][Aa][Ll][Ss][Ee] | [Nn][Oo] | [Oo][Ff][Ff]) return 1 ;;
    *) return 0 ;;
  esac
}

# Run main while holding an exclusive, non-blocking lock on LOCK_PATH.
# If the lock is already held, log and exit 0 without doing anything.
locked_main() {
  exec 9>"$LOCK_PATH"
  if ! flock -n 9; then
    log "another recovery ladder run is already active; skipping"
    exit 0
  fi
  main "$@"
}

# Return 0 when $1 is a non-empty, executable file.
nonempty_executable() {
  [[ -s $1 && -x $1 ]]
}

# Print the port part of SERVER_BIND_ADDR (everything after the last ':').
server_port() {
  printf '%s\n' "${SERVER_BIND_ADDR##*:}"
}

# Return 0 when ss reports a listening TCP socket on the server port.
# When ss is not installed the listener is assumed ready.
listener_ready() {
  local port sockets
  port=$(server_port)
  if ! command -v ss >/dev/null 2>&1; then
    return 0
  fi
  # Capture first and match in-shell: piping into `grep -q` under pipefail can
  # report failure when grep exits early and ss is killed by SIGPIPE.
  sockets=$(ss -ltn "sport = :$port" 2>/dev/null) || return 1
  [[ $sockets == *":$port"* ]]
}

# Return 0 when every file in LIVE_FILES is a non-empty executable; otherwise
# log the first offending path and return 1.
entrypoints_ready() {
  local file
  for file in "${LIVE_FILES[@]}"; do
    if ! nonempty_executable "$file"; then
      log "unhealthy: $file is missing, empty, or not executable"
      return 1
    fi
  done
}

# Return 0 when the core, UVC and server units are all active.
services_ready() {
  systemctl is-active --quiet lesavka-core.service \
    && systemctl is-active --quiet lesavka-uvc.service \
    && systemctl is-active --quiet lesavka-server.service
}

# Overall health: entrypoints present, services active, listener up.
health_ready() {
  entrypoints_ready && services_ready && listener_ready
}

# Copy the live entrypoints into LAST_GOOD_DIR/usr-local-bin.
# Returns 1 without copying anything when a live entrypoint is unusable.
snapshot_last_good() {
  local file dest
  # Explicit guard instead of relying on errexit, so broken files can never
  # overwrite the last-known-good copies.
  entrypoints_ready || return 1
  install -d -m 0755 "$LAST_GOOD_DIR/usr-local-bin"
  for file in "${LIVE_FILES[@]}"; do
    dest="$LAST_GOOD_DIR/usr-local-bin/$(basename "$file")"
    install -m 0755 "$file" "$dest"
  done
  log "snapshot: refreshed last-known-good Lesavka entrypoints"
}

# Copy the last-known-good entrypoints back over the live ones.
# Every saved copy is validated first; if any is missing, empty or not
# executable, nothing is restored and the function returns 1.
restore_last_good() {
  local file src
  for file in "${LIVE_FILES[@]}"; do
    src="$LAST_GOOD_DIR/usr-local-bin/$(basename "$file")"
    if ! nonempty_executable "$src"; then
      log "restore: missing last-known-good copy for $(basename "$file")"
      return 1
    fi
  done
  for file in "${LIVE_FILES[@]}"; do
    src="$LAST_GOOD_DIR/usr-local-bin/$(basename "$file")"
    install -m 0755 "$src" "$file"
  done
  log "restore: restored last-known-good Lesavka entrypoints"
}

# Poll health_ready every 5 seconds until it succeeds (return 0) or
# CHECK_TIMEOUT_SECONDS have elapsed (return 1).
wait_for_health() {
  local deadline
  deadline=$((SECONDS + CHECK_TIMEOUT_SECONDS))
  while ((SECONDS <= deadline)); do
    if health_ready; then
      return 0
    fi
    sleep 5
  done
  return 1
}

# Step 1: restart only the server unit.
restart_server_only() {
  log "step 1: restarting lesavka-server only"
  # reset-failed is best-effort; its result does not matter here.
  systemctl reset-failed lesavka-server.service >/dev/null 2>&1 || true
  systemctl restart lesavka-server.service
}

# Step 2: restart the UVC helper, then the server.
restart_uvc_and_server() {
  log "step 2: restarting UVC helper and server"
  systemctl reset-failed lesavka-uvc.service lesavka-server.service >/dev/null 2>&1 || true
  systemctl restart lesavka-uvc.service
  systemctl restart lesavka-server.service
}

# Step 4: restart core, UVC and server, but only when
# LESAVKA_RECOVERY_ALLOW_CORE_RESTART is enabled; otherwise log and return 1.
restart_full_stack_if_allowed() {
  if ! flag_enabled "$ALLOW_CORE_RESTART"; then
    log "step 4: core restart disabled; preserving attached USB gadget"
    return 1
  fi
  log "step 4: restarting full stack because LESAVKA_RECOVERY_ALLOW_CORE_RESTART is enabled"
  systemctl reset-failed lesavka-core.service lesavka-uvc.service lesavka-server.service >/dev/null 2>&1 || true
  systemctl restart lesavka-core.service
  systemctl restart lesavka-uvc.service
  systemctl restart lesavka-server.service
}

# Step 5: reboot the host, but only when LESAVKA_RECOVERY_ALLOW_REBOOT is
# enabled; otherwise log and return 1.
reboot_if_allowed() {
  if ! flag_enabled "$ALLOW_REBOOT"; then
    log "step 5: reboot disabled; leaving host online for operator inspection"
    return 1
  fi
  log "step 5: rebooting because LESAVKA_RECOVERY_ALLOW_REBOOT is enabled"
  systemctl reboot
}

# Walk the recovery ladder, stopping at the first step after which health
# returns. Each successful outcome refreshes the last-known-good snapshot.
# Returns 1 when every step failed and a reboot is not permitted.
recover() {
  if health_ready; then
    snapshot_last_good
    log "healthy: no recovery needed"
    return 0
  fi

  log "unhealthy: waiting up to ${CHECK_TIMEOUT_SECONDS}s before recovery"
  if wait_for_health; then
    snapshot_last_good
    log "healthy after wait: no recovery needed"
    return 0
  fi

  # The `|| true` on each step keeps a failed restart from aborting the
  # ladder; the health check that follows decides whether to escalate.
  restart_server_only || true
  if wait_for_health; then
    snapshot_last_good
    return 0
  fi

  restart_uvc_and_server || true
  if wait_for_health; then
    snapshot_last_good
    return 0
  fi

  log "step 3: restoring last-known-good entrypoints"
  if restore_last_good; then
    restart_uvc_and_server || true
    if wait_for_health; then
      snapshot_last_good
      return 0
    fi
  fi

  restart_full_stack_if_allowed || true
  if wait_for_health; then
    snapshot_last_good
    return 0
  fi

  reboot_if_allowed || return 1
}

# Dispatch on ACTION; an unknown action prints usage and exits 2.
main() {
  case "$ACTION" in
    check)
      health_ready
      ;;
    snapshot)
      snapshot_last_good
      ;;
    restore)
      restore_last_good
      ;;
    recover)
      recover
      ;;
    *)
      printf 'usage: %s {check|snapshot|restore|recover}\n' "$0" >&2
      exit 2
      ;;
  esac
}

locked_main "$@"