212 lines
5.1 KiB
Bash
Executable File
212 lines
5.1 KiB
Bash
Executable File
#!/usr/bin/env bash
|
|
# Soft recovery ladder for a deployed Lesavka server.
#
# Usage: <script> [check|snapshot|restore|recover]   (default: recover)
#
# Each setting below takes its value from the environment variable named in
# its expansion and falls back to the literal default when that variable is
# unset or empty.
set -o errexit -o nounset -o pipefail

# Action to perform; first positional argument, "recover" when omitted.
ACTION="${1:-recover}"

# File every log() line is appended to.
LOG_PATH="${LESAVKA_RECOVERY_LOG:-/var/log/lesavka/recovery-ladder.log}"

# Directory holding the last-known-good copies of the entrypoints.
LAST_GOOD_DIR="${LESAVKA_RECOVERY_LAST_GOOD_DIR:-/var/lib/lesavka/recovery/last-good}"

# Seconds wait_for_health keeps polling before it gives up.
CHECK_TIMEOUT_SECONDS="${LESAVKA_RECOVERY_TIMEOUT_SECONDS:-60}"

# 0/false/no forbids restarting lesavka-core.service (ladder step 4).
ALLOW_CORE_RESTART="${LESAVKA_RECOVERY_ALLOW_CORE_RESTART:-0}"

# 0/false/no forbids rebooting the host (ladder step 5).
ALLOW_REBOOT="${LESAVKA_RECOVERY_ALLOW_REBOOT:-0}"

# host:port the server binds to; only the port part is used by the health check.
SERVER_BIND_ADDR="${LESAVKA_SERVER_BIND_ADDR:-0.0.0.0:50051}"

# Lock file that keeps two ladder runs from overlapping.
LOCK_PATH="${LESAVKA_RECOVERY_LOCK:-/run/lesavka-recovery-ladder.lock}"
|
|
|
|
# Deployed entrypoints: each must be a non-empty executable for the host to
# count as healthy, and each is copied by the snapshot/restore actions.
LIVE_FILES=(
  /usr/local/bin/lesavka-server
  /usr/local/bin/lesavka-uvc
  /usr/local/bin/lesavka-core.sh
  /usr/local/bin/lesavka-uvc.sh
)
|
|
|
|
# Create the directories that hold the log file, the last-known-good snapshot
# and the lock file. Runs at load time, before anything logs or takes the lock.
mkdir -p "$(dirname "$LOG_PATH")" "$LAST_GOOD_DIR" "$(dirname "$LOCK_PATH")"
|
|
|
|
#######################################
# Emit one timestamped log line to stderr and append it to the log file.
# Globals:   LOG_PATH (read)
# Arguments: message words, joined into a single line
# Outputs:   "<ISO-8601 timestamp> <message>" on stderr and in LOG_PATH
# Returns:   always 0
#######################################
log() {
  local line
  printf -v line '%s %s' "$(date -Is)" "$*"
  printf '%s\n' "$line" >&2 || true
  # Appending is best-effort: with errexit and pipefail active, a log file that
  # cannot be written (missing directory, full disk) must not abort the
  # recovery ladder itself. The shell still reports the failed redirection.
  printf '%s\n' "$line" >>"$LOG_PATH" || true
}
|
|
|
|
#######################################
# Run main under an exclusive, non-blocking lock so ladder runs never overlap.
# Globals:   LOCK_PATH (read)
# Arguments: forwarded unchanged to main
# Returns:   main's status; exits 0 without running main when the lock is
#            already held by another run
#######################################
locked_main() {
  # fd 9 stays open for the rest of the script, so the lock is held until the
  # process exits.
  exec 9>"$LOCK_PATH"
  if flock -n 9; then
    main "$@"
    return
  fi
  log "another recovery ladder run is already active; skipping"
  exit 0
}
|
|
|
|
#######################################
# Test whether a path is a usable entrypoint.
# Arguments: $1 - path to inspect
# Returns:   0 when the file exists with size > 0 and is executable, else 1
#######################################
nonempty_executable() {
  local candidate=$1
  [[ -s "$candidate" ]] || return 1
  [[ -x "$candidate" ]]
}
|
|
|
|
#######################################
# Print the port part of the server bind address.
# Globals:   SERVER_BIND_ADDR (read)
# Outputs:   everything after the last ':' of SERVER_BIND_ADDR (the whole value
#            when it contains no ':'), followed by a newline
#######################################
server_port() {
  local bind_addr=$SERVER_BIND_ADDR
  printf '%s\n' "${bind_addr##*:}"
}
|
|
|
|
#######################################
# Check that a TCP listener exists on the server port.
# Returns: 0 when `ss` reports a listening socket on the port, or when `ss` is
#          not installed (the check is then skipped); non-zero otherwise
#######################################
listener_ready() {
  local listen_port
  listen_port=$(server_port)

  # Without ss there is nothing to probe with; treat the listener as ready.
  command -v ss >/dev/null 2>&1 || return 0

  ss -ltn "sport = :$listen_port" 2>/dev/null | grep -q ":$listen_port"
}
|
|
|
|
#######################################
# Verify that every deployed entrypoint is a non-empty executable.
# Globals:   LIVE_FILES (read)
# Outputs:   logs the first offending path
# Returns:   0 when all entrypoints pass, 1 at the first one that does not
#######################################
entrypoints_ready() {
  local entrypoint
  for entrypoint in "${LIVE_FILES[@]}"; do
    nonempty_executable "$entrypoint" && continue
    log "unhealthy: $entrypoint is missing, empty, or not executable"
    return 1
  done
}
|
|
|
|
#######################################
# Check that the core, UVC and server units are all active.
# Returns: 0 when all three are active; otherwise the status of the first
#          failing `systemctl is-active` (later units are not queried)
#######################################
services_ready() {
  local unit
  for unit in lesavka-core.service lesavka-uvc.service lesavka-server.service; do
    systemctl is-active --quiet "$unit" || return
  done
}
|
|
|
|
#######################################
# Overall health probe: entrypoints, then services, then the TCP listener.
# Returns: 0 when every check passes; otherwise the status of the first
#          failing check (later checks are skipped)
#######################################
health_ready() {
  entrypoints_ready || return
  services_ready || return
  listener_ready
}
|
|
|
|
#######################################
# Copy the live entrypoints into the last-known-good directory.
# Globals:   LAST_GOOD_DIR, LIVE_FILES (read)
# Outputs:   log line on success or on a failed copy
# Returns:   0 on success; 1 when the live entrypoints are not healthy or a
#            copy could not be stored
#######################################
snapshot_last_good() {
  local file dest staged
  local snapshot_dir="$LAST_GOOD_DIR/usr-local-bin"

  # Explicit guard instead of relying on errexit: errexit is suppressed when
  # this function runs inside a condition or an ||/&& list, and broken
  # entrypoints must never overwrite the last-known-good copies.
  entrypoints_ready || return 1

  install -d -m 0755 "$snapshot_dir" || return 1
  for file in "${LIVE_FILES[@]}"; do
    dest="$snapshot_dir/$(basename "$file")"
    # Stage next to the destination and rename into place, so an interrupted
    # copy cannot leave a truncated last-known-good file behind.
    staged="$dest.tmp.$$"
    if ! install -m 0755 "$file" "$staged" || ! mv -f "$staged" "$dest"; then
      rm -f "$staged"
      log "snapshot: failed to store a copy of $file"
      return 1
    fi
  done
  log "snapshot: refreshed last-known-good Lesavka entrypoints"
}
|
|
|
|
#######################################
# Reinstall every entrypoint from its last-known-good copy.
# All copies are validated first, so nothing is touched unless a complete
# snapshot is available.
# Globals:   LAST_GOOD_DIR, LIVE_FILES (read)
# Outputs:   log line describing the outcome
# Returns:   0 when every entrypoint was restored; 1 when a copy is missing
#            or an install fails
#######################################
restore_last_good() {
  local file src name
  local snapshot_dir="$LAST_GOOD_DIR/usr-local-bin"

  for file in "${LIVE_FILES[@]}"; do
    name=$(basename "$file")
    src="$snapshot_dir/$name"
    if ! nonempty_executable "$src"; then
      log "restore: missing last-known-good copy for $name"
      return 1
    fi
  done

  for file in "${LIVE_FILES[@]}"; do
    name=$(basename "$file")
    src="$snapshot_dir/$name"
    # Checked explicitly: this function is used as an `if` condition, where
    # errexit is suppressed, so a failed install would otherwise be reported
    # as a successful restore.
    if ! install -m 0755 "$src" "$file"; then
      log "restore: failed to install last-known-good copy for $name"
      return 1
    fi
  done
  log "restore: restored last-known-good Lesavka entrypoints"
}
|
|
|
|
#######################################
# Poll health_ready until it succeeds or the timeout elapses.
# Globals:   CHECK_TIMEOUT_SECONDS (read), SECONDS (read)
# Returns:   0 as soon as health_ready succeeds; 1 once the deadline has
#            passed without a healthy result
#######################################
wait_for_health() {
  local deadline remaining
  deadline=$((SECONDS + CHECK_TIMEOUT_SECONDS))

  while true; do
    if health_ready; then
      return 0
    fi
    remaining=$((deadline - SECONDS))
    if ((remaining <= 0)); then
      return 1
    fi
    # Never sleep past the deadline: poll every 5s, but shorten the last nap
    # so the final check lands on the deadline instead of overshooting it.
    sleep "$((remaining < 5 ? remaining : 5))"
  done
}
|
|
|
|
#######################################
# Ladder step 1: restart only the server unit.
# Outputs: log line announcing the step
# Returns: status of `systemctl restart`
#######################################
restart_server_only() {
  local unit=lesavka-server.service
  log "step 1: restarting lesavka-server only"
  # Clearing the failed state is best-effort; its outcome is ignored.
  systemctl reset-failed "$unit" >/dev/null 2>&1 || true
  systemctl restart "$unit"
}
|
|
|
|
#######################################
# Ladder step 2: restart the UVC helper, then the server.
# Outputs: log line announcing the step
# Returns: status of the last `systemctl restart` (the server unit)
#######################################
restart_uvc_and_server() {
  local -a units=(lesavka-uvc.service lesavka-server.service)
  local unit
  log "step 2: restarting UVC helper and server"
  # Clearing the failed state is best-effort; its outcome is ignored.
  systemctl reset-failed "${units[@]}" >/dev/null 2>&1 || true
  for unit in "${units[@]}"; do
    systemctl restart "$unit"
  done
}
|
|
|
|
#######################################
# Ladder step 4: restart core, UVC and server units, if permitted.
# Globals:   ALLOW_CORE_RESTART (read)
# Outputs:   log line stating whether the step ran
# Returns:   1 when the core restart is disabled; otherwise the status of the
#            last `systemctl restart` (the server unit)
#######################################
restart_full_stack_if_allowed() {
  # Fail closed: any spelling of a negative value (0, false, no, off in any
  # letter case, or an empty value) keeps the core service untouched.
  # Previously only the exact strings 0/false/no disabled the step, so a
  # value such as "off" or "FALSE" enabled it.
  case "$ALLOW_CORE_RESTART" in
    '' | 0 | [Ff][Aa][Ll][Ss][Ee] | [Nn][Oo] | [Oo][Ff][Ff])
      log "step 4: core restart disabled; preserving attached USB gadget"
      return 1
      ;;
  esac

  log "step 4: restarting full stack because LESAVKA_RECOVERY_ALLOW_CORE_RESTART is enabled"
  # Clearing the failed state is best-effort; its outcome is ignored.
  systemctl reset-failed lesavka-core.service lesavka-uvc.service lesavka-server.service >/dev/null 2>&1 || true
  systemctl restart lesavka-core.service
  systemctl restart lesavka-uvc.service
  systemctl restart lesavka-server.service
}
|
|
|
|
#######################################
# Ladder step 5: reboot the host, if permitted.
# Globals:   ALLOW_REBOOT (read)
# Outputs:   log line stating whether the step ran
# Returns:   1 when rebooting is disabled; otherwise the status of
#            `systemctl reboot`
#######################################
reboot_if_allowed() {
  # Fail closed: any spelling of a negative value (0, false, no, off in any
  # letter case, or an empty value) leaves the host running. Previously only
  # the exact strings 0/false/no disabled the step, so a value such as "off"
  # or "FALSE" rebooted the machine.
  case "$ALLOW_REBOOT" in
    '' | 0 | [Ff][Aa][Ll][Ss][Ee] | [Nn][Oo] | [Oo][Ff][Ff])
      log "step 5: reboot disabled; leaving host online for operator inspection"
      return 1
      ;;
  esac

  log "step 5: rebooting because LESAVKA_RECOVERY_ALLOW_REBOOT is enabled"
  systemctl reboot
}
|
|
|
|
#######################################
# Walk the recovery ladder, stopping at the first rung that restores health.
# Rungs: wait -> restart server -> restart UVC + server -> restore
# last-known-good entrypoints and restart -> full-stack restart (if allowed)
# -> reboot (if allowed). Each time health returns, the last-known-good
# snapshot is refreshed.
# Globals:   CHECK_TIMEOUT_SECONDS (read, for the log message)
# Returns:   0 when the host is healthy or a reboot was issued; 1 when every
#            rung failed and rebooting is not allowed
#######################################
recover() {
  # Already healthy: refresh the snapshot and stop.
  if health_ready; then
    snapshot_last_good
    log "healthy: no recovery needed"
    return 0
  fi

  # Grace period: the stack may come back without intervention.
  log "unhealthy: waiting up to ${CHECK_TIMEOUT_SECONDS}s before recovery"
  if wait_for_health; then
    snapshot_last_good
    log "healthy after wait: no recovery needed"
    return 0
  fi

  # Step 1. Restart failures are tolerated; the health wait is the verdict.
  restart_server_only || true
  if wait_for_health; then
    snapshot_last_good
    return 0
  fi

  # Step 2.
  restart_uvc_and_server || true
  if wait_for_health; then
    snapshot_last_good
    return 0
  fi

  # Step 3. Only restart and wait when a complete snapshot was restored.
  log "step 3: restoring last-known-good entrypoints"
  if restore_last_good; then
    restart_uvc_and_server || true
    if wait_for_health; then
      snapshot_last_good
      return 0
    fi
  fi

  # Step 4.
  restart_full_stack_if_allowed || true
  if wait_for_health; then
    snapshot_last_good
    return 0
  fi

  # Step 5. Last resort; report failure when rebooting is not permitted.
  if ! reboot_if_allowed; then
    return 1
  fi
}
|
|
|
|
#######################################
# Dispatch the action selected on the command line.
# Globals:   ACTION (read)
# Returns:   status of the selected action; exits 2 with a usage message on
#            stderr for an unknown action
#######################################
main() {
  local handler
  case "$ACTION" in
    check) handler=health_ready ;;
    snapshot) handler=snapshot_last_good ;;
    restore) handler=restore_last_good ;;
    recover) handler=recover ;;
    *)
      printf '%s\n' "usage: $0 {check|snapshot|restore|recover}" >&2
      exit 2
      ;;
  esac
  "$handler"
}
|
|
|
|
# Entry point: take the single-run lock, then dispatch the requested action.
locked_main "$@"
|