diff --git a/scripts/cluster_power_recovery.sh b/scripts/cluster_power_recovery.sh
index 30078f84..f444fd5e 100755
--- a/scripts/cluster_power_recovery.sh
+++ b/scripts/cluster_power_recovery.sh
@@ -141,6 +141,8 @@ RECOVERY_FLUX_CRITICAL_KUSTOMIZATIONS="${RECOVERY_FLUX_CRITICAL_KUSTOMIZATIONS:-
 RECOVERY_FLUX_OPTIONAL_SNAPSHOT_FILE="${RECOVERY_FLUX_OPTIONAL_SNAPSHOT_FILE:-${HOME}/${STATE_SUBDIR:-.local/share/ananke}/longhorn_unlock_optional_flux.tsv}"
 RECOVERY_FLUX_RESTART_KUSTOMIZE_CONTROLLER="${RECOVERY_FLUX_RESTART_KUSTOMIZE_CONTROLLER:-1}"
 RECOVERY_FLUX_SUSPEND_BOOTSTRAP_KUSTOMIZATION="${RECOVERY_FLUX_SUSPEND_BOOTSTRAP_KUSTOMIZATION:-1}"
+RECOVERY_FLUX_SUSPEND_VERIFY_ATTEMPTS="${RECOVERY_FLUX_SUSPEND_VERIFY_ATTEMPTS:-6}"
+RECOVERY_FLUX_SUSPEND_VERIFY_SLEEP_SECONDS="${RECOVERY_FLUX_SUSPEND_VERIFY_SLEEP_SECONDS:-10}"
 STARTUP_SERVICE_CHECK_TIMEOUT_SECONDS="${STARTUP_SERVICE_CHECK_TIMEOUT_SECONDS:-10}"
 STARTUP_SERVICE_CHECKLIST="${STARTUP_SERVICE_CHECKLIST:-}"
 STARTUP_INCLUDE_INGRESS_CHECKS="${STARTUP_INCLUDE_INGRESS_CHECKS:-1}"
@@ -1008,6 +1010,85 @@ patch_recovery_optional_flux_suspend() {
   done < <(csv_each "${RECOVERY_FLUX_OPTIONAL_KUSTOMIZATIONS}")
 }
 
+# Call patch_kustomization_suspend with suspend value $1 for every entry of
+# RECOVERY_FLUX_OPTIONAL_KUSTOMIZATIONS; this function itself records no
+# snapshot of the previous state.
+patch_recovery_optional_flux_suspend_without_snapshot() {
+  local value="$1"
+  local name
+  while IFS= read -r name; do
+    patch_kustomization_suspend "${name}" "${value}"
+  done < <(csv_each "${RECOVERY_FLUX_OPTIONAL_KUSTOMIZATIONS}")
+}
+
+# Print, one per line, every recovery-managed kustomization in the flux-system
+# namespace whose .spec.suspend is not confirmed to be "true": flux-system
+# itself when RECOVERY_FLUX_SUSPEND_BOOTSTRAP_KUSTOMIZATION is 1/true, plus
+# each entry of RECOVERY_FLUX_OPTIONAL_KUSTOMIZATIONS. Kustomizations that do
+# not exist are skipped; a failed lookup is reported as unsuspended so an
+# unreachable API server is never mistaken for a verified suspend.
+recovery_flux_unsuspended_list() {
+  local names=()
+  local name state
+  if [[ "${RECOVERY_FLUX_SUSPEND_BOOTSTRAP_KUSTOMIZATION}" == "1" || "${RECOVERY_FLUX_SUSPEND_BOOTSTRAP_KUSTOMIZATION}" == "true" ]]; then
+    names+=("flux-system")
+  fi
+  while IFS= read -r name; do
+    names+=("${name}")
+  done < <(csv_each "${RECOVERY_FLUX_OPTIONAL_KUSTOMIZATIONS}")
+
+  # Nothing to check; also avoids expanding an empty array, which bash
+  # releases before 4.4 reject under set -u.
+  (( ${#names[@]} > 0 )) || return 0
+
+  for name in "${names[@]}"; do
+    # --ignore-not-found makes kubectl exit 0 with no output when the object
+    # is absent, so a non-zero status here means the query itself failed.
+    # Output is "<name>=<suspend>", with <suspend> empty when the field is unset.
+    if ! state="$(kubectl -n flux-system get kustomization "${name}" --ignore-not-found -o jsonpath='{.metadata.name}={.spec.suspend}' 2>/dev/null)"; then
+      printf '%s\n' "${name}"
+      continue
+    fi
+    [[ -n "${state}" ]] || continue
+    if [[ "${state#*=}" != "true" ]]; then
+      printf '%s\n' "${name}"
+    fi
+  done
+}
+
+# Re-assert the recovery Flux suspends and verify that they hold.
+# Does nothing unless EXECUTE is 1. Each attempt suspends flux-system (when
+# RECOVERY_FLUX_SUSPEND_BOOTSTRAP_KUSTOMIZATION is 1/true) and the optional
+# kustomizations, sleeps RECOVERY_FLUX_SUSPEND_VERIFY_SLEEP_SECONDS, then
+# consults recovery_flux_unsuspended_list; at most
+# RECOVERY_FLUX_SUSPEND_VERIFY_ATTEMPTS attempts are made.
+# Returns: 0 once nothing is reported unsuspended, 1 if some still are.
+stabilize_recovery_flux_suspend() {
+  [[ "${EXECUTE}" -eq 1 ]] || return 0
+
+  local attempt unsuspended
+  for attempt in $(seq 1 "${RECOVERY_FLUX_SUSPEND_VERIFY_ATTEMPTS}"); do
+    if [[ "${RECOVERY_FLUX_SUSPEND_BOOTSTRAP_KUSTOMIZATION}" == "1" || "${RECOVERY_FLUX_SUSPEND_BOOTSTRAP_KUSTOMIZATION}" == "true" ]]; then
+      patch_kustomization_suspend flux-system true
+    fi
+    patch_recovery_optional_flux_suspend_without_snapshot true
+    sleep "${RECOVERY_FLUX_SUSPEND_VERIFY_SLEEP_SECONDS}"
+
+    unsuspended="$(recovery_flux_unsuspended_list | paste -sd, -)"
+    if [[ -z "${unsuspended}" ]]; then
+      log "recovery-flux-suspend=verified attempts=${attempt}"
+      return 0
+    fi
+    warn "Flux suspend state was overwritten during recovery thaw; reasserting attempt ${attempt}/${RECOVERY_FLUX_SUSPEND_VERIFY_ATTEMPTS}: ${unsuspended}"
+  done
+
+  unsuspended="$(recovery_flux_unsuspended_list | paste -sd, -)"
+  if [[ -n "${unsuspended}" ]]; then
+    warn "Flux suspend state is still not stable after verification attempts: ${unsuspended}"
+    return 1
+  fi
+}
+
 restore_recovery_optional_flux_suspend() {
   [[ -f "${RECOVERY_FLUX_OPTIONAL_SNAPSHOT_FILE}" ]] || return 0
   local name suspend
@@ -1464,6 +1545,8 @@ resume_deadlock_automation_after_core_recovery() {
   patch_recovery_optional_flux_suspend true
   restart_kustomize_controller_for_critical_thaw
   annotate_flux_kustomizations "${RECOVERY_FLUX_CRITICAL_KUSTOMIZATIONS}" || true
+  stabilize_recovery_flux_suspend || true
+  restart_kustomize_controller_for_critical_thaw
   mark_checkpoint longhorn_unlock_automation_resumed
 }
 