diff --git a/scripts/cluster_power_recovery.sh b/scripts/cluster_power_recovery.sh
index f444fd5e..c39d094b 100755
--- a/scripts/cluster_power_recovery.sh
+++ b/scripts/cluster_power_recovery.sh
@@ -1039,6 +1039,78 @@ recovery_flux_unsuspended_list() {
   done
 }
 
+# Wait (up to 90 seconds) until no kustomize-controller pods are listed in the
+# flux-system namespace.
+# Globals:   EXECUTE (read) - the wait is skipped unless it equals 1.
+# Returns:   0 once a successful pod listing comes back empty (or when the
+#            wait is skipped); 1, after a warning, if the deadline passes.
+wait_for_kustomize_controller_scaled_down() {
+  [[ "${EXECUTE}" -eq 1 ]] || return 0
+
+  local deadline pods
+  deadline=$((SECONDS + 90))
+  while (( SECONDS < deadline )); do
+    # Only an empty listing from a *successful* query proves the pods are gone;
+    # a failed kubectl call also yields empty output and must keep polling.
+    if pods="$(kubectl -n flux-system get pods -l app=kustomize-controller -o jsonpath='{range .items[*]}{.metadata.name}{"\n"}{end}' 2>/dev/null)" \
+      && [[ -z "${pods//[[:space:]]/}" ]]; then
+      return 0
+    fi
+    sleep 2
+  done
+
+  warn "Timed out waiting for kustomize-controller pods to terminate before final Flux suspend reassertion."
+  return 1
+}
+
+# Controller-stop suspend finalization: scale kustomize-controller to zero,
+# re-apply the Flux suspend patches, scale the controller back up, then check
+# recovery_flux_unsuspended_list for anything still reported.
+# Globals:   EXECUTE, RECOVERY_FLUX_SUSPEND_BOOTSTRAP_KUSTOMIZATION,
+#            RECOVERY_FLUX_SUSPEND_VERIFY_SLEEP_SECONDS (all read).
+# Returns:   0 when verified (or when EXECUTE is not 1); 1 if the deployment
+#            is missing or the unsuspended list is still non-empty.
+force_recovery_flux_suspend_with_controller_stop() {
+  [[ "${EXECUTE}" -eq 1 ]] || return 0
+
+  if ! kubectl -n flux-system get deployment kustomize-controller >/dev/null 2>&1; then
+    warn "kustomize-controller deployment not found; cannot use controller-stop Flux suspend finalization."
+    return 1
+  fi
+
+  # Remember the configured replica count so the controller is restored to it
+  # instead of being forced to 1; fall back to 1 when it is unreadable or zero.
+  local replicas
+  replicas="$(kubectl -n flux-system get deployment kustomize-controller -o jsonpath='{.spec.replicas}' 2>/dev/null || true)"
+  if [[ ! "${replicas}" =~ ^[1-9][0-9]*$ ]]; then
+    replicas=1
+  fi
+
+  warn "Stopping kustomize-controller for final Flux suspend reassertion."
+  run kubectl -n flux-system scale deployment kustomize-controller --replicas=0
+  # Best effort: the helper already warns on timeout, and the suspend patches
+  # below are applied either way.
+  wait_for_kustomize_controller_scaled_down || true
+
+  if [[ "${RECOVERY_FLUX_SUSPEND_BOOTSTRAP_KUSTOMIZATION}" == "1" || "${RECOVERY_FLUX_SUSPEND_BOOTSTRAP_KUSTOMIZATION}" == "true" ]]; then
+    patch_kustomization_suspend flux-system true
+  fi
+  patch_recovery_optional_flux_suspend_without_snapshot true
+
+  run kubectl -n flux-system scale deployment kustomize-controller --replicas="${replicas}"
+  kubectl -n flux-system rollout status deployment/kustomize-controller --timeout=2m || warn "kustomize-controller did not become Ready after final Flux suspend reassertion."
+  sleep "${RECOVERY_FLUX_SUSPEND_VERIFY_SLEEP_SECONDS}"
+
+  local unsuspended
+  unsuspended="$(recovery_flux_unsuspended_list | paste -sd, -)"
+  if [[ -n "${unsuspended}" ]]; then
+    warn "Flux suspend state is still not stable after controller-stop finalization: ${unsuspended}"
+    return 1
+  fi
+
+  log "recovery-flux-suspend=verified-controller-stop"
+}
+
 stabilize_recovery_flux_suspend() {
   [[ "${EXECUTE}" -eq 1 ]] || return 0
 
@@ -1061,7 +1133,7 @@ stabilize_recovery_flux_suspend() {
   unsuspended="$(recovery_flux_unsuspended_list | paste -sd, -)"
   if [[ -n "${unsuspended}" ]]; then
     warn "Flux suspend state is still not stable after verification attempts: ${unsuspended}"
-    return 1
+    force_recovery_flux_suspend_with_controller_stop
   fi
 }
 
@@ -1522,7 +1594,6 @@ resume_deadlock_automation_after_core_recovery() {
   restart_kustomize_controller_for_critical_thaw
   annotate_flux_kustomizations "${RECOVERY_FLUX_CRITICAL_KUSTOMIZATIONS}" || true
   stabilize_recovery_flux_suspend || true
-  restart_kustomize_controller_for_critical_thaw
   mark_checkpoint longhorn_unlock_automation_resumed
 }
 