recovery(ananke): finalize flux holds without races

This commit is contained in:
jenkins 2026-06-18 19:11:22 -03:00
parent e893af2a55
commit 32681728c0

View File

@@ -1039,6 +1039,54 @@ recovery_flux_unsuspended_list() {
done
}
#######################################
# Poll until no pods labelled app=kustomize-controller remain in the
# flux-system namespace.
# Globals:   EXECUTE (read) - the function is a no-op unless it equals 1.
# Arguments: none
# Outputs:   a warning through warn() when the wait times out.
# Returns:   0 once a *successful* pod listing comes back empty (or when
#            EXECUTE is not 1); 1 if that has not happened after roughly
#            90 seconds.
#######################################
wait_for_kustomize_controller_scaled_down() {
  [[ "${EXECUTE}" -eq 1 ]] || return 0
  local deadline pods
  deadline=$((SECONDS + 90))
  while (( SECONDS < deadline )); do
    # An empty listing only proves the pods are gone when kubectl itself
    # succeeded. A failed query also produces empty output, so it is retried
    # instead of being reported as "scaled down".
    if pods="$(kubectl -n flux-system get pods -l app=kustomize-controller -o jsonpath='{range .items[*]}{.metadata.name}{"\n"}{end}' 2>/dev/null)" \
      && [[ -z "${pods//[[:space:]]/}" ]]; then
      return 0
    fi
    sleep 2
  done
  warn "Timed out waiting for kustomize-controller pods to terminate before final Flux suspend reassertion."
  return 1
}
#######################################
# Last-resort Flux suspend finalization: stop kustomize-controller, re-apply
# the suspend patches while it is down, bring it back, then verify that
# nothing is reported as unsuspended.
# Globals:
#   EXECUTE (read) - the function is a no-op unless it equals 1.
#   RECOVERY_FLUX_SUSPEND_BOOTSTRAP_KUSTOMIZATION (read) - "1" or "true"
#     also patches the flux-system Kustomization.
#   RECOVERY_FLUX_SUSPEND_VERIFY_SLEEP_SECONDS (read) - settle time before
#     the final verification.
# Arguments: none
# Outputs:   progress and failures through warn(); a success marker
#            through log().
# Returns:   0 when verification finds nothing unsuspended (or EXECUTE is
#            not 1); 1 when the deployment cannot be read or when entries
#            are still reported as unsuspended.
#######################################
force_recovery_flux_suspend_with_controller_stop() {
  [[ "${EXECUTE}" -eq 1 ]] || return 0
  local replicas unsuspended

  # One query doubles as the existence check and records the replica count
  # so the deployment can be restored to what it was, not to a fixed 1.
  if ! replicas="$(kubectl -n flux-system get deployment kustomize-controller -o jsonpath='{.spec.replicas}' 2>/dev/null)"; then
    warn "kustomize-controller deployment not found; cannot use controller-stop Flux suspend finalization."
    return 1
  fi
  # A count of 0 (already stopped) or an unparsable value falls back to 1,
  # so the controller is always running again after this function.
  [[ "${replicas}" =~ ^[1-9][0-9]*$ ]] || replicas=1

  warn "Stopping kustomize-controller for final Flux suspend reassertion."
  run kubectl -n flux-system scale deployment kustomize-controller --replicas=0
  # Best effort: a timeout is reported by the wait helper and must not keep
  # the controller stopped.
  wait_for_kustomize_controller_scaled_down || true

  # Patch failures are downgraded to warnings so that the scale-up below is
  # always reached; the verification at the end decides success or failure.
  if [[ "${RECOVERY_FLUX_SUSPEND_BOOTSTRAP_KUSTOMIZATION}" == "1" || "${RECOVERY_FLUX_SUSPEND_BOOTSTRAP_KUSTOMIZATION}" == "true" ]]; then
    patch_kustomization_suspend flux-system true \
      || warn "Failed to patch flux-system Kustomization suspend during controller-stop finalization."
  fi
  patch_recovery_optional_flux_suspend_without_snapshot true \
    || warn "Failed to patch optional Flux suspend state during controller-stop finalization."

  run kubectl -n flux-system scale deployment kustomize-controller --replicas="${replicas}"
  kubectl -n flux-system rollout status deployment/kustomize-controller --timeout=2m || warn "kustomize-controller did not become Ready after final Flux suspend reassertion."

  # Give the restarted controller time to act before checking the result.
  sleep "${RECOVERY_FLUX_SUSPEND_VERIFY_SLEEP_SECONDS}"
  unsuspended="$(recovery_flux_unsuspended_list | paste -sd, -)"
  if [[ -n "${unsuspended}" ]]; then
    warn "Flux suspend state is still not stable after controller-stop finalization: ${unsuspended}"
    return 1
  fi
  log "recovery-flux-suspend=verified-controller-stop"
}
stabilize_recovery_flux_suspend() {
[[ "${EXECUTE}" -eq 1 ]] || return 0
@@ -1061,7 +1109,7 @@ stabilize_recovery_flux_suspend() {
unsuspended="$(recovery_flux_unsuspended_list | paste -sd, -)"
if [[ -n "${unsuspended}" ]]; then
warn "Flux suspend state is still not stable after verification attempts: ${unsuspended}"
return 1
force_recovery_flux_suspend_with_controller_stop
fi
}
@@ -1522,7 +1570,6 @@ resume_deadlock_automation_after_core_recovery() {
restart_kustomize_controller_for_critical_thaw
annotate_flux_kustomizations "${RECOVERY_FLUX_CRITICAL_KUSTOMIZATIONS}" || true
stabilize_recovery_flux_suspend || true
restart_kustomize_controller_for_critical_thaw
mark_checkpoint longhorn_unlock_automation_resumed
}