recovery(ananke): finalize flux holds without races
This commit is contained in:
parent
e893af2a55
commit
32681728c0
@ -1039,6 +1039,54 @@ recovery_flux_unsuspended_list() {
|
||||
done
|
||||
}
|
||||
|
||||
# Poll until no kustomize-controller pods are listed in the flux-system
# namespace.
#
# Arguments (both optional; defaults match the previous hard-coded values):
#   $1  timeout in seconds            (default: 90)
#   $2  poll interval in seconds      (default: 2)
#
# Returns:
#   0  immediately when EXECUTE is not 1, or as soon as a *successful* pod
#      query returns an empty list.
#   1  when the timeout expires (a warning is emitted first).
wait_for_kustomize_controller_scaled_down() {
  [[ "${EXECUTE}" -eq 1 ]] || return 0

  local timeout="${1:-90}" interval="${2:-2}"
  local deadline pods last_query_ok=0
  deadline=$((SECONDS + timeout))

  while (( SECONDS < deadline )); do
    # Only trust an empty result when kubectl itself succeeded. A failed
    # query (API unreachable, auth error, ...) also produces empty output
    # and must not be mistaken for "all pods are gone".
    if pods="$(kubectl -n flux-system get pods -l app=kustomize-controller -o jsonpath='{range .items[*]}{.metadata.name}{"\n"}{end}' 2>/dev/null)"; then
      last_query_ok=1
      if [[ -z "${pods//[[:space:]]/}" ]]; then
        return 0
      fi
    else
      last_query_ok=0
    fi
    sleep "${interval}"
  done

  if (( last_query_ok == 0 )); then
    warn "Could not list kustomize-controller pods; unable to confirm they terminated."
  fi
  warn "Timed out waiting for kustomize-controller pods to terminate before final Flux suspend reassertion."
  return 1
}
|
||||
|
||||
# Last-resort Flux suspend finalization: stop kustomize-controller, re-apply
# the suspend patches while it is down, start it again, then verify that
# nothing is reported as unsuspended.
#
# Reads:
#   EXECUTE                                        no-op (return 0) unless 1
#   RECOVERY_FLUX_SUSPEND_BOOTSTRAP_KUSTOMIZATION  "1"/"true" also suspends
#                                                  the flux-system Kustomization
#   RECOVERY_FLUX_SUSPEND_VERIFY_SLEEP_SECONDS     settle time before verifying
#
# Returns:
#   1  if the kustomize-controller deployment cannot be read, or if
#      recovery_flux_unsuspended_list still reports entries afterwards.
#   Otherwise the status of the final log call.
force_recovery_flux_suspend_with_controller_stop() {
  [[ "${EXECUTE}" -eq 1 ]] || return 0

  # Record the configured replica count so the controller is restored to it
  # instead of being forced to a hard-coded 1. The same query doubles as the
  # existence check for the deployment.
  local replicas
  if ! replicas="$(kubectl -n flux-system get deployment kustomize-controller -o jsonpath='{.spec.replicas}' 2>/dev/null)"; then
    warn "kustomize-controller deployment not found; cannot use controller-stop Flux suspend finalization."
    return 1
  fi
  # Empty, non-numeric or 0 (already scaled down) falls back to 1, which is
  # what this function always restored to before.
  [[ "${replicas}" =~ ^[1-9][0-9]*$ ]] || replicas=1

  warn "Stopping kustomize-controller for final Flux suspend reassertion."
  run kubectl -n flux-system scale deployment kustomize-controller --replicas=0
  # Best effort: the helper warns on timeout; patches are applied regardless.
  wait_for_kustomize_controller_scaled_down || true

  # Patch failures must not prevent the controller from being started again,
  # so they are reported and execution falls through to the restore step.
  # The verification below decides the final outcome.
  if [[ "${RECOVERY_FLUX_SUSPEND_BOOTSTRAP_KUSTOMIZATION}" == "1" || "${RECOVERY_FLUX_SUSPEND_BOOTSTRAP_KUSTOMIZATION}" == "true" ]]; then
    patch_kustomization_suspend flux-system true \
      || warn "Failed to suspend flux-system Kustomization during controller-stop finalization."
  fi
  patch_recovery_optional_flux_suspend_without_snapshot true \
    || warn "Failed to reassert optional Flux suspends during controller-stop finalization."

  run kubectl -n flux-system scale deployment kustomize-controller --replicas="${replicas}"
  kubectl -n flux-system rollout status deployment/kustomize-controller --timeout=2m || warn "kustomize-controller did not become Ready after final Flux suspend reassertion."
  sleep "${RECOVERY_FLUX_SUSPEND_VERIFY_SLEEP_SECONDS}"

  local unsuspended
  unsuspended="$(recovery_flux_unsuspended_list | paste -sd, -)"
  if [[ -n "${unsuspended}" ]]; then
    warn "Flux suspend state is still not stable after controller-stop finalization: ${unsuspended}"
    return 1
  fi

  log "recovery-flux-suspend=verified-controller-stop"
}
|
||||
|
||||
stabilize_recovery_flux_suspend() {
|
||||
[[ "${EXECUTE}" -eq 1 ]] || return 0
|
||||
|
||||
@ -1061,7 +1109,7 @@ stabilize_recovery_flux_suspend() {
|
||||
unsuspended="$(recovery_flux_unsuspended_list | paste -sd, -)"
|
||||
if [[ -n "${unsuspended}" ]]; then
|
||||
warn "Flux suspend state is still not stable after verification attempts: ${unsuspended}"
|
||||
return 1
|
||||
force_recovery_flux_suspend_with_controller_stop
|
||||
fi
|
||||
}
|
||||
|
||||
@ -1522,7 +1570,6 @@ resume_deadlock_automation_after_core_recovery() {
|
||||
restart_kustomize_controller_for_critical_thaw
|
||||
annotate_flux_kustomizations "${RECOVERY_FLUX_CRITICAL_KUSTOMIZATIONS}" || true
|
||||
stabilize_recovery_flux_suspend || true
|
||||
restart_kustomize_controller_for_critical_thaw
|
||||
mark_checkpoint longhorn_unlock_automation_resumed
|
||||
}
|
||||
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user