recovery(ananke): finalize flux holds without races
This commit is contained in:
parent
e893af2a55
commit
32681728c0
@ -1039,6 +1039,54 @@ recovery_flux_unsuspended_list() {
|
|||||||
done
|
done
|
||||||
}
|
}
|
||||||
|
|
||||||
|
# Poll until no kustomize-controller pods are listed in the flux-system
# namespace, or until the timeout elapses.
#
# Arguments:
#   $1 - optional timeout in seconds (default: 90).
# Globals:
#   EXECUTE - the function is a no-op (returns 0) unless this equals 1.
# Returns:
#   0 when EXECUTE is not 1, or once a *successful* pod listing is empty;
#   1 after emitting a warning when the timeout elapses first.
wait_for_kustomize_controller_scaled_down() {
  [[ "${EXECUTE}" -eq 1 ]] || return 0

  local timeout_seconds="${1:-90}"
  local deadline pods
  deadline=$((SECONDS + timeout_seconds))
  while (( SECONDS < deadline )); do
    # Only trust an empty listing when kubectl itself succeeded. A failed
    # call (API unreachable, auth error, ...) also yields empty output and
    # must not be mistaken for "all pods are gone"; keep polling instead.
    if pods="$(kubectl -n flux-system get pods -l app=kustomize-controller -o jsonpath='{range .items[*]}{.metadata.name}{"\n"}{end}' 2>/dev/null)" \
      && [[ -z "${pods//[[:space:]]/}" ]]; then
      return 0
    fi
    sleep 2
  done

  warn "Timed out waiting for kustomize-controller pods to terminate before final Flux suspend reassertion."
  return 1
}
|
||||||
|
|
||||||
|
# Last-resort Flux suspend finalization: stop kustomize-controller, reapply
# the suspend patches while it is down, start it again, then verify that
# nothing is reported as unsuspended.
#
# Globals:
#   EXECUTE - the function is a no-op (returns 0) unless this equals 1.
#   RECOVERY_FLUX_SUSPEND_BOOTSTRAP_KUSTOMIZATION - when "1" or "true", the
#     flux-system Kustomization is also patched to suspended.
#   RECOVERY_FLUX_SUSPEND_VERIFY_SLEEP_SECONDS - pause before verification.
# Returns:
#   0 when EXECUTE is not 1;
#   1 when the kustomize-controller deployment cannot be read, or when
#     recovery_flux_unsuspended_list still reports entries afterwards;
#   otherwise the status of the final log call.
force_recovery_flux_suspend_with_controller_stop() {
  [[ "${EXECUTE}" -eq 1 ]] || return 0

  # A single query both proves the deployment exists and records how many
  # replicas it is configured for, so the same count can be restored later
  # rather than forcing it to 1.
  local desired_replicas
  if ! desired_replicas="$(kubectl -n flux-system get deployment kustomize-controller -o jsonpath='{.spec.replicas}' 2>/dev/null)"; then
    warn "kustomize-controller deployment not found; cannot use controller-stop Flux suspend finalization."
    return 1
  fi
  # Fall back to one replica when the value is missing, zero (e.g. left
  # scaled down by an earlier interrupted run) or not a plain integer.
  if [[ ! "${desired_replicas}" =~ ^[1-9][0-9]*$ ]]; then
    desired_replicas=1
  fi

  warn "Stopping kustomize-controller for final Flux suspend reassertion."
  run kubectl -n flux-system scale deployment kustomize-controller --replicas=0
  # Best effort: a timeout here is already reported by the helper, and the
  # patches below are still attempted.
  wait_for_kustomize_controller_scaled_down || true

  if [[ "${RECOVERY_FLUX_SUSPEND_BOOTSTRAP_KUSTOMIZATION}" == "1" || "${RECOVERY_FLUX_SUSPEND_BOOTSTRAP_KUSTOMIZATION}" == "true" ]]; then
    patch_kustomization_suspend flux-system true
  fi
  # NOTE(review): presumably reapplies suspend=true to the optional recovery
  # Kustomizations without recording a snapshot - confirm against the helper.
  patch_recovery_optional_flux_suspend_without_snapshot true

  run kubectl -n flux-system scale deployment kustomize-controller --replicas="${desired_replicas}"
  kubectl -n flux-system rollout status deployment/kustomize-controller --timeout=2m || warn "kustomize-controller did not become Ready after final Flux suspend reassertion."
  sleep "${RECOVERY_FLUX_SUSPEND_VERIFY_SLEEP_SECONDS}"

  local unsuspended
  unsuspended="$(recovery_flux_unsuspended_list | paste -sd, -)"
  if [[ -n "${unsuspended}" ]]; then
    warn "Flux suspend state is still not stable after controller-stop finalization: ${unsuspended}"
    return 1
  fi

  log "recovery-flux-suspend=verified-controller-stop"
}
|
||||||
|
|
||||||
stabilize_recovery_flux_suspend() {
|
stabilize_recovery_flux_suspend() {
|
||||||
[[ "${EXECUTE}" -eq 1 ]] || return 0
|
[[ "${EXECUTE}" -eq 1 ]] || return 0
|
||||||
|
|
||||||
@ -1061,7 +1109,7 @@ stabilize_recovery_flux_suspend() {
|
|||||||
unsuspended="$(recovery_flux_unsuspended_list | paste -sd, -)"
|
unsuspended="$(recovery_flux_unsuspended_list | paste -sd, -)"
|
||||||
if [[ -n "${unsuspended}" ]]; then
|
if [[ -n "${unsuspended}" ]]; then
|
||||||
warn "Flux suspend state is still not stable after verification attempts: ${unsuspended}"
|
warn "Flux suspend state is still not stable after verification attempts: ${unsuspended}"
|
||||||
return 1
|
force_recovery_flux_suspend_with_controller_stop
|
||||||
fi
|
fi
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -1522,7 +1570,6 @@ resume_deadlock_automation_after_core_recovery() {
|
|||||||
restart_kustomize_controller_for_critical_thaw
|
restart_kustomize_controller_for_critical_thaw
|
||||||
annotate_flux_kustomizations "${RECOVERY_FLUX_CRITICAL_KUSTOMIZATIONS}" || true
|
annotate_flux_kustomizations "${RECOVERY_FLUX_CRITICAL_KUSTOMIZATIONS}" || true
|
||||||
stabilize_recovery_flux_suspend || true
|
stabilize_recovery_flux_suspend || true
|
||||||
restart_kustomize_controller_for_critical_thaw
|
|
||||||
mark_checkpoint longhorn_unlock_automation_resumed
|
mark_checkpoint longhorn_unlock_automation_resumed
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user