recovery(ananke): verify flux suspension during thaw
This commit is contained in:
parent
bb07f1598f
commit
e893af2a55
@ -141,6 +141,8 @@ RECOVERY_FLUX_CRITICAL_KUSTOMIZATIONS="${RECOVERY_FLUX_CRITICAL_KUSTOMIZATIONS:-
|
|||||||
RECOVERY_FLUX_OPTIONAL_SNAPSHOT_FILE="${RECOVERY_FLUX_OPTIONAL_SNAPSHOT_FILE:-${HOME}/${STATE_SUBDIR:-.local/share/ananke}/longhorn_unlock_optional_flux.tsv}"
|
RECOVERY_FLUX_OPTIONAL_SNAPSHOT_FILE="${RECOVERY_FLUX_OPTIONAL_SNAPSHOT_FILE:-${HOME}/${STATE_SUBDIR:-.local/share/ananke}/longhorn_unlock_optional_flux.tsv}"
|
||||||
RECOVERY_FLUX_RESTART_KUSTOMIZE_CONTROLLER="${RECOVERY_FLUX_RESTART_KUSTOMIZE_CONTROLLER:-1}"
|
RECOVERY_FLUX_RESTART_KUSTOMIZE_CONTROLLER="${RECOVERY_FLUX_RESTART_KUSTOMIZE_CONTROLLER:-1}"
|
||||||
RECOVERY_FLUX_SUSPEND_BOOTSTRAP_KUSTOMIZATION="${RECOVERY_FLUX_SUSPEND_BOOTSTRAP_KUSTOMIZATION:-1}"
|
RECOVERY_FLUX_SUSPEND_BOOTSTRAP_KUSTOMIZATION="${RECOVERY_FLUX_SUSPEND_BOOTSTRAP_KUSTOMIZATION:-1}"
|
||||||
|
# Upper bound on the reassert-and-verify rounds run by
# stabilize_recovery_flux_suspend; overridable from the environment.
: "${RECOVERY_FLUX_SUSPEND_VERIFY_ATTEMPTS:=6}"
|
||||||
|
# Value handed to `sleep` between reasserting the suspend flags and re-reading
# them in stabilize_recovery_flux_suspend; overridable from the environment.
: "${RECOVERY_FLUX_SUSPEND_VERIFY_SLEEP_SECONDS:=10}"
|
||||||
STARTUP_SERVICE_CHECK_TIMEOUT_SECONDS="${STARTUP_SERVICE_CHECK_TIMEOUT_SECONDS:-10}"
|
STARTUP_SERVICE_CHECK_TIMEOUT_SECONDS="${STARTUP_SERVICE_CHECK_TIMEOUT_SECONDS:-10}"
|
||||||
STARTUP_SERVICE_CHECKLIST="${STARTUP_SERVICE_CHECKLIST:-}"
|
STARTUP_SERVICE_CHECKLIST="${STARTUP_SERVICE_CHECKLIST:-}"
|
||||||
STARTUP_INCLUDE_INGRESS_CHECKS="${STARTUP_INCLUDE_INGRESS_CHECKS:-1}"
|
STARTUP_INCLUDE_INGRESS_CHECKS="${STARTUP_INCLUDE_INGRESS_CHECKS:-1}"
|
||||||
@ -1008,6 +1010,61 @@ patch_recovery_optional_flux_suspend() {
|
|||||||
done < <(csv_each "${RECOVERY_FLUX_OPTIONAL_KUSTOMIZATIONS}")
|
done < <(csv_each "${RECOVERY_FLUX_OPTIONAL_KUSTOMIZATIONS}")
|
||||||
}
|
}
|
||||||
|
|
||||||
|
#######################################
# Sets the suspend flag on every kustomization named in
# RECOVERY_FLUX_OPTIONAL_KUSTOMIZATIONS by calling patch_kustomization_suspend
# once per name. This function itself never reads or writes the snapshot file
# (NOTE(review): presumably that is what distinguishes it from
# patch_recovery_optional_flux_suspend - confirm against that function).
# Globals:   RECOVERY_FLUX_OPTIONAL_KUSTOMIZATIONS (read)
# Arguments: $1 - suspend value forwarded to patch_kustomization_suspend
# Returns:   0 if every patch succeeded, 1 if at least one patch failed
#######################################
patch_recovery_optional_flux_suspend_without_snapshot() {
  local value="$1"
  local name
  local rc=0
  while IFS= read -r name; do
    # A blank entry (e.g. from a trailing comma) is not a kustomization name.
    [[ -n "${name}" ]] || continue
    # Keep going after a failure so one bad name cannot leave the remaining
    # kustomizations unpatched; remember the failure for the final status.
    patch_kustomization_suspend "${name}" "${value}" || rc=1
  done < <(csv_each "${RECOVERY_FLUX_OPTIONAL_KUSTOMIZATIONS}")
  return "${rc}"
}
|
||||||
|
|
||||||
|
#######################################
# Prints, one per line, the recovery-managed Flux kustomizations whose
# .spec.suspend is not "true".
# The candidate set is "flux-system" (only when
# RECOVERY_FLUX_SUSPEND_BOOTSTRAP_KUSTOMIZATION is "1" or "true") followed by
# every entry of RECOVERY_FLUX_OPTIONAL_KUSTOMIZATIONS.
# Kustomizations that do not exist in the flux-system namespace are skipped.
# A kustomization that cannot be read (kubectl exits non-zero) is reported as
# not suspended, because its state could not be verified.
# Globals:   RECOVERY_FLUX_SUSPEND_BOOTSTRAP_KUSTOMIZATION (read)
#            RECOVERY_FLUX_OPTIONAL_KUSTOMIZATIONS (read)
# Outputs:   names on stdout; diagnostics on stderr
# Returns:   0 always
#######################################
recovery_flux_unsuspended_list() {
  local names=()
  local name state

  case "${RECOVERY_FLUX_SUSPEND_BOOTSTRAP_KUSTOMIZATION}" in
    1 | true) names+=("flux-system") ;;
  esac
  while IFS= read -r name; do
    [[ -n "${name}" ]] || continue
    names+=("${name}")
  done < <(csv_each "${RECOVERY_FLUX_OPTIONAL_KUSTOMIZATIONS}")

  # Nothing to check. Returning here also avoids expanding an empty array,
  # which aborts under `set -u` on bash older than 4.4.
  ((${#names[@]} > 0)) || return 0

  for name in "${names[@]}"; do
    # One query answers both "does it exist" and "is it suspended":
    # --ignore-not-found yields empty output with status 0 for a missing
    # object, while an existing one always prints at least "<name>=".
    if ! state="$(kubectl -n flux-system get kustomization "${name}" \
      --ignore-not-found -o jsonpath='{.metadata.name}={.spec.suspend}' 2>/dev/null)"; then
      # stdout carries the result list, so force the diagnostic to stderr.
      warn "Could not read Flux kustomization ${name}; treating it as not suspended" >&2
      printf '%s\n' "${name}"
      continue
    fi
    [[ -n "${state}" ]] || continue
    if [[ "${state#*=}" != "true" ]]; then
      printf '%s\n' "${name}"
    fi
  done
  return 0
}
|
||||||
|
|
||||||
|
#######################################
# Repeatedly re-applies suspend=true to the recovery-managed Flux
# kustomizations and checks that the flag is still set afterwards.
# Each round: patch flux-system (only when
# RECOVERY_FLUX_SUSPEND_BOOTSTRAP_KUSTOMIZATION is "1" or "true"), patch the
# optional kustomizations, sleep, then read the state back via
# recovery_flux_unsuspended_list. The first round that finds nothing
# unsuspended ends the function successfully.
# Does nothing unless EXECUTE equals 1.
# Globals:   EXECUTE (read)
#            RECOVERY_FLUX_SUSPEND_BOOTSTRAP_KUSTOMIZATION (read)
#            RECOVERY_FLUX_SUSPEND_VERIFY_ATTEMPTS (read)
#            RECOVERY_FLUX_SUSPEND_VERIFY_SLEEP_SECONDS (read)
# Returns:   0 when skipped or when no kustomization is left unsuspended,
#            1 when some are still unsuspended after all rounds
#######################################
stabilize_recovery_flux_suspend() {
  [[ "${EXECUTE}" -eq 1 ]] || return 0

  local attempts="${RECOVERY_FLUX_SUSPEND_VERIFY_ATTEMPTS}"
  if [[ ! "${attempts}" =~ ^[0-9]+$ ]]; then
    # A malformed override must not silently disable the whole reassert loop.
    warn "RECOVERY_FLUX_SUSPEND_VERIFY_ATTEMPTS='${attempts}' is not a non-negative integer; falling back to 6"
    attempts=6
  fi
  # Force base 10 so a value such as "08" is not rejected as invalid octal.
  attempts=$((10#${attempts}))

  local suspend_bootstrap=0
  case "${RECOVERY_FLUX_SUSPEND_BOOTSTRAP_KUSTOMIZATION}" in
    1 | true) suspend_bootstrap=1 ;;
  esac

  local attempt unsuspended
  for ((attempt = 1; attempt <= attempts; attempt++)); do
    if ((suspend_bootstrap)); then
      patch_kustomization_suspend flux-system true
    fi
    patch_recovery_optional_flux_suspend_without_snapshot true
    # Wait before reading back so a suspend flag that gets reverted in the
    # meantime is caught by the check below.
    sleep "${RECOVERY_FLUX_SUSPEND_VERIFY_SLEEP_SECONDS}"

    unsuspended="$(recovery_flux_unsuspended_list | paste -sd, -)"
    if [[ -z "${unsuspended}" ]]; then
      log "recovery-flux-suspend=verified attempts=${attempt}"
      return 0
    fi
    warn "Flux suspend state was overwritten during recovery thaw; reasserting attempt ${attempt}/${attempts}: ${unsuspended}"
  done

  # Final look after the last round (and the only look when attempts is 0).
  unsuspended="$(recovery_flux_unsuspended_list | paste -sd, -)"
  if [[ -n "${unsuspended}" ]]; then
    warn "Flux suspend state is still not stable after verification attempts: ${unsuspended}"
    return 1
  fi
  return 0
}
|
||||||
|
|
||||||
restore_recovery_optional_flux_suspend() {
|
restore_recovery_optional_flux_suspend() {
|
||||||
[[ -f "${RECOVERY_FLUX_OPTIONAL_SNAPSHOT_FILE}" ]] || return 0
|
[[ -f "${RECOVERY_FLUX_OPTIONAL_SNAPSHOT_FILE}" ]] || return 0
|
||||||
local name suspend
|
local name suspend
|
||||||
@ -1464,6 +1521,8 @@ resume_deadlock_automation_after_core_recovery() {
|
|||||||
patch_recovery_optional_flux_suspend true
|
patch_recovery_optional_flux_suspend true
|
||||||
restart_kustomize_controller_for_critical_thaw
|
restart_kustomize_controller_for_critical_thaw
|
||||||
annotate_flux_kustomizations "${RECOVERY_FLUX_CRITICAL_KUSTOMIZATIONS}" || true
|
annotate_flux_kustomizations "${RECOVERY_FLUX_CRITICAL_KUSTOMIZATIONS}" || true
|
||||||
|
stabilize_recovery_flux_suspend || true
|
||||||
|
restart_kustomize_controller_for_critical_thaw
|
||||||
mark_checkpoint longhorn_unlock_automation_resumed
|
mark_checkpoint longhorn_unlock_automation_resumed
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user