recovery(ananke): keep flux holds and place metrics on longhorn nodes
This commit is contained in:
parent
6b777b8c74
commit
a9ddc80e36
@ -16,7 +16,7 @@ fi
|
|||||||
usage() {
|
usage() {
|
||||||
cat <<USAGE
|
cat <<USAGE
|
||||||
Usage:
|
Usage:
|
||||||
scripts/cluster_power_recovery.sh <prepare|status|bootstrap-seed|harbor-seed|longhorn-seed|longhorn-unlock|shutdown|startup> [options]
|
scripts/cluster_power_recovery.sh <prepare|status|bootstrap-seed|harbor-seed|longhorn-seed|longhorn-unlock|flux-hold|shutdown|startup> [options]
|
||||||
|
|
||||||
Options:
|
Options:
|
||||||
--execute Actually run commands (default is dry-run)
|
--execute Actually run commands (default is dry-run)
|
||||||
@ -81,6 +81,7 @@ Examples:
|
|||||||
scripts/cluster_power_recovery.sh bootstrap-seed --execute
|
scripts/cluster_power_recovery.sh bootstrap-seed --execute
|
||||||
scripts/cluster_power_recovery.sh harbor-seed --execute
|
scripts/cluster_power_recovery.sh harbor-seed --execute
|
||||||
scripts/cluster_power_recovery.sh longhorn-unlock --execute
|
scripts/cluster_power_recovery.sh longhorn-unlock --execute
|
||||||
|
scripts/cluster_power_recovery.sh flux-hold --execute
|
||||||
scripts/cluster_power_recovery.sh status
|
scripts/cluster_power_recovery.sh status
|
||||||
scripts/cluster_power_recovery.sh shutdown --execute
|
scripts/cluster_power_recovery.sh shutdown --execute
|
||||||
scripts/cluster_power_recovery.sh startup --execute --force-flux-branch main
|
scripts/cluster_power_recovery.sh startup --execute --force-flux-branch main
|
||||||
@ -95,7 +96,7 @@ fi
|
|||||||
shift || true
|
shift || true
|
||||||
|
|
||||||
case "${MODE}" in
|
case "${MODE}" in
|
||||||
prepare|status|bootstrap-seed|harbor-seed|longhorn-seed|longhorn-unlock|shutdown|startup) ;;
|
prepare|status|bootstrap-seed|harbor-seed|longhorn-seed|longhorn-unlock|flux-hold|shutdown|startup) ;;
|
||||||
*)
|
*)
|
||||||
echo "Unknown mode: ${MODE}" >&2
|
echo "Unknown mode: ${MODE}" >&2
|
||||||
usage
|
usage
|
||||||
@ -147,6 +148,14 @@ RECOVERY_FLUX_ROOT_APPLY_TIMEOUT="${RECOVERY_FLUX_ROOT_APPLY_TIMEOUT:-15m}"
|
|||||||
RECOVERY_FLUX_SUSPEND_VERIFY_ATTEMPTS="${RECOVERY_FLUX_SUSPEND_VERIFY_ATTEMPTS:-6}"
|
RECOVERY_FLUX_SUSPEND_VERIFY_ATTEMPTS="${RECOVERY_FLUX_SUSPEND_VERIFY_ATTEMPTS:-6}"
|
||||||
RECOVERY_FLUX_SUSPEND_VERIFY_SLEEP_SECONDS="${RECOVERY_FLUX_SUSPEND_VERIFY_SLEEP_SECONDS:-10}"
|
RECOVERY_FLUX_SUSPEND_VERIFY_SLEEP_SECONDS="${RECOVERY_FLUX_SUSPEND_VERIFY_SLEEP_SECONDS:-10}"
|
||||||
RECOVERY_FLUX_FINAL_RESTART_KUSTOMIZE_CONTROLLER="${RECOVERY_FLUX_FINAL_RESTART_KUSTOMIZE_CONTROLLER:-0}"
|
RECOVERY_FLUX_FINAL_RESTART_KUSTOMIZE_CONTROLLER="${RECOVERY_FLUX_FINAL_RESTART_KUSTOMIZE_CONTROLLER:-0}"
|
||||||
|
RECOVERY_FLUX_PATCH_VERIFY_ATTEMPTS="${RECOVERY_FLUX_PATCH_VERIFY_ATTEMPTS:-3}"
|
||||||
|
RECOVERY_FLUX_PATCH_VERIFY_SLEEP_SECONDS="${RECOVERY_FLUX_PATCH_VERIFY_SLEEP_SECONDS:-1}"
|
||||||
|
RECOVERY_FLUX_FINAL_STABILITY_SECONDS="${RECOVERY_FLUX_FINAL_STABILITY_SECONDS:-45}"
|
||||||
|
RECOVERY_FLUX_FINAL_STABILITY_POLL_SECONDS="${RECOVERY_FLUX_FINAL_STABILITY_POLL_SECONDS:-5}"
|
||||||
|
RECOVERY_FLUX_FINAL_STABILITY_TIMEOUT_SECONDS="${RECOVERY_FLUX_FINAL_STABILITY_TIMEOUT_SECONDS:-300}"
|
||||||
|
RECOVERY_FLUX_FINAL_STOP_AUX_CONTROLLERS="${RECOVERY_FLUX_FINAL_STOP_AUX_CONTROLLERS:-1}"
|
||||||
|
RECOVERY_FLUX_AUX_CONTROLLERS="${RECOVERY_FLUX_AUX_CONTROLLERS:-image-automation-controller,image-reflector-controller,notification-controller}"
|
||||||
|
RECOVERY_KUBECTL_FIELD_MANAGER="${RECOVERY_KUBECTL_FIELD_MANAGER:-ananke-recovery-hold}"
|
||||||
STARTUP_SERVICE_CHECK_TIMEOUT_SECONDS="${STARTUP_SERVICE_CHECK_TIMEOUT_SECONDS:-10}"
|
STARTUP_SERVICE_CHECK_TIMEOUT_SECONDS="${STARTUP_SERVICE_CHECK_TIMEOUT_SECONDS:-10}"
|
||||||
STARTUP_SERVICE_CHECKLIST="${STARTUP_SERVICE_CHECKLIST:-}"
|
STARTUP_SERVICE_CHECKLIST="${STARTUP_SERVICE_CHECKLIST:-}"
|
||||||
STARTUP_INCLUDE_INGRESS_CHECKS="${STARTUP_INCLUDE_INGRESS_CHECKS:-1}"
|
STARTUP_INCLUDE_INGRESS_CHECKS="${STARTUP_INCLUDE_INGRESS_CHECKS:-1}"
|
||||||
@ -953,24 +962,44 @@ patch_flux_suspend_all() {
|
|||||||
|
|
||||||
while IFS= read -r k; do
|
while IFS= read -r k; do
|
||||||
[[ -z "${k}" ]] && continue
|
[[ -z "${k}" ]] && continue
|
||||||
run kubectl -n flux-system patch kustomization "${k}" --type=merge -p "${patch}"
|
patch_kustomization_suspend "${k}" "${value}"
|
||||||
done <<< "${ks_list}"
|
done <<< "${ks_list}"
|
||||||
|
|
||||||
while IFS= read -r hr; do
|
while IFS= read -r hr; do
|
||||||
[[ -z "${hr}" ]] && continue
|
[[ -z "${hr}" ]] && continue
|
||||||
local ns="${hr%%/*}"
|
local ns="${hr%%/*}"
|
||||||
local name="${hr##*/}"
|
local name="${hr##*/}"
|
||||||
run kubectl -n "${ns}" patch helmrelease "${name}" --type=merge -p "${patch}"
|
run kubectl -n "${ns}" patch helmrelease "${name}" --field-manager="${RECOVERY_KUBECTL_FIELD_MANAGER}" --type=merge -p "${patch}"
|
||||||
done <<< "${hr_list}"
|
done <<< "${hr_list}"
|
||||||
}
|
}
|
||||||
|
|
||||||
|
apply_kustomization_suspend_field() {
|
||||||
|
local name="$1"
|
||||||
|
local value="$2"
|
||||||
|
|
||||||
|
if [[ "${EXECUTE}" -eq 1 ]]; then
|
||||||
|
log "EXEC: kubectl apply --server-side --force-conflicts --field-manager=${RECOVERY_KUBECTL_FIELD_MANAGER} kustomization/${name} suspend=${value}"
|
||||||
|
printf 'apiVersion: kustomize.toolkit.fluxcd.io/v1\nkind: Kustomization\nmetadata:\n name: %s\n namespace: flux-system\nspec:\n suspend: %s\n' "${name}" "${value}" \
|
||||||
|
| kubectl apply --server-side --force-conflicts --field-manager="${RECOVERY_KUBECTL_FIELD_MANAGER}" -f -
|
||||||
|
else
|
||||||
|
log "DRY-RUN: kubectl apply --server-side --force-conflicts --field-manager=${RECOVERY_KUBECTL_FIELD_MANAGER} kustomization/${name} suspend=${value}"
|
||||||
|
fi
|
||||||
|
}
|
||||||
|
|
||||||
patch_kustomization_suspend() {
|
patch_kustomization_suspend() {
|
||||||
local name="$1"
|
local name="$1"
|
||||||
local value="$2"
|
local value="$2"
|
||||||
local patch
|
|
||||||
patch=$(printf '{"spec":{"suspend":%s}}' "${value}")
|
|
||||||
if kubectl -n flux-system get kustomization "${name}" >/dev/null 2>&1; then
|
if kubectl -n flux-system get kustomization "${name}" >/dev/null 2>&1; then
|
||||||
run kubectl -n flux-system patch kustomization "${name}" --type=merge -p "${patch}"
|
apply_kustomization_suspend_field "${name}" "${value}"
|
||||||
|
if [[ "${EXECUTE}" -eq 1 && "${value}" == "true" ]]; then
|
||||||
|
local attempt observed
|
||||||
|
for attempt in $(seq 1 "${RECOVERY_FLUX_PATCH_VERIFY_ATTEMPTS}"); do
|
||||||
|
observed="$(kubectl -n flux-system get kustomization "${name}" -o jsonpath='{.spec.suspend}' 2>/dev/null || true)"
|
||||||
|
[[ "${observed}" == "true" ]] && return 0
|
||||||
|
sleep "${RECOVERY_FLUX_PATCH_VERIFY_SLEEP_SECONDS}"
|
||||||
|
done
|
||||||
|
warn "Flux Kustomization ${name} suspend=true did not verify after patch; observed=${observed:-missing}."
|
||||||
|
fi
|
||||||
else
|
else
|
||||||
warn "Flux Kustomization ${name} not found; skipping suspend=${value}."
|
warn "Flux Kustomization ${name} not found; skipping suspend=${value}."
|
||||||
fi
|
fi
|
||||||
@ -1062,6 +1091,61 @@ recovery_flux_unsuspended_list() {
|
|||||||
done
|
done
|
||||||
}
|
}
|
||||||
|
|
||||||
|
reassert_recovery_flux_suspend_hold() {
|
||||||
|
if [[ "${RECOVERY_FLUX_SUSPEND_BOOTSTRAP_KUSTOMIZATION}" == "1" || "${RECOVERY_FLUX_SUSPEND_BOOTSTRAP_KUSTOMIZATION}" == "true" ]]; then
|
||||||
|
patch_kustomization_suspend flux-system true
|
||||||
|
fi
|
||||||
|
patch_recovery_optional_flux_suspend_without_snapshot true
|
||||||
|
}
|
||||||
|
|
||||||
|
verify_recovery_flux_suspend_stable_window() {
|
||||||
|
[[ "${EXECUTE}" -eq 1 ]] || return 0
|
||||||
|
|
||||||
|
local deadline stable_since stable_for unsuspended
|
||||||
|
deadline=$((SECONDS + RECOVERY_FLUX_FINAL_STABILITY_TIMEOUT_SECONDS))
|
||||||
|
stable_since=0
|
||||||
|
|
||||||
|
reassert_recovery_flux_suspend_hold
|
||||||
|
|
||||||
|
while (( SECONDS < deadline )); do
|
||||||
|
unsuspended="$(recovery_flux_unsuspended_list | paste -sd, -)"
|
||||||
|
if [[ -n "${unsuspended}" ]]; then
|
||||||
|
warn "Flux suspend hold was overwritten; reasserting recovery hold: ${unsuspended}"
|
||||||
|
reassert_recovery_flux_suspend_hold
|
||||||
|
stable_since=0
|
||||||
|
sleep "${RECOVERY_FLUX_FINAL_STABILITY_POLL_SECONDS}"
|
||||||
|
continue
|
||||||
|
fi
|
||||||
|
|
||||||
|
if (( stable_since == 0 )); then
|
||||||
|
stable_since="${SECONDS}"
|
||||||
|
fi
|
||||||
|
stable_for=$((SECONDS - stable_since))
|
||||||
|
if (( stable_for >= RECOVERY_FLUX_FINAL_STABILITY_SECONDS )); then
|
||||||
|
log "recovery-flux-suspend=stable seconds=${stable_for}"
|
||||||
|
return 0
|
||||||
|
fi
|
||||||
|
sleep "${RECOVERY_FLUX_FINAL_STABILITY_POLL_SECONDS}"
|
||||||
|
done
|
||||||
|
|
||||||
|
unsuspended="$(recovery_flux_unsuspended_list | paste -sd, -)"
|
||||||
|
warn "Timed out waiting for stable Flux suspend hold after ${RECOVERY_FLUX_FINAL_STABILITY_TIMEOUT_SECONDS}s: ${unsuspended:-none-unsuspended-now}"
|
||||||
|
return 1
|
||||||
|
}
|
||||||
|
|
||||||
|
stop_recovery_flux_aux_controllers() {
|
||||||
|
[[ "${EXECUTE}" -eq 1 ]] || return 0
|
||||||
|
[[ "${RECOVERY_FLUX_FINAL_STOP_AUX_CONTROLLERS}" == "1" || "${RECOVERY_FLUX_FINAL_STOP_AUX_CONTROLLERS}" == "true" ]] || return 0
|
||||||
|
|
||||||
|
local controller
|
||||||
|
while IFS= read -r controller; do
|
||||||
|
if kubectl -n flux-system get deployment "${controller}" >/dev/null 2>&1; then
|
||||||
|
warn "Stopping Flux auxiliary controller ${controller} for recovery hold."
|
||||||
|
run kubectl -n flux-system scale deployment "${controller}" --replicas=0
|
||||||
|
fi
|
||||||
|
done < <(csv_each "${RECOVERY_FLUX_AUX_CONTROLLERS}")
|
||||||
|
}
|
||||||
|
|
||||||
wait_for_kustomize_controller_scaled_down() {
|
wait_for_kustomize_controller_scaled_down() {
|
||||||
[[ "${EXECUTE}" -eq 1 ]] || return 0
|
[[ "${EXECUTE}" -eq 1 ]] || return 0
|
||||||
|
|
||||||
@ -1089,12 +1173,10 @@ force_recovery_flux_suspend_with_controller_stop() {
|
|||||||
|
|
||||||
warn "Stopping kustomize-controller for final Flux suspend reassertion."
|
warn "Stopping kustomize-controller for final Flux suspend reassertion."
|
||||||
run kubectl -n flux-system scale deployment kustomize-controller --replicas=0
|
run kubectl -n flux-system scale deployment kustomize-controller --replicas=0
|
||||||
|
stop_recovery_flux_aux_controllers
|
||||||
wait_for_kustomize_controller_scaled_down || true
|
wait_for_kustomize_controller_scaled_down || true
|
||||||
|
|
||||||
if [[ "${RECOVERY_FLUX_SUSPEND_BOOTSTRAP_KUSTOMIZATION}" == "1" || "${RECOVERY_FLUX_SUSPEND_BOOTSTRAP_KUSTOMIZATION}" == "true" ]]; then
|
reassert_recovery_flux_suspend_hold
|
||||||
patch_kustomization_suspend flux-system true
|
|
||||||
fi
|
|
||||||
patch_recovery_optional_flux_suspend_without_snapshot true
|
|
||||||
|
|
||||||
if [[ "${RECOVERY_FLUX_FINAL_RESTART_KUSTOMIZE_CONTROLLER}" == "1" || "${RECOVERY_FLUX_FINAL_RESTART_KUSTOMIZE_CONTROLLER}" == "true" ]]; then
|
if [[ "${RECOVERY_FLUX_FINAL_RESTART_KUSTOMIZE_CONTROLLER}" == "1" || "${RECOVERY_FLUX_FINAL_RESTART_KUSTOMIZE_CONTROLLER}" == "true" ]]; then
|
||||||
run kubectl -n flux-system scale deployment kustomize-controller --replicas=1
|
run kubectl -n flux-system scale deployment kustomize-controller --replicas=1
|
||||||
@ -1102,19 +1184,9 @@ force_recovery_flux_suspend_with_controller_stop() {
|
|||||||
sleep "${RECOVERY_FLUX_SUSPEND_VERIFY_SLEEP_SECONDS}"
|
sleep "${RECOVERY_FLUX_SUSPEND_VERIFY_SLEEP_SECONDS}"
|
||||||
else
|
else
|
||||||
warn "Leaving kustomize-controller stopped to preserve the recovery Flux hold."
|
warn "Leaving kustomize-controller stopped to preserve the recovery Flux hold."
|
||||||
sleep "${RECOVERY_FLUX_SUSPEND_VERIFY_SLEEP_SECONDS}"
|
|
||||||
if [[ "${RECOVERY_FLUX_SUSPEND_BOOTSTRAP_KUSTOMIZATION}" == "1" || "${RECOVERY_FLUX_SUSPEND_BOOTSTRAP_KUSTOMIZATION}" == "true" ]]; then
|
|
||||||
patch_kustomization_suspend flux-system true
|
|
||||||
fi
|
|
||||||
patch_recovery_optional_flux_suspend_without_snapshot true
|
|
||||||
fi
|
fi
|
||||||
|
|
||||||
local unsuspended
|
verify_recovery_flux_suspend_stable_window || return 1
|
||||||
unsuspended="$(recovery_flux_unsuspended_list | paste -sd, -)"
|
|
||||||
if [[ -n "${unsuspended}" ]]; then
|
|
||||||
warn "Flux suspend state is still not stable after controller-stop finalization: ${unsuspended}"
|
|
||||||
return 1
|
|
||||||
fi
|
|
||||||
|
|
||||||
log "recovery-flux-suspend=verified-controller-stop"
|
log "recovery-flux-suspend=verified-controller-stop"
|
||||||
}
|
}
|
||||||
@ -1124,16 +1196,15 @@ stabilize_recovery_flux_suspend() {
|
|||||||
|
|
||||||
local attempt unsuspended
|
local attempt unsuspended
|
||||||
for attempt in $(seq 1 "${RECOVERY_FLUX_SUSPEND_VERIFY_ATTEMPTS}"); do
|
for attempt in $(seq 1 "${RECOVERY_FLUX_SUSPEND_VERIFY_ATTEMPTS}"); do
|
||||||
if [[ "${RECOVERY_FLUX_SUSPEND_BOOTSTRAP_KUSTOMIZATION}" == "1" || "${RECOVERY_FLUX_SUSPEND_BOOTSTRAP_KUSTOMIZATION}" == "true" ]]; then
|
reassert_recovery_flux_suspend_hold
|
||||||
patch_kustomization_suspend flux-system true
|
|
||||||
fi
|
|
||||||
patch_recovery_optional_flux_suspend_without_snapshot true
|
|
||||||
sleep "${RECOVERY_FLUX_SUSPEND_VERIFY_SLEEP_SECONDS}"
|
sleep "${RECOVERY_FLUX_SUSPEND_VERIFY_SLEEP_SECONDS}"
|
||||||
|
|
||||||
unsuspended="$(recovery_flux_unsuspended_list | paste -sd, -)"
|
unsuspended="$(recovery_flux_unsuspended_list | paste -sd, -)"
|
||||||
if [[ -z "${unsuspended}" ]]; then
|
if [[ -z "${unsuspended}" ]]; then
|
||||||
log "recovery-flux-suspend=verified attempts=${attempt}"
|
verify_recovery_flux_suspend_stable_window && {
|
||||||
return 0
|
log "recovery-flux-suspend=verified attempts=${attempt}"
|
||||||
|
return 0
|
||||||
|
}
|
||||||
fi
|
fi
|
||||||
warn "Flux suspend state was overwritten during recovery thaw; reasserting attempt ${attempt}/${RECOVERY_FLUX_SUSPEND_VERIFY_ATTEMPTS}: ${unsuspended}"
|
warn "Flux suspend state was overwritten during recovery thaw; reasserting attempt ${attempt}/${RECOVERY_FLUX_SUSPEND_VERIFY_ATTEMPTS}: ${unsuspended}"
|
||||||
done
|
done
|
||||||
@ -1211,7 +1282,7 @@ patch_helmrelease_suspend() {
|
|||||||
local patch
|
local patch
|
||||||
patch=$(printf '{"spec":{"suspend":%s}}' "${value}")
|
patch=$(printf '{"spec":{"suspend":%s}}' "${value}")
|
||||||
if kubectl -n "${namespace}" get helmrelease "${name}" >/dev/null 2>&1; then
|
if kubectl -n "${namespace}" get helmrelease "${name}" >/dev/null 2>&1; then
|
||||||
run kubectl -n "${namespace}" patch helmrelease "${name}" --type=merge -p "${patch}"
|
run kubectl -n "${namespace}" patch helmrelease "${name}" --field-manager="${RECOVERY_KUBECTL_FIELD_MANAGER}" --type=merge -p "${patch}"
|
||||||
else
|
else
|
||||||
warn "HelmRelease ${namespace}/${name} not found; skipping suspend=${value}."
|
warn "HelmRelease ${namespace}/${name} not found; skipping suspend=${value}."
|
||||||
fi
|
fi
|
||||||
@ -3318,6 +3389,9 @@ case "${MODE}" in
|
|||||||
longhorn-unlock)
|
longhorn-unlock)
|
||||||
longhorn_unlock_flow
|
longhorn_unlock_flow
|
||||||
;;
|
;;
|
||||||
|
flux-hold)
|
||||||
|
force_recovery_flux_suspend_with_controller_stop
|
||||||
|
;;
|
||||||
shutdown)
|
shutdown)
|
||||||
planned_shutdown
|
planned_shutdown
|
||||||
;;
|
;;
|
||||||
|
|||||||
@ -75,6 +75,8 @@ metadata:
|
|||||||
namespace: monitoring
|
namespace: monitoring
|
||||||
spec:
|
spec:
|
||||||
interval: 15m
|
interval: 15m
|
||||||
|
upgrade:
|
||||||
|
disableWait: true
|
||||||
chart:
|
chart:
|
||||||
spec:
|
spec:
|
||||||
chart: victoria-metrics-single
|
chart: victoria-metrics-single
|
||||||
@ -98,6 +100,10 @@ spec:
|
|||||||
requiredDuringSchedulingIgnoredDuringExecution:
|
requiredDuringSchedulingIgnoredDuringExecution:
|
||||||
nodeSelectorTerms:
|
nodeSelectorTerms:
|
||||||
- matchExpressions:
|
- matchExpressions:
|
||||||
|
- key: longhorn-host
|
||||||
|
operator: In
|
||||||
|
values:
|
||||||
|
- "true"
|
||||||
- key: kubernetes.io/hostname
|
- key: kubernetes.io/hostname
|
||||||
operator: NotIn
|
operator: NotIn
|
||||||
values:
|
values:
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user