diff --git a/scripts/cluster_power_recovery.sh b/scripts/cluster_power_recovery.sh
index fedec2ab..0b9bbd6c 100755
--- a/scripts/cluster_power_recovery.sh
+++ b/scripts/cluster_power_recovery.sh
@@ -136,6 +136,10 @@ STARTUP_IGNORE_PODS_REGEX="${STARTUP_IGNORE_PODS_REGEX:-}"
 STARTUP_IGNORE_WORKLOADS_REGEX="${STARTUP_IGNORE_WORKLOADS_REGEX:-}"
 STARTUP_WORKLOAD_NAMESPACE_EXCLUDES_REGEX="${STARTUP_WORKLOAD_NAMESPACE_EXCLUDES_REGEX:-^(kube-system|kube-public|kube-node-lease|flux-system)$}"
 STARTUP_OPTIONAL_KUSTOMIZATIONS="${STARTUP_OPTIONAL_KUSTOMIZATIONS:-}"
+RECOVERY_FLUX_OPTIONAL_KUSTOMIZATIONS="${RECOVERY_FLUX_OPTIONAL_KUSTOMIZATIONS:-ai-llm,descheduler,finance,game-stream,gitops-ui,health,jellyfin,jenkins,longhorn-ui,mailu,nextcloud,nextcloud-mail-sync,outline,planka,quality,resource-guardrails,typhon,vaultwarden,veles,wallet-monero-temp,xmr-miner}"
+RECOVERY_FLUX_CRITICAL_KUSTOMIZATIONS="${RECOVERY_FLUX_CRITICAL_KUSTOMIZATIONS:-flux-system,core,helm,cert-manager,longhorn-adopt,longhorn,metallb,traefik,vault-csi,vault-injector,vault,postgres,harbor,gitea,keycloak,oauth2-proxy,openldap,openclaw,monitoring,bstein-dev-home,bstein-dev-home-migrations,comms,crypto,logging,maintenance,monerod,sui-metrics}"
+RECOVERY_FLUX_OPTIONAL_SNAPSHOT_FILE="${RECOVERY_FLUX_OPTIONAL_SNAPSHOT_FILE:-${HOME}/${STATE_SUBDIR:-.local/share/ananke}/longhorn_unlock_optional_flux.tsv}"
+RECOVERY_FLUX_RESTART_KUSTOMIZE_CONTROLLER="${RECOVERY_FLUX_RESTART_KUSTOMIZE_CONTROLLER:-1}"
 STARTUP_SERVICE_CHECK_TIMEOUT_SECONDS="${STARTUP_SERVICE_CHECK_TIMEOUT_SECONDS:-10}"
 STARTUP_SERVICE_CHECKLIST="${STARTUP_SERVICE_CHECKLIST:-}"
 STARTUP_INCLUDE_INGRESS_CHECKS="${STARTUP_INCLUDE_INGRESS_CHECKS:-1}"
@@ -965,6 +969,104 @@ patch_kustomization_suspend() {
   fi
 }
 
+# Print each non-empty item of a comma-separated list on its own line, with
+# all whitespace removed from every item.
+#   $1 - comma-separated list; newlines are treated as separators too.
+csv_each() {
+  local csv="${1:-}"
+  local nl=$'\n'
+  local item
+  local -a items=()
+  # read consumes a single line, so fold newlines into commas first;
+  # otherwise everything after the first newline would be dropped.
+  IFS=',' read -r -a items <<< "${csv//${nl}/,}"
+  # The ${items[@]+...} form keeps an empty list safe under "set -u".
+  for item in ${items[@]+"${items[@]}"}; do
+    item="${item//[[:space:]]/}"
+    [[ -n "${item}" ]] || continue
+    printf '%s\n' "${item}"
+  done
+}
+
+# Record the current .spec.suspend of each optional Flux kustomization as
+# "<name><TAB><true|false>" lines in RECOVERY_FLUX_OPTIONAL_SNAPSHOT_FILE.
+# Does nothing unless EXECUTE is 1.
+save_recovery_optional_flux_snapshot() {
+  [[ "${EXECUTE}" -eq 1 ]] || return 0
+  local snapshot="${RECOVERY_FLUX_OPTIONAL_SNAPSHOT_FILE}"
+  local staging name suspend
+  mkdir -p "$(dirname "${snapshot}")"
+  # Build the snapshot in a sibling temp file and rename it into place, so
+  # an interrupted run never leaves a truncated snapshot for the restore.
+  staging="$(mktemp "${snapshot}.XXXXXX")" || return 1
+  while IFS= read -r name; do
+    # A single get doubles as the existence check; kustomizations that are
+    # absent or unreadable are left out of the snapshot.
+    if ! suspend="$(kubectl -n flux-system get kustomization "${name}" -o jsonpath='{.spec.suspend}' 2>/dev/null)"; then
+      continue
+    fi
+    # An unset .spec.suspend prints nothing; record it as not suspended.
+    [[ -n "${suspend}" ]] || suspend="false"
+    printf '%s\t%s\n' "${name}" "${suspend}" >> "${staging}"
+  done < <(csv_each "${RECOVERY_FLUX_OPTIONAL_KUSTOMIZATIONS}")
+  mv -f "${staging}" "${snapshot}"
+  log "recovery-flux-optional-snapshot=${snapshot}"
+}
+
+# Pass $1 ("true" or "false") to patch_kustomization_suspend for every entry
+# of RECOVERY_FLUX_OPTIONAL_KUSTOMIZATIONS. When $1 is "true" the current
+# suspend states are snapshotted first.
+patch_recovery_optional_flux_suspend() {
+  local value="$1"
+  local name
+  if [[ "${value}" == "true" ]]; then
+    save_recovery_optional_flux_snapshot
+  fi
+  while IFS= read -r name; do
+    patch_kustomization_suspend "${name}" "${value}"
+  done < <(csv_each "${RECOVERY_FLUX_OPTIONAL_KUSTOMIZATIONS}")
+}
+
+# Replay the suspend states recorded in RECOVERY_FLUX_OPTIONAL_SNAPSHOT_FILE
+# through patch_kustomization_suspend. Does nothing when no snapshot exists.
+restore_recovery_optional_flux_suspend() {
+  [[ -f "${RECOVERY_FLUX_OPTIONAL_SNAPSHOT_FILE}" ]] || return 0
+  local name suspend
+  # The trailing "|| [[ -n ... ]]" keeps a last line that lacks a newline.
+  while IFS=$'\t' read -r name suspend || [[ -n "${name}" ]]; do
+    [[ -n "${name}" ]] || continue
+    # Any value other than true/false is replayed as "false".
+    [[ "${suspend}" == "true" || "${suspend}" == "false" ]] || suspend="false"
+    patch_kustomization_suspend "${name}" "${suspend}"
+  done < "${RECOVERY_FLUX_OPTIONAL_SNAPSHOT_FILE}"
+}
+
+# Request a reconcile of every existing Flux kustomization named in the
+# comma-separated list $1 by setting reconcile.fluxcd.io/requestedAt.
+annotate_flux_kustomizations() {
+  local now name
+  # Portable UTC ISO-8601 timestamp; "date --iso-8601" is GNU-only.
+  now="$(date -u +%Y-%m-%dT%H:%M:%SZ)"
+  while IFS= read -r name; do
+    if kubectl -n flux-system get kustomization "${name}" >/dev/null 2>&1; then
+      run kubectl -n flux-system annotate kustomization "${name}" reconcile.fluxcd.io/requestedAt="${now}" --overwrite
+    fi
+  done < <(csv_each "${1:-}")
+}
+
+# Restart the kustomize-controller deployment when
+# RECOVERY_FLUX_RESTART_KUSTOMIZE_CONTROLLER is "1" or "true" and the
+# deployment exists in flux-system.
+restart_kustomize_controller_for_critical_thaw() {
+  if [[ "${RECOVERY_FLUX_RESTART_KUSTOMIZE_CONTROLLER}" != "1" && "${RECOVERY_FLUX_RESTART_KUSTOMIZE_CONTROLLER}" != "true" ]]; then
+    return 0
+  fi
+  if kubectl -n flux-system get deployment kustomize-controller >/dev/null 2>&1; then
+    warn "Restarting kustomize-controller after optional Flux suspension to clear any single-worker health-check backlog."
+    run kubectl -n flux-system rollout restart deployment kustomize-controller
+  fi
+}
+
 patch_helmrelease_suspend() {
   local namespace="$1"
   local name="$2"
@@ -1372,13 +1474,18 @@ resume_deadlock_automation_after_core_recovery() {
   fi
 
   patch_flux_suspend_all false
+  patch_recovery_optional_flux_suspend true
   if kubectl -n flux-system get deployment kustomize-controller >/dev/null 2>&1; then
     run kubectl -n flux-system scale deployment kustomize-controller --replicas=1
   fi
   if kubectl -n flux-system get deployment helm-controller >/dev/null 2>&1; then
     run kubectl -n flux-system scale deployment helm-controller --replicas=1
   fi
-  trigger_flux_reconcile_all || true
+  restart_kustomize_controller_for_critical_thaw
+  if command -v flux >/dev/null 2>&1; then
+    run flux reconcile source git flux-system -n flux-system --timeout=3m || true
+  fi
+  annotate_flux_kustomizations "${RECOVERY_FLUX_CRITICAL_KUSTOMIZATIONS}" || true
   mark_checkpoint longhorn_unlock_automation_resumed
 }
 