# Patch summary (free text placed ahead of the first "diff --git" header).
#
# clusters/atlas/flux-system/gotk-sync.yaml
#   * Drops `suspend: true` from the spec whose path is
#     ./clusters/atlas/flux-system.
#
# scripts/cluster_power_recovery.sh
#   * Default lists: bstein-dev-home-migrations moves out of
#     RECOVERY_FLUX_CRITICAL_KUSTOMIZATIONS and into
#     RECOVERY_FLUX_OPTIONAL_KUSTOMIZATIONS.
#   * New setting RECOVERY_FLUX_ROOT_APPLY_TIMEOUT (default 15m). It replaces
#     the fixed 5m --timeout of `flux reconcile kustomization flux-system`.
#   * New patch_recovery_critical_flux_suspend VALUE: calls
#     patch_kustomization_suspend NAME VALUE for every name that csv_each
#     yields from RECOVERY_FLUX_CRITICAL_KUSTOMIZATIONS.
#   * New prepare_recovery_flux_root_apply_window: returns 0 immediately unless
#     EXECUTE is 1. Otherwise, when the kustomize-controller deployment exists,
#     it scales the deployment to 0 replicas and waits for the scale-down (a
#     failed wait is ignored). It then calls
#     patch_recovery_optional_flux_suspend true, patch_flux_suspend_all true
#     and patch_kustomization_suspend flux-system false. Last, again only when
#     the deployment exists, it scales back to 1 replica and waits up to 2m
#     for the rollout, warning if that wait fails.
#   * resume_deadlock_automation_after_core_recovery: calls
#     prepare_recovery_flux_root_apply_window where it used to call
#     patch_flux_suspend_all false, patch_recovery_optional_flux_suspend true
#     and the kustomize-controller scale-up; calls
#     patch_recovery_critical_flux_suspend false after the flux reconcile
#     step; no longer calls restart_kustomize_controller_for_critical_thaw.
#
# NOTE(review): indentation of the context and added lines below was
# reconstructed -- confirm it matches the target files before applying.
diff --git a/clusters/atlas/flux-system/gotk-sync.yaml b/clusters/atlas/flux-system/gotk-sync.yaml
index 11de2e4b..b9d38c2c 100644
--- a/clusters/atlas/flux-system/gotk-sync.yaml
+++ b/clusters/atlas/flux-system/gotk-sync.yaml
@@ -21,7 +21,6 @@ metadata:
   namespace: flux-system
 spec:
   interval: 1h0m0s
-  suspend: true
   path: ./clusters/atlas/flux-system
   prune: true
   sourceRef:
diff --git a/scripts/cluster_power_recovery.sh b/scripts/cluster_power_recovery.sh
index 2e098506..e7f8aa75 100755
--- a/scripts/cluster_power_recovery.sh
+++ b/scripts/cluster_power_recovery.sh
@@ -136,12 +136,13 @@ STARTUP_IGNORE_PODS_REGEX="${STARTUP_IGNORE_PODS_REGEX:-}"
 STARTUP_IGNORE_WORKLOADS_REGEX="${STARTUP_IGNORE_WORKLOADS_REGEX:-}"
 STARTUP_WORKLOAD_NAMESPACE_EXCLUDES_REGEX="${STARTUP_WORKLOAD_NAMESPACE_EXCLUDES_REGEX:-^(kube-system|kube-public|kube-node-lease|flux-system)$}"
 STARTUP_OPTIONAL_KUSTOMIZATIONS="${STARTUP_OPTIONAL_KUSTOMIZATIONS:-}"
-RECOVERY_FLUX_OPTIONAL_KUSTOMIZATIONS="${RECOVERY_FLUX_OPTIONAL_KUSTOMIZATIONS:-ai-llm,descheduler,finance,game-stream,gitops-ui,health,jellyfin,jenkins,longhorn-ui,mailu,nextcloud,nextcloud-mail-sync,outline,planka,quality,resource-guardrails,typhon,vaultwarden,veles,wallet-monero-temp,xmr-miner}"
-RECOVERY_FLUX_CRITICAL_KUSTOMIZATIONS="${RECOVERY_FLUX_CRITICAL_KUSTOMIZATIONS:-core,helm,cert-manager,longhorn-adopt,longhorn,metallb,traefik,vault-csi,vault-injector,vault,postgres,harbor,gitea,keycloak,oauth2-proxy,openldap,openclaw,monitoring,bstein-dev-home,bstein-dev-home-migrations,comms,crypto,logging,maintenance,monerod,sui-metrics}"
+RECOVERY_FLUX_OPTIONAL_KUSTOMIZATIONS="${RECOVERY_FLUX_OPTIONAL_KUSTOMIZATIONS:-ai-llm,bstein-dev-home-migrations,descheduler,finance,game-stream,gitops-ui,health,jellyfin,jenkins,longhorn-ui,mailu,nextcloud,nextcloud-mail-sync,outline,planka,quality,resource-guardrails,typhon,vaultwarden,veles,wallet-monero-temp,xmr-miner}"
+RECOVERY_FLUX_CRITICAL_KUSTOMIZATIONS="${RECOVERY_FLUX_CRITICAL_KUSTOMIZATIONS:-core,helm,cert-manager,longhorn-adopt,longhorn,metallb,traefik,vault-csi,vault-injector,vault,postgres,harbor,gitea,keycloak,oauth2-proxy,openldap,openclaw,monitoring,bstein-dev-home,comms,crypto,logging,maintenance,monerod,sui-metrics}"
 RECOVERY_FLUX_OPTIONAL_SNAPSHOT_FILE="${RECOVERY_FLUX_OPTIONAL_SNAPSHOT_FILE:-${HOME}/${STATE_SUBDIR:-.local/share/ananke}/longhorn_unlock_optional_flux.tsv}"
 RECOVERY_FLUX_RESTART_KUSTOMIZE_CONTROLLER="${RECOVERY_FLUX_RESTART_KUSTOMIZE_CONTROLLER:-1}"
 RECOVERY_FLUX_SUSPEND_BOOTSTRAP_KUSTOMIZATION="${RECOVERY_FLUX_SUSPEND_BOOTSTRAP_KUSTOMIZATION:-1}"
 RECOVERY_FLUX_APPLY_BOOTSTRAP_KUSTOMIZATION="${RECOVERY_FLUX_APPLY_BOOTSTRAP_KUSTOMIZATION:-1}"
+RECOVERY_FLUX_ROOT_APPLY_TIMEOUT="${RECOVERY_FLUX_ROOT_APPLY_TIMEOUT:-15m}"
 RECOVERY_FLUX_SUSPEND_VERIFY_ATTEMPTS="${RECOVERY_FLUX_SUSPEND_VERIFY_ATTEMPTS:-6}"
 RECOVERY_FLUX_SUSPEND_VERIFY_SLEEP_SECONDS="${RECOVERY_FLUX_SUSPEND_VERIFY_SLEEP_SECONDS:-10}"
 STARTUP_SERVICE_CHECK_TIMEOUT_SECONDS="${STARTUP_SERVICE_CHECK_TIMEOUT_SECONDS:-10}"
@@ -1019,6 +1020,14 @@ patch_recovery_optional_flux_suspend_without_snapshot() {
   done < <(csv_each "${RECOVERY_FLUX_OPTIONAL_KUSTOMIZATIONS}")
 }
 
+patch_recovery_critical_flux_suspend() {
+  local value="$1"
+  local name
+  while IFS= read -r name; do
+    patch_kustomization_suspend "${name}" "${value}"
+  done < <(csv_each "${RECOVERY_FLUX_CRITICAL_KUSTOMIZATIONS}")
+}
+
 recovery_flux_unsuspended_list() {
   local names=()
   local name suspend
@@ -1144,6 +1153,25 @@ restart_kustomize_controller_for_critical_thaw() {
   fi
 }
 
+prepare_recovery_flux_root_apply_window() {
+  [[ "${EXECUTE}" -eq 1 ]] || return 0
+
+  if kubectl -n flux-system get deployment kustomize-controller >/dev/null 2>&1; then
+    warn "Stopping kustomize-controller to create a quiet Flux root-apply window."
+    run kubectl -n flux-system scale deployment kustomize-controller --replicas=0
+    wait_for_kustomize_controller_scaled_down || true
+  fi
+
+  patch_recovery_optional_flux_suspend true
+  patch_flux_suspend_all true
+  patch_kustomization_suspend flux-system false
+
+  if kubectl -n flux-system get deployment kustomize-controller >/dev/null 2>&1; then
+    run kubectl -n flux-system scale deployment kustomize-controller --replicas=1
+    kubectl -n flux-system rollout status deployment/kustomize-controller --timeout=2m || warn "kustomize-controller did not become Ready for Flux root apply."
+  fi
+}
+
 patch_helmrelease_suspend() {
   local namespace="$1"
   local name="$2"
@@ -1550,25 +1578,21 @@ resume_deadlock_automation_after_core_recovery() {
     return 1
   fi
 
-  patch_flux_suspend_all false
-  patch_recovery_optional_flux_suspend true
-  if kubectl -n flux-system get deployment kustomize-controller >/dev/null 2>&1; then
-    run kubectl -n flux-system scale deployment kustomize-controller --replicas=1
-  fi
+  prepare_recovery_flux_root_apply_window
   if kubectl -n flux-system get deployment helm-controller >/dev/null 2>&1; then
     run kubectl -n flux-system scale deployment helm-controller --replicas=1
   fi
   if command -v flux >/dev/null 2>&1; then
     run flux reconcile source git flux-system -n flux-system --timeout=3m || true
     if [[ "${RECOVERY_FLUX_APPLY_BOOTSTRAP_KUSTOMIZATION}" == "1" || "${RECOVERY_FLUX_APPLY_BOOTSTRAP_KUSTOMIZATION}" == "true" ]]; then
-      run flux reconcile kustomization flux-system -n flux-system --timeout=5m || warn "flux-system Kustomization did not apply the recovery source revision before final suspension."
+      run flux reconcile kustomization flux-system -n flux-system --timeout="${RECOVERY_FLUX_ROOT_APPLY_TIMEOUT}" || warn "flux-system Kustomization did not apply the recovery source revision before final suspension."
     fi
   fi
+  patch_recovery_critical_flux_suspend false
   if [[ "${RECOVERY_FLUX_SUSPEND_BOOTSTRAP_KUSTOMIZATION}" == "1" || "${RECOVERY_FLUX_SUSPEND_BOOTSTRAP_KUSTOMIZATION}" == "true" ]]; then
     patch_kustomization_suspend flux-system true
   fi
   patch_recovery_optional_flux_suspend true
-  restart_kustomize_controller_for_critical_thaw
   annotate_flux_kustomizations "${RECOVERY_FLUX_CRITICAL_KUSTOMIZATIONS}" || true
   stabilize_recovery_flux_suspend || true
   mark_checkpoint longhorn_unlock_automation_resumed