recovery(ananke): thaw flux critical path first
This commit is contained in:
parent
0c2b59f7cc
commit
761e4e4964
@ -136,6 +136,10 @@ STARTUP_IGNORE_PODS_REGEX="${STARTUP_IGNORE_PODS_REGEX:-}"
|
|||||||
# Environment-overridable defaults. Each variable keeps a non-empty value
# supplied by the caller and otherwise falls back to the default shown here.

# Startup check filters (consumed outside this section).
: "${STARTUP_IGNORE_WORKLOADS_REGEX:=}"
: "${STARTUP_WORKLOAD_NAMESPACE_EXCLUDES_REGEX:=^(kube-system|kube-public|kube-node-lease|flux-system)$}"
: "${STARTUP_OPTIONAL_KUSTOMIZATIONS:=}"

# Comma-separated Flux Kustomization names that the recovery helpers snapshot
# and suspend/resume as a group.
: "${RECOVERY_FLUX_OPTIONAL_KUSTOMIZATIONS:=ai-llm,descheduler,finance,game-stream,gitops-ui,health,jellyfin,jenkins,longhorn-ui,mailu,nextcloud,nextcloud-mail-sync,outline,planka,quality,resource-guardrails,typhon,vaultwarden,veles,wallet-monero-temp,xmr-miner}"

# Comma-separated Flux Kustomization names that get a reconcile request once
# automation is resumed.
: "${RECOVERY_FLUX_CRITICAL_KUSTOMIZATIONS:=flux-system,core,helm,cert-manager,longhorn-adopt,longhorn,metallb,traefik,vault-csi,vault-injector,vault,postgres,harbor,gitea,keycloak,oauth2-proxy,openldap,openclaw,monitoring,bstein-dev-home,bstein-dev-home-migrations,comms,crypto,logging,maintenance,monerod,sui-metrics}"

# TSV file ("name<TAB>suspend") holding the saved suspend state of the
# optional Kustomizations.
: "${RECOVERY_FLUX_OPTIONAL_SNAPSHOT_FILE:=${HOME}/${STATE_SUBDIR:-.local/share/ananke}/longhorn_unlock_optional_flux.tsv}"

# "1" or "true" enables the kustomize-controller rollout restart.
: "${RECOVERY_FLUX_RESTART_KUSTOMIZE_CONTROLLER:=1}"

# Service/ingress startup check settings (consumed outside this section).
: "${STARTUP_SERVICE_CHECK_TIMEOUT_SECONDS:=10}"
: "${STARTUP_SERVICE_CHECKLIST:=}"
: "${STARTUP_INCLUDE_INGRESS_CHECKS:=1}"
|
||||||
@ -965,6 +969,74 @@ patch_kustomization_suspend() {
|
|||||||
fi
|
fi
|
||||||
}
|
}
|
||||||
|
|
||||||
|
#######################################
# Split a comma-separated list and print every non-empty entry on its own line.
# All whitespace inside an entry is removed (not only leading/trailing), and
# entries that are empty after that are skipped. Only the first line of the
# input is parsed, because a single `read` consumes it.
# Arguments: $1 - comma-separated list (may be empty or omitted)
# Outputs:   one entry per line on stdout
#######################################
csv_each() {
  local csv="${1-}"
  local item
  # Scratch array stays local so a call never leaves state behind in (or
  # clobbers) the caller's global scope.
  local -a items=()
  IFS=',' read -r -a items <<< "${csv}"
  # The ${arr[@]+...} guard keeps an empty list safe under `set -u` on bash
  # versions that treat an empty array expansion as an unbound variable.
  for item in ${items[@]+"${items[@]}"}; do
    item="${item//[[:space:]]/}"
    [[ -n "${item}" ]] || continue
    printf '%s\n' "${item}"
  done
}
|
||||||
|
|
||||||
|
#######################################
# Record the current `.spec.suspend` value of every optional Flux
# Kustomization as "name<TAB>suspend" lines in the snapshot file.
# The snapshot is assembled in a temporary file next to the target and moved
# into place only when complete, so an interrupted run neither destroys an
# existing snapshot nor leaves a half-written one.
# Globals:   EXECUTE (read) - nothing is written unless it equals 1
#            RECOVERY_FLUX_OPTIONAL_SNAPSHOT_FILE (read)
#            RECOVERY_FLUX_OPTIONAL_KUSTOMIZATIONS (read)
# Returns:   0 on success or when EXECUTE is not 1; 1 when the snapshot
#            directory or file cannot be written
#######################################
save_recovery_optional_flux_snapshot() {
  [[ "${EXECUTE}" -eq 1 ]] || return 0

  local snapshot="${RECOVERY_FLUX_OPTIONAL_SNAPSHOT_FILE}"
  local tmp name suspend

  mkdir -p "$(dirname "${snapshot}")" || return 1
  # Same directory as the target so the final mv is a rename on one filesystem.
  tmp="$(mktemp "${snapshot}.XXXXXX")" || return 1

  while IFS= read -r name; do
    # Kustomizations that do not exist in the cluster are left out.
    if ! kubectl -n flux-system get kustomization "${name}" >/dev/null 2>&1; then
      continue
    fi
    suspend="$(kubectl -n flux-system get kustomization "${name}" -o jsonpath='{.spec.suspend}' 2>/dev/null || true)"
    # An empty result (field absent or lookup failed) is recorded as "false".
    [[ -n "${suspend}" ]] || suspend="false"
    printf '%s\t%s\n' "${name}" "${suspend}" >> "${tmp}"
  done < <(csv_each "${RECOVERY_FLUX_OPTIONAL_KUSTOMIZATIONS}")

  if ! mv -f "${tmp}" "${snapshot}"; then
    rm -f "${tmp}"
    return 1
  fi
  log "recovery-flux-optional-snapshot=${snapshot}"
}
|
||||||
|
|
||||||
|
#######################################
# Set the suspend flag on every Kustomization listed in
# RECOVERY_FLUX_OPTIONAL_KUSTOMIZATIONS.
# When suspending ("true"), the current suspend state is snapshotted first.
# Arguments: $1 - suspend value passed to patch_kustomization_suspend
#######################################
patch_recovery_optional_flux_suspend() {
  local value="$1" kustomization

  # Take the snapshot before anything is changed.
  [[ "${value}" != "true" ]] || save_recovery_optional_flux_snapshot

  while IFS= read -r kustomization; do
    patch_kustomization_suspend "${kustomization}" "${value}"
  done < <(csv_each "${RECOVERY_FLUX_OPTIONAL_KUSTOMIZATIONS}")
}
|
||||||
|
|
||||||
|
#######################################
# Re-apply the suspend state recorded in the optional-Kustomization snapshot.
# Each snapshot line is "name<TAB>suspend". Lines without a name are skipped,
# and any suspend value other than "true"/"false" is treated as "false".
# Globals:   RECOVERY_FLUX_OPTIONAL_SNAPSHOT_FILE (read)
# Returns:   0 when no snapshot file exists; otherwise the loop's status
#######################################
restore_recovery_optional_flux_suspend() {
  [[ -f "${RECOVERY_FLUX_OPTIONAL_SNAPSHOT_FILE}" ]] || return 0
  local name suspend
  # `|| [[ -n "${name}" ]]` keeps the final record when the file does not end
  # with a newline (read returns non-zero at EOF but still fills the fields).
  while IFS=$'\t' read -r name suspend || [[ -n "${name}" ]]; do
    [[ -n "${name}" ]] || continue
    [[ "${suspend}" == "true" || "${suspend}" == "false" ]] || suspend="false"
    patch_kustomization_suspend "${name}" "${suspend}"
  done < "${RECOVERY_FLUX_OPTIONAL_SNAPSHOT_FILE}"
}
|
||||||
|
|
||||||
|
#######################################
# Stamp each listed Kustomization in the flux-system namespace with a
# reconcile.fluxcd.io/requestedAt annotation. One timestamp is taken up front
# and shared by every entry.
# Names that do not exist in the cluster are skipped.
# Arguments: $1 - comma-separated Kustomization names
#######################################
annotate_flux_kustomizations() {
  local stamp kustomization
  stamp="$(date --iso-8601=seconds)"

  while IFS= read -r kustomization; do
    kubectl -n flux-system get kustomization "${kustomization}" >/dev/null 2>&1 || continue
    run kubectl -n flux-system annotate kustomization "${kustomization}" \
      "reconcile.fluxcd.io/requestedAt=${stamp}" --overwrite
  done < <(csv_each "$1")
}
|
||||||
|
|
||||||
|
#######################################
# Roll the kustomize-controller deployment in flux-system, provided the
# feature flag is on and the deployment exists.
# Globals:   RECOVERY_FLUX_RESTART_KUSTOMIZE_CONTROLLER (read) - only "1" or
#            "true" enables the restart
# Returns:   0 when disabled or the deployment is absent; otherwise the
#            status of the rollout restart command
#######################################
restart_kustomize_controller_for_critical_thaw() {
  case "${RECOVERY_FLUX_RESTART_KUSTOMIZE_CONTROLLER}" in
    1 | true) ;;
    *) return 0 ;;
  esac

  kubectl -n flux-system get deployment kustomize-controller >/dev/null 2>&1 || return 0

  warn "Restarting kustomize-controller after optional Flux suspension to clear any single-worker health-check backlog."
  run kubectl -n flux-system rollout restart deployment kustomize-controller
}
|
||||||
|
|
||||||
patch_helmrelease_suspend() {
|
patch_helmrelease_suspend() {
|
||||||
local namespace="$1"
|
local namespace="$1"
|
||||||
local name="$2"
|
local name="$2"
|
||||||
@ -1372,13 +1444,18 @@ resume_deadlock_automation_after_core_recovery() {
|
|||||||
fi
|
fi
|
||||||
|
|
||||||
patch_flux_suspend_all false
|
patch_flux_suspend_all false
|
||||||
|
patch_recovery_optional_flux_suspend true
|
||||||
if kubectl -n flux-system get deployment kustomize-controller >/dev/null 2>&1; then
|
if kubectl -n flux-system get deployment kustomize-controller >/dev/null 2>&1; then
|
||||||
run kubectl -n flux-system scale deployment kustomize-controller --replicas=1
|
run kubectl -n flux-system scale deployment kustomize-controller --replicas=1
|
||||||
fi
|
fi
|
||||||
if kubectl -n flux-system get deployment helm-controller >/dev/null 2>&1; then
|
if kubectl -n flux-system get deployment helm-controller >/dev/null 2>&1; then
|
||||||
run kubectl -n flux-system scale deployment helm-controller --replicas=1
|
run kubectl -n flux-system scale deployment helm-controller --replicas=1
|
||||||
fi
|
fi
|
||||||
trigger_flux_reconcile_all || true
|
restart_kustomize_controller_for_critical_thaw
|
||||||
|
if command -v flux >/dev/null 2>&1; then
|
||||||
|
run flux reconcile source git flux-system -n flux-system --timeout=3m || true
|
||||||
|
fi
|
||||||
|
annotate_flux_kustomizations "${RECOVERY_FLUX_CRITICAL_KUSTOMIZATIONS}" || true
|
||||||
mark_checkpoint longhorn_unlock_automation_resumed
|
mark_checkpoint longhorn_unlock_automation_resumed
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user