titan-iac/scripts/cluster_power_recovery.sh

#!/usr/bin/env bash
# Strict mode: abort on command failure, on use of an unset variable, and on a
# failure in any stage of a pipeline.
set -euo pipefail

# Locations are resolved relative to this script. ANANKE_REPO_DIR, when set,
# overrides the repository root (default: the script directory's parent).
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
REPO_DIR="${ANANKE_REPO_DIR:-$(cd "${SCRIPT_DIR}/.." && pwd)}"
BOOTSTRAP_DIR="${SCRIPT_DIR}/bootstrap"
CONFIG_FILE="${BOOTSTRAP_DIR}/recovery-config.env"
# Optional config file. It is sourced before any defaults are assigned, so the
# values it sets are picked up by later "${VAR:-default}" expansions.
if [[ -f "${CONFIG_FILE}" ]]; then
  # shellcheck disable=SC1090
  source "${CONFIG_FILE}"
fi
# Fall back to a kubeconfig stored beside the script only when KUBECONFIG is
# unset or empty.
if [[ -z "${KUBECONFIG:-}" && -f "${SCRIPT_DIR}/kubeconfig" ]]; then
  export KUBECONFIG="${SCRIPT_DIR}/kubeconfig"
fi

# Print the command-line help text to stdout.
# The here-document delimiter is unquoted, so the "${VAR:-default}" references
# in the text are expanded when usage runs and show the value in effect at that
# moment (environment / sourced config, or the built-in fallback).
usage() {
  cat <<USAGE
Usage:
  scripts/cluster_power_recovery.sh <prepare|status|bootstrap-seed|harbor-seed|longhorn-seed|longhorn-unlock|shutdown|startup> [options]

Options:
  --execute                     Actually run commands (default is dry-run)
  --shutdown-mode <mode>        Shutdown behavior: host-poweroff or cluster-only (default: ${SHUTDOWN_MODE:-host-poweroff})
  --expected-flux-branch <name> Expected Flux source branch during startup checks (default: ${DEFAULT_FLUX_BRANCH:-main})
  --expected-flux-url <url>     Expected Flux source URL during startup checks
  --allow-flux-source-mutation  Required to allow --force-flux-url during startup
  --force-flux-url <url>        Startup: patch flux-system GitRepository URL to this value
  --force-flux-branch <name>    Startup: patch flux-system GitRepository branch to this value
  --skip-etcd-snapshot          Shutdown: skip etcd snapshot before shutdown
  --skip-drain                  Shutdown: skip worker drain during shutdown
  --skip-local-bootstrap        Startup: skip local bootstrap fallback applies
  --skip-harbor-bootstrap       Startup: skip Harbor recovery bootstrap stage
  --skip-harbor-seed            Startup: skip bootstrap image seed/import stage
  --skip-helper-prewarm         Prepare/Shutdown/Startup: skip node-helper prewarm
  --refresh-bootstrap-image-aliases
                               Remove bootstrap image aliases before import, to clear poisoned registry pulls
  --min-startup-battery <pct>   Minimum UPS percent required before bootstrap (default: 35)
  --ups-host <name>             UPS identifier for upsc (default: ups@localhost)
  --ups-battery-key <key>       UPS battery key for upsc (default: battery.charge)
  --recovery-state-file <path>  Recovery state file for outage-aware restart logic
  --replica-snapshot-file <path>
                               File used to persist workload replica snapshot across shutdown/startup
  --bootstrap-images-file <path>
                               Image list expected inside the bootstrap bundle
  --harbor-bundle-file <path>   Bootstrap bundle on the control host
  --longhorn-unlock-bundle-file <path>
                               Longhorn-only bundle for Harbor-deadlock recovery
  --longhorn-unlock-images-file <path>
                               Longhorn-only image list for Harbor-deadlock recovery
  --longhorn-manager-cache-bundle-file <path>
                               Single-image Longhorn manager cache repair archive
  --skip-longhorn-unlock-bundle-seed
                               Longhorn unlock: skip full Longhorn bundle seed and run surgical repairs only
  --bootstrap-bundle-arch <arch>
                               Node architecture expected by the bootstrap bundle (default: ${BOOTSTRAP_BUNDLE_ARCH:-arm64})
  --harbor-target-node <name>   Node that should host Harbor during bootstrap (default: auto)
  --harbor-canary-node <name>   Node used for Harbor pull canary (default: auto)
  --harbor-host-label-key <key> Node label key used to pin Harbor bootstrap workloads (default: ${HARBOR_HOST_LABEL_KEY:-ananke.bstein.dev/harbor-bootstrap})
  --harbor-canary-image <image> Harbor-backed image used for pull canary (default: ${HARBOR_CANARY_IMAGE:-registry.bstein.dev/bstein/kubectl:1.35.0})
  --node-helper-image <image>   Privileged helper image used for host operations (default: ${NODE_HELPER_IMAGE:-registry.bstein.dev/bstein/ananke-node-helper:0.1.0})
  --bundle-http-port <port>     Temporary HTTP port used to serve bootstrap bundles (default: ${BUNDLE_HTTP_PORT:-8877})
  --api-wait-timeout <seconds>  Startup: Kubernetes API wait timeout (default: 600)
  --drain-timeout <seconds>     Worker drain timeout for normal shutdown (default: 180)
  --emergency-drain-timeout <seconds>
                               Worker drain timeout for emergency fallback (default: 45)
  --flux-ready-timeout <seconds>
                               Startup: max time to wait for Flux kustomizations Ready (default: 1200)
  --startup-checklist-timeout <seconds>
                               Startup: max time to wait for external service checklist (default: 900)
  --startup-workload-timeout <seconds>
                               Startup: max time to wait for workload readiness checks (default: 900)
  --startup-stability-window <seconds>
                               Startup: continuous healthy window required before success (default: 180)
  --startup-stability-timeout <seconds>
                               Startup: max time allowed to achieve the healthy window (default: 900)
  --require-ups-battery         Hard-fail startup if UPS battery cannot be read
  -h, --help                    Show help

Examples:
  scripts/cluster_power_recovery.sh prepare --execute
  scripts/cluster_power_recovery.sh bootstrap-seed --execute
  scripts/cluster_power_recovery.sh harbor-seed --execute
  scripts/cluster_power_recovery.sh longhorn-unlock --execute
  scripts/cluster_power_recovery.sh status
  scripts/cluster_power_recovery.sh shutdown --execute
  scripts/cluster_power_recovery.sh startup --execute --force-flux-branch main
USAGE
}

# The first positional argument selects the mode. A missing mode or a help flag
# prints usage and exits successfully.
MODE="${1:-}"
if [[ -z "${MODE}" || "${MODE}" == "-h" || "${MODE}" == "--help" ]]; then
  usage
  exit 0
fi
# Drop the mode so that "$@" holds only the options for the parser below.
shift || true

# Reject anything that is not one of the supported modes.
case "${MODE}" in
  prepare|status|bootstrap-seed|harbor-seed|longhorn-seed|longhorn-unlock|shutdown|startup) ;;
  *)
    echo "Unknown mode: ${MODE}" >&2
    usage
    exit 1
    ;;
esac

# ---------------------------------------------------------------------------
# Defaults.
# Assignments written as "${VAR:-default}" keep a value already provided by the
# environment or by the sourced config file. Plain assignments (for example
# EXECUTE, the SKIP_* flags and the drain/API timeouts) always start from the
# literal shown here and are only changed by the option parser below.
# ---------------------------------------------------------------------------
EXECUTE=0
SHUTDOWN_MODE="${SHUTDOWN_MODE:-host-poweroff}"
EXPECTED_FLUX_BRANCH="${DEFAULT_FLUX_BRANCH:-main}"
EXPECTED_FLUX_URL="${EXPECTED_FLUX_URL:-ssh://git@scm.bstein.dev:2242/bstein/titan-iac.git}"
ALLOW_FLUX_SOURCE_MUTATION=0
FORCE_FLUX_URL=""
FORCE_FLUX_BRANCH=""
SKIP_ETCD_SNAPSHOT=0
SKIP_DRAIN=0
SKIP_LOCAL_BOOTSTRAP=0
SKIP_HARBOR_BOOTSTRAP=0
SKIP_HARBOR_SEED=0
SKIP_HELPER_PREWARM=0
# UPS battery gating inputs (queried through upsc).
UPS_HOST="${UPS_HOST:-ups@localhost}"
UPS_BATTERY_KEY="${UPS_BATTERY_KEY:-battery.charge}"
MIN_STARTUP_BATTERY="${MIN_STARTUP_BATTERY:-35}"
REQUIRE_UPS_BATTERY="${REQUIRE_UPS_BATTERY:-0}"
# Timeouts and poll intervals, in seconds.
DRAIN_TIMEOUT_SECONDS=180
EMERGENCY_DRAIN_TIMEOUT_SECONDS=45
API_WAIT_TIMEOUT_SECONDS=600
FLUX_READY_TIMEOUT_SECONDS="${FLUX_READY_TIMEOUT_SECONDS:-1200}"
FLUX_READY_POLL_SECONDS="${FLUX_READY_POLL_SECONDS:-10}"
STARTUP_CHECKLIST_TIMEOUT_SECONDS="${STARTUP_CHECKLIST_TIMEOUT_SECONDS:-900}"
STARTUP_CHECKLIST_POLL_SECONDS="${STARTUP_CHECKLIST_POLL_SECONDS:-10}"
STARTUP_WORKLOAD_TIMEOUT_SECONDS="${STARTUP_WORKLOAD_TIMEOUT_SECONDS:-900}"
STARTUP_WORKLOAD_POLL_SECONDS="${STARTUP_WORKLOAD_POLL_SECONDS:-10}"
STARTUP_STABILITY_WINDOW_SECONDS="${STARTUP_STABILITY_WINDOW_SECONDS:-180}"
STARTUP_STABILITY_TIMEOUT_SECONDS="${STARTUP_STABILITY_TIMEOUT_SECONDS:-900}"
STARTUP_STABILITY_POLL_SECONDS="${STARTUP_STABILITY_POLL_SECONDS:-10}"
# Regex filters and comma-separated lists consumed by the startup checks.
STARTUP_IGNORE_PODS_REGEX="${STARTUP_IGNORE_PODS_REGEX:-}"
STARTUP_IGNORE_WORKLOADS_REGEX="${STARTUP_IGNORE_WORKLOADS_REGEX:-}"
STARTUP_WORKLOAD_NAMESPACE_EXCLUDES_REGEX="${STARTUP_WORKLOAD_NAMESPACE_EXCLUDES_REGEX:-^(kube-system|kube-public|kube-node-lease|flux-system)$}"
STARTUP_OPTIONAL_KUSTOMIZATIONS="${STARTUP_OPTIONAL_KUSTOMIZATIONS:-}"
RECOVERY_FLUX_OPTIONAL_KUSTOMIZATIONS="${RECOVERY_FLUX_OPTIONAL_KUSTOMIZATIONS:-ai-llm,bstein-dev-home-migrations,descheduler,finance,game-stream,gitops-ui,health,jellyfin,jenkins,longhorn-ui,mailu,nextcloud,nextcloud-mail-sync,outline,planka,quality,resource-guardrails,typhon,vaultwarden,veles,wallet-monero-temp,xmr-miner}"
RECOVERY_FLUX_CRITICAL_KUSTOMIZATIONS="${RECOVERY_FLUX_CRITICAL_KUSTOMIZATIONS:-core,helm,cert-manager,longhorn-adopt,longhorn,metallb,traefik,vault-csi,vault-injector,vault,postgres,harbor,gitea,keycloak,oauth2-proxy,openldap,openclaw,monitoring,bstein-dev-home,comms,crypto,logging,maintenance,monerod,sui-metrics}"
RECOVERY_FLUX_CRITICAL_HELMRELEASES="${RECOVERY_FLUX_CRITICAL_HELMRELEASES:-cert-manager/cert-manager,comms/othrys-element,comms/othrys-synapse,harbor/harbor,kube-system/secrets-store-csi-driver,logging/data-prepper,logging/fluent-bit,logging/opensearch,logging/opensearch-dashboards,logging/otel-collector,longhorn-system/longhorn,metallb-system/metallb,monitoring/alertmanager,monitoring/grafana,monitoring/kube-state-metrics,monitoring/node-exporter,monitoring/victoria-metrics-single,vault/vault-injector}"
RECOVERY_FLUX_OPTIONAL_SNAPSHOT_FILE="${RECOVERY_FLUX_OPTIONAL_SNAPSHOT_FILE:-${HOME}/${STATE_SUBDIR:-.local/share/ananke}/longhorn_unlock_optional_flux.tsv}"
RECOVERY_FLUX_RESTART_KUSTOMIZE_CONTROLLER="${RECOVERY_FLUX_RESTART_KUSTOMIZE_CONTROLLER:-1}"
RECOVERY_FLUX_SUSPEND_BOOTSTRAP_KUSTOMIZATION="${RECOVERY_FLUX_SUSPEND_BOOTSTRAP_KUSTOMIZATION:-1}"
RECOVERY_FLUX_APPLY_BOOTSTRAP_KUSTOMIZATION="${RECOVERY_FLUX_APPLY_BOOTSTRAP_KUSTOMIZATION:-0}"
RECOVERY_FLUX_ROOT_APPLY_TIMEOUT="${RECOVERY_FLUX_ROOT_APPLY_TIMEOUT:-15m}"
RECOVERY_FLUX_SUSPEND_VERIFY_ATTEMPTS="${RECOVERY_FLUX_SUSPEND_VERIFY_ATTEMPTS:-6}"
RECOVERY_FLUX_SUSPEND_VERIFY_SLEEP_SECONDS="${RECOVERY_FLUX_SUSPEND_VERIFY_SLEEP_SECONDS:-10}"
RECOVERY_FLUX_FINAL_RESTART_KUSTOMIZE_CONTROLLER="${RECOVERY_FLUX_FINAL_RESTART_KUSTOMIZE_CONTROLLER:-0}"
# External service checklist. Rows use '|' separated fields and ';' between
# rows when supplied through STARTUP_SERVICE_CHECKLIST.
STARTUP_SERVICE_CHECK_TIMEOUT_SECONDS="${STARTUP_SERVICE_CHECK_TIMEOUT_SECONDS:-10}"
STARTUP_SERVICE_CHECKLIST="${STARTUP_SERVICE_CHECKLIST:-}"
STARTUP_INCLUDE_INGRESS_CHECKS="${STARTUP_INCLUDE_INGRESS_CHECKS:-1}"
STARTUP_INGRESS_ALLOWED_STATUSES="${STARTUP_INGRESS_ALLOWED_STATUSES:-200,301,302,307,308,401,403,404}"
STARTUP_IGNORE_INGRESS_HOSTS_REGEX="${STARTUP_IGNORE_INGRESS_HOSTS_REGEX:-}"
STARTUP_INGRESS_CHECK_TIMEOUT_SECONDS="${STARTUP_INGRESS_CHECK_TIMEOUT_SECONDS:-10}"
SHUTDOWN_NAMESPACE_EXCLUDES_REGEX="${SHUTDOWN_NAMESPACE_EXCLUDES_REGEX:-^(kube-system|kube-public|kube-node-lease|flux-system|traefik|metallb-system|cert-manager|longhorn-system|vault|postgres|maintenance)$}"
REQUIRE_NONEMPTY_REPLICA_SNAPSHOT="${REQUIRE_NONEMPTY_REPLICA_SNAPSHOT:-1}"
# Mail safeguards: endpoints that must have addresses and TCP ports that must
# accept connections before the startup checklist passes.
STARTUP_REQUIRE_MAIL_SAFEGUARDS="${STARTUP_REQUIRE_MAIL_SAFEGUARDS:-1}"
MAIL_STARTUP_NAMESPACE="${MAIL_STARTUP_NAMESPACE:-mailu-mailserver}"
MAIL_STARTUP_ENDPOINT_SERVICES="${MAIL_STARTUP_ENDPOINT_SERVICES:-mailu-front,mailu-postfix,mailu-dovecot}"
MAIL_STARTUP_HOST="${MAIL_STARTUP_HOST:-mail.bstein.dev}"
MAIL_STARTUP_TCP_PORTS="${MAIL_STARTUP_TCP_PORTS:-25,465,587,993,995}"
MAIL_STARTUP_TCP_TIMEOUT_SECONDS="${MAIL_STARTUP_TCP_TIMEOUT_SECONDS:-3}"
BUNDLE_HTTP_PORT="${BUNDLE_HTTP_PORT:-8877}"
# State lives under ${HOME}/${STATE_SUBDIR}. The four assignments below are
# unconditional: only STATE_SUBDIR / HARBOR_BUNDLE_BASENAME (and the matching
# CLI options) influence them, not same-named environment variables.
STATE_ROOT="${HOME}/${STATE_SUBDIR:-.local/share/ananke}"
RECOVERY_STATE_FILE="${STATE_ROOT}/cluster_power_recovery.state"
REPLICA_SNAPSHOT_FILE="${STATE_ROOT}/desired_workload_replicas.tsv"
HARBOR_BUNDLE_FILE="${STATE_ROOT}/bundles/${HARBOR_BUNDLE_BASENAME:-harbor-bootstrap-v2.14.1-arm64.tar.zst}"
BOOTSTRAP_IMAGES_FILE="${BOOTSTRAP_IMAGES_FILE:-${BOOTSTRAP_DIR}/harbor-bootstrap-images.txt}"
LONGHORN_UNLOCK_IMAGES_FILE="${LONGHORN_UNLOCK_IMAGES_FILE:-${BOOTSTRAP_DIR}/longhorn-unlock-images.txt}"
# NOTE(review): the two Longhorn bundle defaults embed BOOTSTRAP_BUNDLE_ARCH as
# it is at this point; in the code visible here a later --bootstrap-bundle-arch
# option does not recompute them. Confirm whether that is intended.
LONGHORN_UNLOCK_BUNDLE_FILE="${LONGHORN_UNLOCK_BUNDLE_FILE:-${STATE_ROOT}/bundles/longhorn-unlock-v1.8.2-${BOOTSTRAP_BUNDLE_ARCH:-arm64}.tar.zst}"
LONGHORN_MANAGER_IMAGE="${LONGHORN_MANAGER_IMAGE:-registry.bstein.dev/infra/longhorn-manager:v1.8.2}"
LONGHORN_MANAGER_CACHE_BUNDLE_FILE="${LONGHORN_MANAGER_CACHE_BUNDLE_FILE:-${STATE_ROOT}/bundles/longhorn-manager-v1.8.2-${BOOTSTRAP_BUNDLE_ARCH:-arm64}.tar}"
LONGHORN_UNLOCK_SSH_KNOWN_HOSTS="${LONGHORN_UNLOCK_SSH_KNOWN_HOSTS:-/tmp/ananke_longhorn_unlock_known_hosts}"
BOOTSTRAP_BUNDLE_ARCH="${BOOTSTRAP_BUNDLE_ARCH:-arm64}"
RECOVERY_UNCORDON_DENYLIST="${RECOVERY_UNCORDON_DENYLIST:-titan-18,titan-22,titan-24}"
STALE_TERMINATING_POD_SECONDS="${STALE_TERMINATING_POD_SECONDS:-300}"
RECOVERY_NODE_RUNTIME_RESTART_ENABLED="${RECOVERY_NODE_RUNTIME_RESTART_ENABLED:-1}"
# The runtime-restart denylist defaults to the uncordon denylist.
RECOVERY_NODE_RUNTIME_RESTART_DENYLIST="${RECOVERY_NODE_RUNTIME_RESTART_DENYLIST:-${RECOVERY_UNCORDON_DENYLIST}}"
RECOVERY_NODE_RUNTIME_RESTART_MAX_NODES="${RECOVERY_NODE_RUNTIME_RESTART_MAX_NODES:-3}"
RECOVERY_NODE_RUNTIME_RESTART_WAIT_SECONDS="${RECOVERY_NODE_RUNTIME_RESTART_WAIT_SECONDS:-300}"
HARBOR_TARGET_NODE="${HARBOR_TARGET_NODE:-}"
HARBOR_CANARY_NODE="${HARBOR_CANARY_NODE:-}"
HARBOR_HOST_LABEL_KEY="${HARBOR_HOST_LABEL_KEY:-ananke.bstein.dev/harbor-bootstrap}"
HARBOR_CANARY_IMAGE="${HARBOR_CANARY_IMAGE:-registry.bstein.dev/bstein/kubectl:1.35.0}"
NODE_HELPER_IMAGE="${NODE_HELPER_IMAGE:-registry.bstein.dev/bstein/ananke-node-helper:0.1.0}"
NODE_HELPER_NAMESPACE="${NODE_HELPER_NAMESPACE:-maintenance}"
NODE_HELPER_SERVICE_ACCOUNT="${NODE_HELPER_SERVICE_ACCOUNT:-default}"
NODE_HELPER_PREWARM_DS="${NODE_HELPER_PREWARM_DS:-ananke-node-helper-prewarm}"
REGISTRY_PULL_SECRET="${REGISTRY_PULL_SECRET:-harbor-regcred}"
REFRESH_BOOTSTRAP_IMAGE_ALIASES="${REFRESH_BOOTSTRAP_IMAGE_ALIASES:-0}"
SKIP_LONGHORN_UNLOCK_BUNDLE_SEED="${SKIP_LONGHORN_UNLOCK_BUNDLE_SEED:-0}"
LONGHORN_UNLOCK_REPLICA_SNAPSHOT_FILE="${LONGHORN_UNLOCK_REPLICA_SNAPSHOT_FILE:-${STATE_ROOT}/longhorn_unlock_optional_replicas.tsv}"
KEEP_PREWARM_DAEMONSET=0
BOOTSTRAP_IMAGES_SEEDED=0

# Mutable runtime state. The first three are (re)loaded by load_recovery_state;
# UPS_HOST_IN_USE is set by read_ups_battery.
RECOVERY_PENDING=0
STARTUP_ATTEMPTED_DURING_OUTAGE=0
LAST_CHECKPOINT="none"
BUNDLE_SERVER_PID=""
UPS_HOST_IN_USE=""

# Parse the options that follow the mode.
# Flags take no value and shift by one. Value options read "$2" through
# "${2:?message}", so a missing or empty value aborts the script with that
# message, and then shift by two. Unknown options print usage and exit 1.
while [[ $# -gt 0 ]]; do
  case "$1" in
    --execute)
      EXECUTE=1
      shift
      ;;
    --shutdown-mode)
      SHUTDOWN_MODE="${2:?missing shutdown mode}"
      shift 2
      ;;
    --expected-flux-branch)
      EXPECTED_FLUX_BRANCH="${2:?missing branch}"
      shift 2
      ;;
    --expected-flux-url)
      EXPECTED_FLUX_URL="${2:?missing flux url}"
      shift 2
      ;;
    --allow-flux-source-mutation)
      ALLOW_FLUX_SOURCE_MUTATION=1
      shift
      ;;
    --force-flux-url)
      FORCE_FLUX_URL="${2:?missing flux url}"
      shift 2
      ;;
    --force-flux-branch)
      FORCE_FLUX_BRANCH="${2:?missing branch}"
      shift 2
      ;;
    --skip-etcd-snapshot)
      SKIP_ETCD_SNAPSHOT=1
      shift
      ;;
    --skip-drain)
      SKIP_DRAIN=1
      shift
      ;;
    --skip-local-bootstrap)
      SKIP_LOCAL_BOOTSTRAP=1
      shift
      ;;
    --skip-harbor-bootstrap)
      SKIP_HARBOR_BOOTSTRAP=1
      shift
      ;;
    --skip-harbor-seed)
      SKIP_HARBOR_SEED=1
      shift
      ;;
    --skip-helper-prewarm)
      SKIP_HELPER_PREWARM=1
      shift
      ;;
    --refresh-bootstrap-image-aliases)
      REFRESH_BOOTSTRAP_IMAGE_ALIASES=1
      shift
      ;;
    --ups-host)
      UPS_HOST="${2:?missing ups host}"
      shift 2
      ;;
    --ups-battery-key)
      UPS_BATTERY_KEY="${2:?missing ups key}"
      shift 2
      ;;
    --min-startup-battery)
      MIN_STARTUP_BATTERY="${2:?missing battery threshold}"
      shift 2
      ;;
    --require-ups-battery)
      REQUIRE_UPS_BATTERY=1
      shift
      ;;
    --recovery-state-file)
      RECOVERY_STATE_FILE="${2:?missing state file path}"
      shift 2
      ;;
    --replica-snapshot-file)
      REPLICA_SNAPSHOT_FILE="${2:?missing replica snapshot file path}"
      shift 2
      ;;
    --harbor-bundle-file)
      HARBOR_BUNDLE_FILE="${2:?missing bundle file path}"
      shift 2
      ;;
    --longhorn-unlock-bundle-file)
      LONGHORN_UNLOCK_BUNDLE_FILE="${2:?missing Longhorn unlock bundle file path}"
      shift 2
      ;;
    --bootstrap-images-file)
      BOOTSTRAP_IMAGES_FILE="${2:?missing bootstrap image list path}"
      shift 2
      ;;
    --longhorn-unlock-images-file)
      LONGHORN_UNLOCK_IMAGES_FILE="${2:?missing Longhorn unlock image list path}"
      shift 2
      ;;
    --longhorn-manager-cache-bundle-file)
      LONGHORN_MANAGER_CACHE_BUNDLE_FILE="${2:?missing Longhorn manager cache bundle file path}"
      shift 2
      ;;
    --skip-longhorn-unlock-bundle-seed)
      SKIP_LONGHORN_UNLOCK_BUNDLE_SEED=1
      shift
      ;;
    --bootstrap-bundle-arch)
      BOOTSTRAP_BUNDLE_ARCH="${2:?missing bootstrap bundle architecture}"
      shift 2
      ;;
    --harbor-target-node)
      HARBOR_TARGET_NODE="${2:?missing harbor target node}"
      shift 2
      ;;
    --harbor-canary-node)
      HARBOR_CANARY_NODE="${2:?missing harbor canary node}"
      shift 2
      ;;
    --harbor-host-label-key)
      HARBOR_HOST_LABEL_KEY="${2:?missing harbor host label key}"
      shift 2
      ;;
    --harbor-canary-image)
      HARBOR_CANARY_IMAGE="${2:?missing canary image}"
      shift 2
      ;;
    --node-helper-image)
      NODE_HELPER_IMAGE="${2:?missing node helper image}"
      shift 2
      ;;
    --bundle-http-port)
      BUNDLE_HTTP_PORT="${2:?missing bundle http port}"
      shift 2
      ;;
    --api-wait-timeout)
      API_WAIT_TIMEOUT_SECONDS="${2:?missing api wait timeout}"
      shift 2
      ;;
    --flux-ready-timeout)
      FLUX_READY_TIMEOUT_SECONDS="${2:?missing flux ready timeout}"
      shift 2
      ;;
    --startup-checklist-timeout)
      STARTUP_CHECKLIST_TIMEOUT_SECONDS="${2:?missing startup checklist timeout}"
      shift 2
      ;;
    --startup-workload-timeout)
      STARTUP_WORKLOAD_TIMEOUT_SECONDS="${2:?missing startup workload timeout}"
      shift 2
      ;;
    --startup-stability-window)
      STARTUP_STABILITY_WINDOW_SECONDS="${2:?missing startup stability window}"
      shift 2
      ;;
    --startup-stability-timeout)
      STARTUP_STABILITY_TIMEOUT_SECONDS="${2:?missing startup stability timeout}"
      shift 2
      ;;
    --drain-timeout)
      DRAIN_TIMEOUT_SECONDS="${2:?missing drain timeout}"
      shift 2
      ;;
    --emergency-drain-timeout)
      EMERGENCY_DRAIN_TIMEOUT_SECONDS="${2:?missing emergency drain timeout}"
      shift 2
      ;;
    -h|--help)
      usage
      exit 0
      ;;
    *)
      echo "Unknown option: $1" >&2
      usage
      exit 1
      ;;
  esac
done

# Post-parse validation: only the two documented shutdown modes are accepted.
case "${SHUTDOWN_MODE}" in
  host-poweroff|cluster-only) ;;
  *)
    echo "Invalid --shutdown-mode '${SHUTDOWN_MODE}'. Expected host-poweroff or cluster-only." >&2
    exit 1
    ;;
esac

# Forcing a new Flux source URL is a breakglass action and must be paired with
# the explicit --allow-flux-source-mutation flag.
if [[ -n "${FORCE_FLUX_URL}" && "${ALLOW_FLUX_SOURCE_MUTATION}" -ne 1 ]]; then
  echo "--force-flux-url requires --allow-flux-source-mutation (breakglass)." >&2
  exit 1
fi

# Exit the script with status 1 unless the named command can be resolved.
# Arguments: $1 - command name to look up with `command -v`.
require_cmd() {
  local tool="$1"
  command -v "${tool}" >/dev/null 2>&1 && return 0
  echo "Missing required command: ${tool}" >&2
  exit 1
}

# Fail fast when a tool the script relies on is not on PATH.
require_cmd kubectl
require_cmd bash
require_cmd base64
require_cmd curl

# Logging helpers. log writes to stdout; warn and die write to stderr, and die
# additionally terminates the script with status 1.
log() {
  printf '%s\n' "[cluster-power] $*"
}
warn() {
  printf '%s\n' "[cluster-power][warn] $*" >&2
}
die() {
  printf '%s\n' "[cluster-power][error] $*" >&2
  exit 1
}

# Run a command given as separate words, or only log it in dry-run mode.
# Globals:   EXECUTE (read) - 1 executes, anything else logs a DRY-RUN line.
# Arguments: the command and its arguments.
# Returns:   the command's exit status when executed.
run() {
  if [[ "${EXECUTE}" -ne 1 ]]; then
    log "DRY-RUN: $*"
    return
  fi
  log "EXEC: $*"
  "$@"
}

# Run a shell snippet through `bash -lc`, or only log it in dry-run mode.
# Globals:   EXECUTE (read) - 1 executes, anything else logs a DRY-RUN line.
# Arguments: words that are joined with spaces into one command string.
# Returns:   the exit status of the spawned shell when executed.
run_shell() {
  if [[ "${EXECUTE}" -ne 1 ]]; then
    log "DRY-RUN: $*"
    return
  fi
  log "EXEC: $*"
  bash -lc "$*"
}

# Build a kustomization from the repository and pipe it into `kubectl apply`.
# Globals:   REPO_DIR (read), EXECUTE (read).
# Arguments: $1 - kustomization path relative to REPO_DIR.
# In dry-run mode only the equivalent command line is logged.
apply_kustomization() {
  local relative_path="$1"
  local target="${REPO_DIR}/${relative_path}"
  local description="kubectl kustomize ${target} --load-restrictor=LoadRestrictionsNone | kubectl apply -f -"
  if [[ "${EXECUTE}" -ne 1 ]]; then
    log "DRY-RUN: ${description}"
    return
  fi
  log "EXEC: ${description}"
  kubectl kustomize "${target}" --load-restrictor=LoadRestrictionsNone \
    | kubectl apply -f -
}

# Lower-case the argument and collapse every run of characters outside
# [a-z0-9-] into a single '-'. The result is written to stdout without a
# trailing newline.
sanitize_name() {
  local input="$1"
  printf '%s' "${input}" \
    | tr '[:upper:]' '[:lower:]' \
    | tr -cs 'a-z0-9-' '-'
}

# Print the directory that contains the recovery state file.
# Globals: RECOVERY_STATE_FILE (read).
state_dir() {
  local state_file="${RECOVERY_STATE_FILE}"
  dirname "${state_file}"
}

# Load persisted recovery state from RECOVERY_STATE_FILE.
# Globals:   RECOVERY_STATE_FILE (read); RECOVERY_PENDING,
#            STARTUP_ATTEMPTED_DURING_OUTAGE, LAST_CHECKPOINT (written).
# The three globals are first reset to their defaults (0, 0, "none"); a missing
# file therefore yields the defaults. Unknown keys are ignored.
load_recovery_state() {
  # Keep the loop variables local so they do not leak into the script's globals.
  local key value
  RECOVERY_PENDING=0
  STARTUP_ATTEMPTED_DURING_OUTAGE=0
  LAST_CHECKPOINT="none"
  [[ -f "${RECOVERY_STATE_FILE}" ]] || return 0
  # `read` returns non-zero on a final line that lacks a trailing newline while
  # still filling the variables; the extra test keeps that line from being lost.
  while IFS='=' read -r key value || [[ -n "${key}" ]]; do
    case "${key}" in
      recovery_pending) RECOVERY_PENDING="${value}" ;;
      startup_attempted) STARTUP_ATTEMPTED_DURING_OUTAGE="${value}" ;;
      last_checkpoint) LAST_CHECKPOINT="${value}" ;;
    esac
  done < "${RECOVERY_STATE_FILE}"
}

# Persist the recovery state to RECOVERY_STATE_FILE (no-op in dry-run mode).
# Globals:   EXECUTE (read), RECOVERY_STATE_FILE (read).
# Arguments: $1 - recovery_pending value
#            $2 - startup_attempted value
#            $3 - last_checkpoint value
# Returns:   0 on success, 1 when the file could not be written or moved.
# The content is written to a sibling temp file and renamed into place, so an
# interruption mid-write cannot leave a truncated state file behind.
save_recovery_state() {
  [[ "${EXECUTE}" -eq 1 ]] || return 0
  local tmp_file="${RECOVERY_STATE_FILE}.tmp.$$"
  mkdir -p "$(state_dir)"
  if ! printf 'recovery_pending=%s\nstartup_attempted=%s\nlast_checkpoint=%s\n' \
    "${1}" "${2}" "${3}" > "${tmp_file}"; then
    rm -f -- "${tmp_file}"
    return 1
  fi
  if ! mv -f -- "${tmp_file}" "${RECOVERY_STATE_FILE}"; then
    rm -f -- "${tmp_file}"
    return 1
  fi
}

# Record a new checkpoint name and persist it together with the current
# recovery flags.
# Globals:   LAST_CHECKPOINT (written); RECOVERY_PENDING,
#            STARTUP_ATTEMPTED_DURING_OUTAGE (read).
# Arguments: $1 - checkpoint name.
mark_checkpoint() {
  local checkpoint="$1"
  LAST_CHECKPOINT="${checkpoint}"
  save_recovery_state \
    "${RECOVERY_PENDING}" \
    "${STARTUP_ATTEMPTED_DURING_OUTAGE}" \
    "${checkpoint}"
}

# Delete the recovery state file and reset LAST_CHECKPOINT to "none".
# Globals: EXECUTE (read), RECOVERY_STATE_FILE (read), LAST_CHECKPOINT (written).
# In dry-run mode nothing is touched, including LAST_CHECKPOINT.
clear_recovery_state() {
  if [[ "${EXECUTE}" -ne 1 ]]; then
    return 0
  fi
  # Best effort: a failed removal is deliberately ignored.
  rm -f "${RECOVERY_STATE_FILE}" 2>/dev/null || true
  LAST_CHECKPOINT="none"
}

# Reduce a raw battery reading to a plain base-10 integer percentage.
# Arguments: $1 - raw text, e.g. "battery.charge: 85.4" or "100".
# Outputs:   the integer part on stdout, without a trailing newline.
# Returns:   1 when no purely numeric integer part remains.
sanitize_battery_percent() {
  local raw="$1"
  raw="${raw##*:}"            # drop an optional "key:" prefix
  raw="${raw//[[:space:]]/}"  # drop all whitespace
  raw="${raw%%.*}"            # keep only the integer part
  [[ "${raw}" =~ ^[0-9]+$ ]] || return 1
  # Force base 10 so readings such as "08" or "09" are not treated as invalid
  # octal numbers when the result is later used in arithmetic.
  printf '%d' "$((10#${raw}))"
}

# Print candidate UPS identifiers, one per line and without duplicates.
# Globals: UPS_HOST (read).
# The configured UPS_HOST comes first (when non-empty); every name reported by
# `upsc -l` then contributes "<name>@localhost" followed by "<name>".
candidate_ups_hosts() {
  local ups_name entry
  local -A emitted=()
  if [[ -n "${UPS_HOST}" ]]; then
    emitted["${UPS_HOST}"]=1
    echo "${UPS_HOST}"
  fi
  while IFS= read -r ups_name; do
    if [[ -z "${ups_name}" ]]; then
      continue
    fi
    for entry in "${ups_name}@localhost" "${ups_name}"; do
      if [[ -z "${emitted[${entry}]+x}" ]]; then
        emitted["${entry}"]=1
        echo "${entry}"
      fi
    done
  done < <(upsc -l 2>/dev/null || true)
}

# Query candidate UPS hosts until one returns a usable battery percentage.
# Globals: UPS_BATTERY_KEY (read), UPS_HOST_IN_USE (written on success).
# Outputs: the integer percentage on stdout, without a trailing newline.
# Returns: 0 on the first usable reading; 1 when upsc is unavailable or no
#          candidate produced a parsable value.
read_ups_battery() {
  command -v upsc >/dev/null 2>&1 || return 1
  local candidate reading percent
  while IFS= read -r candidate; do
    reading="$(upsc "${candidate}" "${UPS_BATTERY_KEY}" 2>/dev/null || true)"
    if [[ -z "${reading}" ]]; then
      continue
    fi
    percent="$(sanitize_battery_percent "${reading}" || true)"
    if [[ -n "${percent}" ]]; then
      UPS_HOST_IN_USE="${candidate}"
      printf '%s' "${percent}"
      return 0
    fi
  done < <(candidate_ups_hosts)
  return 1
}

# Gate bootstrap on the UPS battery level.
# Globals: REQUIRE_UPS_BATTERY, MIN_STARTUP_BATTERY, UPS_HOST (read);
#          UPS_HOST_IN_USE (written by read_ups_battery, read for logging).
# Returns: 0 when the battery is at or above MIN_STARTUP_BATTERY, or when it
#          cannot be read and REQUIRE_UPS_BATTERY is not 1;
#          1 when the battery is too low, or unreadable with the requirement on.
ensure_minimum_battery_for_bootstrap() {
  local battery="" capture_file
  # read_ups_battery records the answering host in UPS_HOST_IN_USE. Calling it
  # inside $(...) would run it in a subshell and lose that assignment, so its
  # stdout is captured through a temp file while it runs in this shell.
  if capture_file="$(mktemp 2>/dev/null)"; then
    read_ups_battery > "${capture_file}" || true
    battery="$(<"${capture_file}")"
    rm -f -- "${capture_file}"
  else
    # No temp file available: still obtain the reading (the host is then
    # reported as UPS_HOST).
    battery="$(read_ups_battery || true)"
  fi
  if [[ -z "${battery}" ]]; then
    if [[ "${REQUIRE_UPS_BATTERY}" -eq 1 ]]; then
      warn "Unable to read UPS battery status and --require-ups-battery is set."
      return 1
    fi
    warn "Unable to read UPS battery status; continuing without hard battery gating."
    return 0
  fi
  log "ups-battery=${battery}% host=${UPS_HOST_IN_USE:-${UPS_HOST}}"
  # 10# forces decimal so a zero-padded reading is not parsed as octal.
  if (( 10#${battery} < MIN_STARTUP_BATTERY )); then
    warn "UPS battery ${battery}% below minimum startup threshold ${MIN_STARTUP_BATTERY}%."
    return 1
  fi
  return 0
}

# Log the URL and branch of the flux-system GitRepository when they can be
# read. Lookup failures are silent and simply produce no log line.
report_flux_source_state() {
  local source_url source_branch
  source_url="$(kubectl -n flux-system get gitrepository flux-system -o jsonpath='{.spec.url}' 2>/dev/null || true)"
  source_branch="$(kubectl -n flux-system get gitrepository flux-system -o jsonpath='{.spec.ref.branch}' 2>/dev/null || true)"
  if [[ -n "${source_url}" ]]; then
    log "flux-source-url=${source_url}"
  fi
  if [[ -n "${source_branch}" ]]; then
    log "flux-source-branch=${source_branch}"
  fi
}

# Test whether a comma-separated list contains an exact element.
# Arguments: $1 - comma-separated list
#            $2 - element to look for (compared literally)
# Returns:   0 when present, 1 otherwise.
csv_has_value() {
  local csv="$1"
  local value="$2"
  # Wrapping both sides in commas turns the element lookup into a substring
  # match that cannot hit a partial element.
  case ",${csv}," in
    *",${value},"*) return 0 ;;
    *) return 1 ;;
  esac
}

# Refuse to continue when the flux-system GitRepository points somewhere
# unexpected.
# Globals: EXECUTE, EXPECTED_FLUX_URL, EXPECTED_FLUX_BRANCH, FORCE_FLUX_BRANCH
#          (all read).
# Skipped in dry-run mode. Otherwise dies when the URL or branch cannot be
# read, when the URL differs from a non-empty EXPECTED_FLUX_URL, or when the
# branch differs from EXPECTED_FLUX_BRANCH and no --force-flux-branch was given.
assert_flux_source_expected() {
  if [[ "${EXECUTE}" -eq 0 ]]; then
    log "DRY-RUN: skipping strict Flux source drift guard"
    return 0
  fi

  local actual_url actual_branch
  actual_url="$(kubectl -n flux-system get gitrepository flux-system -o jsonpath='{.spec.url}' 2>/dev/null || true)"
  actual_branch="$(kubectl -n flux-system get gitrepository flux-system -o jsonpath='{.spec.ref.branch}' 2>/dev/null || true)"
  if [[ -z "${actual_url}" ]]; then
    die "Unable to read Flux source URL from flux-system/gitrepository."
  fi
  if [[ -z "${actual_branch}" ]]; then
    die "Unable to read Flux source branch from flux-system/gitrepository."
  fi

  if [[ -n "${EXPECTED_FLUX_URL}" ]]; then
    [[ "${actual_url}" == "${EXPECTED_FLUX_URL}" ]] \
      || die "Flux source URL drift detected: got '${actual_url}', expected '${EXPECTED_FLUX_URL}'. Refusing startup."
  fi
  if [[ -z "${FORCE_FLUX_BRANCH}" ]]; then
    [[ "${actual_branch}" == "${EXPECTED_FLUX_BRANCH}" ]] \
      || die "Flux source branch drift detected: got '${actual_branch}', expected '${EXPECTED_FLUX_BRANCH}'. Use --force-flux-branch to correct."
  fi
}

# Succeed when the named kustomization is listed in
# STARTUP_OPTIONAL_KUSTOMIZATIONS (comma-separated). An empty list never
# matches.
kustomization_is_optional() {
  local candidate="$1"
  if [[ -z "${STARTUP_OPTIONAL_KUSTOMIZATIONS}" ]]; then
    return 1
  fi
  csv_has_value "${STARTUP_OPTIONAL_KUSTOMIZATIONS}" "${candidate}"
}

# List Flux kustomizations whose Ready condition is not "True".
# Outputs: one "name|message" line per non-ready, non-optional kustomization.
# Returns: 0; an empty or failed kubectl listing yields no output.
list_not_ready_kustomizations() {
  local rows name ready message
  rows="$(kubectl -n flux-system get kustomizations.kustomize.toolkit.fluxcd.io \
    -o 'custom-columns=NAME:.metadata.name,READY:.status.conditions[?(@.type=="Ready")].status,MESSAGE:.status.conditions[?(@.type=="Ready")].message' \
    --no-headers 2>/dev/null || true)"
  [[ -n "${rows}" ]] || return 0
  # custom-columns pads its columns with a variable number of spaces. Letting
  # `read` split the row yields the name, the status, and the rest of the line
  # as the message, without column padding or the status leaking into it.
  while read -r name ready message; do
    [[ -n "${name}" ]] || continue
    if kustomization_is_optional "${name}"; then
      continue
    fi
    if [[ "${ready}" != "True" ]]; then
      printf '%s|%s\n' "${name}" "${message}"
    fi
  done <<< "${rows}"
}

# Ask Flux to reconcile everything: stamp all kustomizations with a fresh
# reconcile.fluxcd.io/requestedAt annotation and, when the flux CLI is
# installed, reconcile the flux-system git source as well.
# Both actions go through `run`, so they are only logged in dry-run mode.
trigger_flux_reconcile_all() {
  local requested_at
  requested_at="$(date --iso-8601=seconds)"
  run kubectl -n flux-system annotate kustomizations.kustomize.toolkit.fluxcd.io --all \
    "reconcile.fluxcd.io/requestedAt=${requested_at}" --overwrite
  if ! command -v flux >/dev/null 2>&1; then
    return 0
  fi
  run flux reconcile source git flux-system -n flux-system --timeout=3m
}

# Delete failed Jobs that carry a Flux kustomize or helm owner label, so Flux
# can recreate them.
# Returns: 0 when at least one Job was selected for deletion; 1 when the Job
#          listing was empty or nothing qualified.
heal_failed_flux_jobs() {
  local listing ns name failed flux_owner helm_owner _rest
  local healed=0
  listing="$(kubectl get jobs.batch -A \
    -o custom-columns=NS:.metadata.namespace,NAME:.metadata.name,FAILED:.status.failed,FLUX_OWNER:.metadata.labels.kustomize\\.toolkit\\.fluxcd\\.io/name,HELM_OWNER:.metadata.labels.helm\\.toolkit\\.fluxcd\\.io/name \
    --no-headers 2>/dev/null || true)"
  if [[ -z "${listing}" ]]; then
    return 1
  fi

  # Each row is: namespace, job name, failed count, flux owner, helm owner.
  while read -r ns name failed flux_owner helm_owner _rest; do
    # Skip blank rows and rows whose failed count is "<none>" or non-numeric.
    if [[ -z "${ns}" || "${failed}" == "<none>" ]]; then
      continue
    fi
    if ! [[ "${failed}" =~ ^[0-9]+$ ]]; then
      continue
    fi
    (( failed > 0 )) || continue
    # Only Jobs owned by Flux (kustomize or helm label present) are touched.
    if [[ "${flux_owner}" == "<none>" && "${helm_owner}" == "<none>" ]]; then
      continue
    fi
    warn "Deleting failed Flux-managed Job ${ns}/${name} to heal immutable-template drift."
    run kubectl -n "${ns}" delete job "${name}" --ignore-not-found
    healed=1
  done <<< "${listing}"

  [[ "${healed}" -eq 1 ]]
}

# Poll until every non-optional Flux kustomization reports Ready.
# Globals: EXECUTE, FLUX_READY_TIMEOUT_SECONDS, FLUX_READY_POLL_SECONDS (read).
# Skipped in dry-run mode. While waiting, a status message that looks like an
# immutable-Job failure triggers up to three cleanup attempts
# (heal_failed_flux_jobs followed by a reconcile request when something was
# healed). Dies once FLUX_READY_TIMEOUT_SECONDS have elapsed.
wait_for_flux_kustomizations_ready() {
  if [[ "${EXECUTE}" -eq 0 ]]; then
    log "DRY-RUN: skipping wait for all Flux kustomizations Ready"
    return 0
  fi
  # `line` is declared local so the reporting loop does not overwrite a
  # same-named variable belonging to the caller or the script.
  local start now not_ready immutable_hits line
  start="$(date +%s)"
  immutable_hits=0
  while true; do
    not_ready="$(list_not_ready_kustomizations || true)"
    if [[ -z "${not_ready}" ]]; then
      log "flux-kustomizations=all-ready"
      return 0
    fi

    log "flux-kustomizations-not-ready:"
    while IFS= read -r line; do
      [[ -n "${line}" ]] || continue
      log "  ${line}"
    done <<< "${not_ready}"

    if grep -Eqi 'immutable|field is immutable|cannot patch.*Job|Job.*invalid' <<< "${not_ready}"; then
      if (( immutable_hits < 3 )); then
        immutable_hits=$(( immutable_hits + 1 ))
        warn "Detected immutable Job failure signal in Flux status. Attempting automated Job cleanup (${immutable_hits}/3)."
        if heal_failed_flux_jobs; then
          trigger_flux_reconcile_all
        fi
      fi
    fi

    now="$(date +%s)"
    if (( now - start >= FLUX_READY_TIMEOUT_SECONDS )); then
      die "Timed out waiting for Flux kustomizations Ready after ${FLUX_READY_TIMEOUT_SECONDS}s."
    fi
    sleep "${FLUX_READY_POLL_SECONDS}"
  done
}

# Print the built-in external service checklist, one row per line.
# The consumer in this file splits each row on '|' into:
# name|url|expected statuses|required body text|forbidden body text|insecure.
default_startup_service_checklist() {
  printf '%s\n' \
    'gitea|https://scm.bstein.dev/api/healthz|200|"status":"pass"||' \
    'grafana|https://metrics.bstein.dev/api/health|200|"database":"ok"||' \
    'harbor|https://registry.bstein.dev/v2/|401|unauthorized|<html|'
}

# Print every host named in any Ingress rule across all namespaces, sorted and
# de-duplicated, with blank lines removed.
list_ingress_hosts() {
  local host_template='{range .items[*]}{range .spec.rules[*]}{.host}{"\n"}{end}{end}'
  kubectl get ingress -A -o "jsonpath=${host_template}" 2>/dev/null \
    | sed '/^[[:space:]]*$/d' \
    | sort -u
}

# Emit one service-check row per ingress host.
# Globals: STARTUP_IGNORE_INGRESS_HOSTS_REGEX, STARTUP_INGRESS_ALLOWED_STATUSES,
#          STARTUP_INGRESS_CHECK_TIMEOUT_SECONDS (read).
# Hosts matching the ignore regex (when one is set) are left out. Each row has
# the form: ingress-<host>|https://<host>/|<statuses>|||0|<timeout>.
generated_ingress_service_checks() {
  local ingress_host
  while IFS= read -r ingress_host; do
    if [[ -z "${ingress_host}" ]]; then
      continue
    fi
    if [[ -n "${STARTUP_IGNORE_INGRESS_HOSTS_REGEX}" && "${ingress_host}" =~ ${STARTUP_IGNORE_INGRESS_HOSTS_REGEX} ]]; then
      continue
    fi
    printf 'ingress-%s|https://%s/|%s|||0|%s\n' \
      "${ingress_host}" \
      "${ingress_host}" \
      "${STARTUP_INGRESS_ALLOWED_STATUSES}" \
      "${STARTUP_INGRESS_CHECK_TIMEOUT_SECONDS}"
  done < <(list_ingress_hosts)
}

# Print all service-check rows to evaluate during startup.
# Globals: STARTUP_SERVICE_CHECKLIST (read; ';' separates rows),
#          STARTUP_INCLUDE_INGRESS_CHECKS (read; "1" or "true" enables).
# A non-empty STARTUP_SERVICE_CHECKLIST replaces the built-in checklist. Blank
# rows are dropped, and generated ingress rows are appended when enabled.
startup_service_checklist_rows() {
  local checklist
  case "${STARTUP_SERVICE_CHECKLIST}" in
    "")
      checklist="$(default_startup_service_checklist)"
      ;;
    *)
      checklist="$(printf '%s' "${STARTUP_SERVICE_CHECKLIST}" | tr ';' '\n')"
      ;;
  esac

  printf '%s\n' "${checklist}" | sed '/^[[:space:]]*$/d'
  case "${STARTUP_INCLUDE_INGRESS_CHECKS}" in
    1|true)
      generated_ingress_service_checks
      ;;
  esac
}

# Test whether an HTTP status code is in a comma-separated allow-list.
# Arguments: $1 - comma-separated allowed status codes (whitespace around
#                 entries is ignored, so "200, 301" works)
#            $2 - status code that was received
# Returns:   0 when allowed, 1 otherwise (including an empty allow-list).
service_status_allowed() {
  local expected_csv="$1"
  local got="$2"
  local token
  # Local array: the split result must not leak into the script's globals.
  local -a statuses=()
  IFS=',' read -r -a statuses <<< "${expected_csv}"
  # The ${arr[@]+...} form keeps an empty array from tripping `set -u` on
  # bash releases older than 4.4.
  for token in ${statuses[@]+"${statuses[@]}"}; do
    token="${token//[[:space:]]/}"
    if [[ "${token}" == "${got}" ]]; then
      return 0
    fi
  done
  return 1
}

# Run the mail safeguards once: every listed mail service must have endpoint
# addresses and every listed TCP port on the mail host must accept a
# connection.
# Globals:   STARTUP_REQUIRE_MAIL_SAFEGUARDS, MAIL_STARTUP_NAMESPACE,
#            MAIL_STARTUP_ENDPOINT_SERVICES, MAIL_STARTUP_HOST,
#            MAIL_STARTUP_TCP_PORTS, MAIL_STARTUP_TCP_TIMEOUT_SECONDS (read).
# Arguments: $1 - "1" suppresses the per-failure warnings (default 0).
# Returns:   0 when the safeguards are disabled or everything passed,
#            non-zero when any endpoint or port check failed.
check_mail_safeguards_once() {
  local quiet="${1:-0}"
  local failures=0 namespace service host port ips
  local -a services=() ports=()
  if [[ "${STARTUP_REQUIRE_MAIL_SAFEGUARDS}" != "1" && "${STARTUP_REQUIRE_MAIL_SAFEGUARDS}" != "true" ]]; then
    return 0
  fi

  namespace="${MAIL_STARTUP_NAMESPACE}"
  as_array_from_csv "${MAIL_STARTUP_ENDPOINT_SERVICES}" services
  # ${arr[@]+...} keeps an empty list from tripping `set -u` on older bash.
  for service in ${services[@]+"${services[@]}"}; do
    service="${service//[[:space:]]/}"
    [[ -n "${service}" ]] || continue
    ips="$(kubectl -n "${namespace}" get endpoints "${service}" -o jsonpath='{.subsets[*].addresses[*].ip}' 2>/dev/null || true)"
    if [[ -z "${ips//[[:space:]]/}" ]]; then
      if [[ "${quiet}" != "1" ]]; then
        warn "startup-check mail-endpoints ${namespace}/${service}: no ready endpoints."
      fi
      failures=1
    fi
  done

  host="${MAIL_STARTUP_HOST}"
  if [[ -n "${host}" ]]; then
    as_array_from_csv "${MAIL_STARTUP_TCP_PORTS}" ports
    for port in ${ports[@]+"${ports[@]}"}; do
      port="${port//[[:space:]]/}"
      [[ "${port}" =~ ^[0-9]+$ ]] || continue
      # The probe opens bash's /dev/tcp pseudo-device in a child shell bounded
      # by `timeout`. Host and port are passed as positional parameters rather
      # than spliced into the command string, and a plain (non-login) shell is
      # used so profile scripts cannot eat into the short timeout.
      if ! timeout "${MAIL_STARTUP_TCP_TIMEOUT_SECONDS}" \
        bash -c ': <"/dev/tcp/$1/$2"' _ "${host}" "${port}" >/dev/null 2>&1; then
        if [[ "${quiet}" != "1" ]]; then
          warn "startup-check mail-tcp ${host}:${port}: connect failed."
        fi
        failures=1
      fi
    done
  fi

  (( failures == 0 ))
}

# Evaluate every service-check row once, then the mail safeguards.
# Globals: STARTUP_SERVICE_CHECK_TIMEOUT_SECONDS (read; per-row fallback).
# Row format ('|' separated):
#   name|url|allowed statuses|required body text|forbidden body text|insecure|timeout
# Rows missing a name, URL or status list are skipped. "insecure" (1/true)
# disables TLS verification for that request.
# Returns: 0 when every row and the mail safeguards passed, non-zero otherwise.
#          A warning is emitted for each failing row.
check_startup_service_checklist_once() {
  local rows row name url expected body_must body_must_not insecure timeout code rc
  local body_file failures reason
  local -a curl_args
  failures=0
  rows="$(startup_service_checklist_rows)"
  while IFS= read -r row; do
    [[ -n "${row}" ]] || continue
    IFS='|' read -r name url expected body_must body_must_not insecure timeout <<< "${row}"
    [[ -n "${name}" && -n "${url}" && -n "${expected}" ]] || continue
    [[ -n "${insecure}" ]] || insecure=0
    [[ -n "${timeout}" ]] || timeout="${STARTUP_SERVICE_CHECK_TIMEOUT_SECONDS}"
    body_file="$(mktemp)"
    curl_args=(-sS --max-time "${timeout}" -o "${body_file}" -w '%{http_code}')
    if [[ "${insecure}" == "1" || "${insecure}" == "true" ]]; then
      curl_args=(-k "${curl_args[@]}")
    fi
    # The status must be captured outside the command substitution: an
    # assignment made inside $(...) happens in a subshell and never reaches
    # this shell, which would leave rc at 0 for every failed request.
    rc=0
    code="$(curl "${curl_args[@]}" "${url}")" || rc=$?

    # Work out the first failing criterion, if any; the body file is removed
    # exactly once regardless of the outcome.
    reason=""
    if (( rc != 0 )); then
      reason="request failed (rc=${rc}) url=${url}"
    elif ! service_status_allowed "${expected}" "${code}"; then
      reason="expected status ${expected}, got ${code} url=${url}"
    elif [[ -n "${body_must}" ]] && ! grep -Fq -- "${body_must}" "${body_file}"; then
      reason="missing required body fragment '${body_must}'"
    elif [[ -n "${body_must_not}" ]] && grep -Fq -- "${body_must_not}" "${body_file}"; then
      reason="forbidden body fragment '${body_must_not}' present"
    fi
    rm -f "${body_file}"
    if [[ -n "${reason}" ]]; then
      warn "startup-check ${name}: ${reason}"
      failures=1
    fi
  done <<< "${rows}"
  if ! check_mail_safeguards_once; then
    failures=1
  fi
  (( failures == 0 ))
}

# Poll until the external service checklist passes and no workload is
# reported unhealthy. Dry-run mode logs and returns immediately. If
# STARTUP_CHECKLIST_TIMEOUT_SECONDS elapses first, the script dies.
wait_for_startup_service_checklist() {
  local began elapsed services_pass workloads_pass
  if [[ "${EXECUTE}" -eq 0 ]]; then
    log "DRY-RUN: skipping startup external service checklist wait"
    return 0
  fi
  began="$(date +%s)"
  while true; do
    services_pass=0
    if check_startup_service_checklist_once; then
      services_pass=1
    fi
    # Any non-blank line from list_unhealthy_workloads counts as "not ready".
    workloads_pass=1
    if list_unhealthy_workloads | sed '/^[[:space:]]*$/d' | grep -q .; then
      workloads_pass=0
    fi
    if (( services_pass == 1 && workloads_pass == 1 )); then
      log "startup-checklist=all-passed"
      return 0
    fi
    if (( workloads_pass == 0 )); then
      warn "startup-checklist: workloads are not fully ready yet."
    fi
    elapsed=$(( $(date +%s) - began ))
    if (( elapsed >= STARTUP_CHECKLIST_TIMEOUT_SECONDS )); then
      die "Timed out waiting for startup external checklist after ${STARTUP_CHECKLIST_TIMEOUT_SECONDS}s."
    fi
    sleep "${STARTUP_CHECKLIST_POLL_SECONDS}"
  done
}

# Print "namespace/pod|status" for every pod whose STATUS column (4th field
# of `kubectl get pods -A --no-headers`) matches one of the known failure
# states. Rows matching STARTUP_IGNORE_PODS_REGEX, when set, are removed.
# kubectl failures are swallowed and yield empty output.
collect_unstable_pods() {
  local unstable
  unstable="$(kubectl get pods -A --no-headers 2>/dev/null \
    | awk '$4 ~ /(CrashLoopBackOff|ImagePullBackOff|ErrImagePull|CreateContainerConfigError|RunContainerError|InvalidImageName)/ {print $1 "/" $2 "|" $4}' || true)"
  if [[ -z "${STARTUP_IGNORE_PODS_REGEX}" ]]; then
    printf '%s' "${unstable}"
    return
  fi
  unstable="$(grep -Ev "${STARTUP_IGNORE_PODS_REGEX}" <<< "${unstable}" || true)"
  printf '%s' "${unstable}"
}

# Hold startup until the cluster has been continuously healthy for
# STARTUP_STABILITY_WINDOW_SECONDS.
#
# Every poll checks four signals: Flux kustomizations that are not ready,
# unstable pods, the external service checklist, and unhealthy workloads.
# Any failing signal restarts the healthy window. Dry-run mode returns
# immediately. If STARTUP_STABILITY_TIMEOUT_SECONDS passes without a full
# healthy window, the script dies.
wait_for_startup_stability_window() {
  if [[ "${EXECUTE}" -eq 0 ]]; then
    log "DRY-RUN: skipping startup stability window"
    return 0
  fi
  # 'line' is declared local so the report loops below do not leak it into
  # (or clobber it in) the caller's scope.
  local hard_deadline stable_since now unstable pods not_ready unhealthy_workloads line
  stable_since="$(date +%s)"
  hard_deadline=$(( stable_since + STARTUP_STABILITY_TIMEOUT_SECONDS ))
  while true; do
    unstable=0
    not_ready="$(list_not_ready_kustomizations || true)"
    if [[ -n "${not_ready}" ]]; then
      unstable=1
      warn "stability-window: Flux kustomizations not ready."
    fi
    pods="$(collect_unstable_pods || true)"
    if [[ -n "${pods}" ]]; then
      unstable=1
      warn "stability-window: unstable pods detected."
      while IFS= read -r line; do
        [[ -n "${line}" ]] || continue
        warn "  ${line}"
      done <<< "${pods}"
    fi
    if ! check_startup_service_checklist_once; then
      unstable=1
      warn "stability-window: external service checklist failed."
    fi
    unhealthy_workloads="$(list_unhealthy_workloads || true)"
    if [[ -n "${unhealthy_workloads}" ]]; then
      unstable=1
      warn "stability-window: workloads not fully ready."
      while IFS= read -r line; do
        [[ -n "${line}" ]] || continue
        warn "  ${line}"
      done <<< "${unhealthy_workloads}"
    fi

    now="$(date +%s)"
    if (( unstable == 0 )); then
      if (( now - stable_since >= STARTUP_STABILITY_WINDOW_SECONDS )); then
        log "startup-stability-window=passed (${STARTUP_STABILITY_WINDOW_SECONDS}s)"
        return 0
      fi
    else
      # Any instability restarts the healthy window from this poll.
      stable_since="${now}"
    fi

    if (( now >= hard_deadline )); then
      die "Timed out waiting for startup stability window (${STARTUP_STABILITY_WINDOW_SECONDS}s healthy) within ${STARTUP_STABILITY_TIMEOUT_SECONDS}s."
    fi
    sleep "${STARTUP_STABILITY_POLL_SECONDS}"
  done
}

# Wait for the Kubernetes API to answer `kubectl version`.
# The number of tries is API_WAIT_TIMEOUT_SECONDS / 5 (at least one), with a
# 5 second pause after each failed try. Dry-run mode skips the wait.
# Returns: 0 once the API answers (or in dry-run), 1 when all tries fail.
wait_for_api() {
  local max_tries=$(( API_WAIT_TIMEOUT_SECONDS / 5 ))
  local try
  (( max_tries >= 1 )) || max_tries=1
  if [[ "${EXECUTE}" -eq 0 ]]; then
    log "DRY-RUN: skipping live Kubernetes API wait"
    return 0
  fi
  for (( try = 1; try <= max_tries; try++ )); do
    if kubectl version --request-timeout=5s >/dev/null 2>&1; then
      return 0
    fi
    sleep 5
  done
  return 1
}

# Set spec.suspend to $1 on every Flux Kustomization in flux-system and on
# every HelmRelease in all namespaces. Listing failures are tolerated and
# simply produce nothing to patch.
patch_flux_suspend_all() {
  local value="$1"
  local body="{\"spec\":{\"suspend\":${value}}}"
  local kustomizations helmreleases entry

  kustomizations="$(kubectl -n flux-system get kustomizations.kustomize.toolkit.fluxcd.io -o jsonpath='{range .items[*]}{.metadata.name}{"\n"}{end}' || true)"
  helmreleases="$(kubectl get helmreleases.helm.toolkit.fluxcd.io -A -o jsonpath='{range .items[*]}{.metadata.namespace}{"/"}{.metadata.name}{"\n"}{end}' || true)"

  while IFS= read -r entry; do
    [[ -n "${entry}" ]] || continue
    run kubectl -n flux-system patch kustomization "${entry}" --type=merge -p "${body}"
  done <<< "${kustomizations}"

  # HelmRelease entries are listed as "namespace/name".
  while IFS= read -r entry; do
    [[ -n "${entry}" ]] || continue
    run kubectl -n "${entry%%/*}" patch helmrelease "${entry##*/}" --type=merge -p "${body}"
  done <<< "${helmreleases}"
}

# Set spec.suspend to $2 on the flux-system Kustomization named $1.
# A Kustomization that cannot be fetched is skipped with a warning.
patch_kustomization_suspend() {
  local name="$1"
  local value="$2"
  local body="{\"spec\":{\"suspend\":${value}}}"
  if ! kubectl -n flux-system get kustomization "${name}" >/dev/null 2>&1; then
    warn "Flux Kustomization ${name} not found; skipping suspend=${value}."
    return
  fi
  run kubectl -n flux-system patch kustomization "${name}" --type=merge -p "${body}"
}

# Print each item of the comma-separated list $1 on its own line.
# All whitespace is removed from every item and empty items are dropped.
# Only the first line of $1 is considered.
csv_each() {
  local csv="$1"
  local item
  # Keep the scratch array local so repeated calls never leak or clobber a
  # global variable.
  local -a items=()
  IFS=',' read -r -a items <<< "${csv}"
  # Nothing to print for an empty list; this also avoids expanding an empty
  # array, which older Bash releases reject under `set -u`.
  (( ${#items[@]} > 0 )) || return 0
  for item in "${items[@]}"; do
    item="${item//[[:space:]]/}"
    [[ -n "${item}" ]] || continue
    printf '%s\n' "${item}"
  done
}

# Record the current spec.suspend value of every optional recovery
# Kustomization into RECOVERY_FLUX_OPTIONAL_SNAPSHOT_FILE as
# "name<TAB>suspend" lines. The file is truncated first. Kustomizations that
# cannot be fetched are left out, and an empty suspend value is stored as
# "false". Does nothing unless EXECUTE is 1.
save_recovery_optional_flux_snapshot() {
  [[ "${EXECUTE}" -eq 1 ]] || return 0
  mkdir -p "$(dirname "${RECOVERY_FLUX_OPTIONAL_SNAPSHOT_FILE}")"
  : > "${RECOVERY_FLUX_OPTIONAL_SNAPSHOT_FILE}"
  local ks state
  while IFS= read -r ks; do
    kubectl -n flux-system get kustomization "${ks}" >/dev/null 2>&1 || continue
    state="$(kubectl -n flux-system get kustomization "${ks}" -o jsonpath='{.spec.suspend}' 2>/dev/null || true)"
    printf '%s\t%s\n' "${ks}" "${state:-false}" >> "${RECOVERY_FLUX_OPTIONAL_SNAPSHOT_FILE}"
  done < <(csv_each "${RECOVERY_FLUX_OPTIONAL_KUSTOMIZATIONS}")
  log "recovery-flux-optional-snapshot=${RECOVERY_FLUX_OPTIONAL_SNAPSHOT_FILE}"
}

# Set spec.suspend to $1 on every optional recovery Kustomization.
# When suspending ("true"), the current suspend state is snapshotted first
# via save_recovery_optional_flux_snapshot.
patch_recovery_optional_flux_suspend() {
  local value="$1"
  [[ "${value}" != "true" ]] || save_recovery_optional_flux_snapshot
  # The patch loop itself is the snapshot-free sibling.
  patch_recovery_optional_flux_suspend_without_snapshot "${value}"
}

# Set spec.suspend to $1 on every Kustomization named in
# RECOVERY_FLUX_OPTIONAL_KUSTOMIZATIONS, leaving the snapshot file untouched.
patch_recovery_optional_flux_suspend_without_snapshot() {
  local value="$1" ks
  while IFS= read -r ks; do
    patch_kustomization_suspend "${ks}" "${value}"
  done < <(csv_each "${RECOVERY_FLUX_OPTIONAL_KUSTOMIZATIONS}")
}

# Set spec.suspend to $1 on every Kustomization named in
# RECOVERY_FLUX_CRITICAL_KUSTOMIZATIONS.
patch_recovery_critical_flux_suspend() {
  local value="$1" ks
  while IFS= read -r ks; do
    patch_kustomization_suspend "${ks}" "${value}"
  done < <(csv_each "${RECOVERY_FLUX_CRITICAL_KUSTOMIZATIONS}")
}

# Set spec.suspend to $1 on every HelmRelease listed in
# RECOVERY_FLUX_CRITICAL_HELMRELEASES, a CSV of "namespace/name" references.
# Entries without a '/' or with an empty namespace or name are ignored.
patch_recovery_critical_helmrelease_suspend() {
  local value="$1"
  local ref namespace name
  while IFS= read -r ref; do
    # Detect a missing separator explicitly. Comparing namespace to name
    # would also reject valid references whose release is named after its
    # namespace (for example "harbor/harbor").
    [[ "${ref}" == */* ]] || continue
    namespace="${ref%%/*}"
    name="${ref##*/}"
    [[ -n "${namespace}" && -n "${name}" ]] || continue
    patch_helmrelease_suspend "${namespace}" "${name}" "${value}"
  done < <(csv_each "${RECOVERY_FLUX_CRITICAL_HELMRELEASES}")
}

# Print the names of recovery-held Kustomizations that exist but do not
# currently report spec.suspend=true. Candidates are the optional recovery
# Kustomizations, plus flux-system when
# RECOVERY_FLUX_SUSPEND_BOOTSTRAP_KUSTOMIZATION is "1" or "true".
recovery_flux_unsuspended_list() {
  local candidates=()
  local ks state
  case "${RECOVERY_FLUX_SUSPEND_BOOTSTRAP_KUSTOMIZATION}" in
    1|true) candidates+=("flux-system") ;;
  esac
  while IFS= read -r ks; do
    candidates+=("${ks}")
  done < <(csv_each "${RECOVERY_FLUX_OPTIONAL_KUSTOMIZATIONS}")

  for ks in "${candidates[@]}"; do
    # Kustomizations that cannot be fetched are not reported.
    kubectl -n flux-system get kustomization "${ks}" >/dev/null 2>&1 || continue
    state="$(kubectl -n flux-system get kustomization "${ks}" -o jsonpath='{.spec.suspend}' 2>/dev/null || true)"
    [[ "${state}" == "true" ]] || printf '%s\n' "${ks}"
  done
}

# Wait up to 90 seconds for all pods labelled app=kustomize-controller in
# flux-system to disappear, polling every 2 seconds.
# Returns: 0 when no pods remain (or when EXECUTE is not 1), 1 on timeout.
wait_for_kustomize_controller_scaled_down() {
  [[ "${EXECUTE}" -eq 1 ]] || return 0

  local give_up_at remaining
  give_up_at=$(( SECONDS + 90 ))
  until (( SECONDS >= give_up_at )); do
    remaining="$(kubectl -n flux-system get pods -l app=kustomize-controller -o jsonpath='{range .items[*]}{.metadata.name}{"\n"}{end}' 2>/dev/null || true)"
    [[ -n "${remaining//[[:space:]]/}" ]] || return 0
    sleep 2
  done

  warn "Timed out waiting for kustomize-controller pods to terminate before final Flux suspend reassertion."
  return 1
}

# Last-resort Flux hold: scale kustomize-controller to zero, reassert the
# recovery suspends, then either bring the controller back
# (RECOVERY_FLUX_FINAL_RESTART_KUSTOMIZE_CONTROLLER is "1"/"true") or leave
# it stopped and reassert the suspends once more.
# Returns: 1 when the deployment is missing or some recovery Kustomization
# still is not suspended afterwards; does nothing unless EXECUTE is 1.
force_recovery_flux_suspend_with_controller_stop() {
  [[ "${EXECUTE}" -eq 1 ]] || return 0

  local still_active

  if ! kubectl -n flux-system get deployment kustomize-controller >/dev/null 2>&1; then
    warn "kustomize-controller deployment not found; cannot use controller-stop Flux suspend finalization."
    return 1
  fi

  warn "Stopping kustomize-controller for final Flux suspend reassertion."
  run kubectl -n flux-system scale deployment kustomize-controller --replicas=0
  wait_for_kustomize_controller_scaled_down || true

  case "${RECOVERY_FLUX_SUSPEND_BOOTSTRAP_KUSTOMIZATION}" in
    1|true) patch_kustomization_suspend flux-system true ;;
  esac
  patch_recovery_optional_flux_suspend_without_snapshot true

  case "${RECOVERY_FLUX_FINAL_RESTART_KUSTOMIZE_CONTROLLER}" in
    1|true)
      run kubectl -n flux-system scale deployment kustomize-controller --replicas=1
      kubectl -n flux-system rollout status deployment/kustomize-controller --timeout=2m || warn "kustomize-controller did not become Ready after final Flux suspend reassertion."
      sleep "${RECOVERY_FLUX_SUSPEND_VERIFY_SLEEP_SECONDS}"
      ;;
    *)
      warn "Leaving kustomize-controller stopped to preserve the recovery Flux hold."
      sleep "${RECOVERY_FLUX_SUSPEND_VERIFY_SLEEP_SECONDS}"
      case "${RECOVERY_FLUX_SUSPEND_BOOTSTRAP_KUSTOMIZATION}" in
        1|true) patch_kustomization_suspend flux-system true ;;
      esac
      patch_recovery_optional_flux_suspend_without_snapshot true
      ;;
  esac

  still_active="$(recovery_flux_unsuspended_list | paste -sd, -)"
  if [[ -n "${still_active}" ]]; then
    warn "Flux suspend state is still not stable after controller-stop finalization: ${still_active}"
    return 1
  fi

  log "recovery-flux-suspend=verified-controller-stop"
}

# Reassert the recovery Flux suspends up to
# RECOVERY_FLUX_SUSPEND_VERIFY_ATTEMPTS times, sleeping
# RECOVERY_FLUX_SUSPEND_VERIFY_SLEEP_SECONDS after each round and then
# checking recovery_flux_unsuspended_list. If the state never settles, fall
# back to force_recovery_flux_suspend_with_controller_stop and return its
# status. Does nothing unless EXECUTE is 1.
stabilize_recovery_flux_suspend() {
  [[ "${EXECUTE}" -eq 1 ]] || return 0

  local round still_active
  for round in $(seq 1 "${RECOVERY_FLUX_SUSPEND_VERIFY_ATTEMPTS}"); do
    case "${RECOVERY_FLUX_SUSPEND_BOOTSTRAP_KUSTOMIZATION}" in
      1|true) patch_kustomization_suspend flux-system true ;;
    esac
    patch_recovery_optional_flux_suspend_without_snapshot true
    sleep "${RECOVERY_FLUX_SUSPEND_VERIFY_SLEEP_SECONDS}"

    still_active="$(recovery_flux_unsuspended_list | paste -sd, -)"
    if [[ -z "${still_active}" ]]; then
      log "recovery-flux-suspend=verified attempts=${round}"
      return 0
    fi
    warn "Flux suspend state was overwritten during recovery thaw; reasserting attempt ${round}/${RECOVERY_FLUX_SUSPEND_VERIFY_ATTEMPTS}: ${still_active}"
  done

  # Re-check once more before escalating to the controller-stop path.
  still_active="$(recovery_flux_unsuspended_list | paste -sd, -)"
  if [[ -z "${still_active}" ]]; then
    return 0
  fi
  warn "Flux suspend state is still not stable after verification attempts: ${still_active}"
  force_recovery_flux_suspend_with_controller_stop
}

# Re-apply the suspend values recorded in RECOVERY_FLUX_OPTIONAL_SNAPSHOT_FILE
# ("name<TAB>suspend" per line). Any value other than "true"/"false" is
# treated as "false". A missing snapshot file is a no-op.
restore_recovery_optional_flux_suspend() {
  [[ -f "${RECOVERY_FLUX_OPTIONAL_SNAPSHOT_FILE}" ]] || return 0
  local ks state
  while IFS=$'\t' read -r ks state; do
    [[ -n "${ks}" ]] || continue
    case "${state}" in
      true|false) ;;
      *) state="false" ;;
    esac
    patch_kustomization_suspend "${ks}" "${state}"
  done < "${RECOVERY_FLUX_OPTIONAL_SNAPSHOT_FILE}"
}

# Stamp reconcile.fluxcd.io/requestedAt with the current ISO-8601 time on
# each flux-system Kustomization named in the CSV list $1. Names that cannot
# be fetched are skipped silently.
annotate_flux_kustomizations() {
  local stamp ks
  stamp="$(date --iso-8601=seconds)"
  while IFS= read -r ks; do
    kubectl -n flux-system get kustomization "${ks}" >/dev/null 2>&1 || continue
    run kubectl -n flux-system annotate kustomization "${ks}" "reconcile.fluxcd.io/requestedAt=${stamp}" --overwrite
  done < <(csv_each "$1")
}

# Roll-restart the kustomize-controller deployment when
# RECOVERY_FLUX_RESTART_KUSTOMIZE_CONTROLLER is "1" or "true" and the
# deployment exists; otherwise do nothing.
restart_kustomize_controller_for_critical_thaw() {
  case "${RECOVERY_FLUX_RESTART_KUSTOMIZE_CONTROLLER}" in
    1|true) ;;
    *) return 0 ;;
  esac
  kubectl -n flux-system get deployment kustomize-controller >/dev/null 2>&1 || return 0
  warn "Restarting kustomize-controller after optional Flux suspension to clear any single-worker health-check backlog."
  run kubectl -n flux-system rollout restart deployment kustomize-controller
}

# Build a quiet window in which only the critical Flux objects are thawed:
#   1. stop kustomize-controller (when the deployment exists),
#   2. snapshot and suspend the optional Kustomizations, then suspend all
#      Kustomizations and HelmReleases,
#   3. refresh the flux-system git source when the flux CLI is available,
#   4. unsuspend the critical Kustomizations and HelmReleases,
#   5. reassert the bootstrap/optional suspends and start the controller.
# Does nothing unless EXECUTE is 1.
prepare_recovery_flux_critical_thaw() {
  [[ "${EXECUTE}" -eq 1 ]] || return 0

  if kubectl -n flux-system get deployment kustomize-controller >/dev/null 2>&1; then
    warn "Stopping kustomize-controller to create a quiet Flux critical-thaw window."
    run kubectl -n flux-system scale deployment kustomize-controller --replicas=0
    wait_for_kustomize_controller_scaled_down || true
  fi

  patch_recovery_optional_flux_suspend true
  patch_flux_suspend_all true

  # The source refresh is best-effort; a failure must not abort the thaw.
  if command -v flux >/dev/null 2>&1; then
    run flux reconcile source git flux-system -n flux-system --timeout=3m || true
  fi

  patch_recovery_critical_flux_suspend false
  patch_recovery_critical_helmrelease_suspend false
  case "${RECOVERY_FLUX_SUSPEND_BOOTSTRAP_KUSTOMIZATION}" in
    1|true) patch_kustomization_suspend flux-system true ;;
  esac
  patch_recovery_optional_flux_suspend_without_snapshot true

  if kubectl -n flux-system get deployment kustomize-controller >/dev/null 2>&1; then
    run kubectl -n flux-system scale deployment kustomize-controller --replicas=1
    kubectl -n flux-system rollout status deployment/kustomize-controller --timeout=2m || warn "kustomize-controller did not become Ready for Flux critical thaw."
  fi
}

# Set spec.suspend to $3 on HelmRelease $2 in namespace $1.
# A HelmRelease that cannot be fetched is skipped with a warning.
patch_helmrelease_suspend() {
  local namespace="$1"
  local name="$2"
  local value="$3"
  local body="{\"spec\":{\"suspend\":${value}}}"
  if ! kubectl -n "${namespace}" get helmrelease "${name}" >/dev/null 2>&1; then
    warn "HelmRelease ${namespace}/${name} not found; skipping suspend=${value}."
    return
  fi
  run kubectl -n "${namespace}" patch helmrelease "${name}" --type=merge -p "${body}"
}

# For kustomize-controller and then helm-controller, wait (up to 90 seconds
# each, polling every 2 seconds) until no pods with the matching app label
# remain in flux-system. A timeout only produces a warning. Dry-run mode
# logs and returns.
wait_for_flux_reconciler_pods_stopped() {
  local controller began remaining
  if [[ "${EXECUTE}" -eq 0 ]]; then
    log "DRY-RUN: wait for Flux reconcilers to stop"
    return 0
  fi
  for controller in kustomize-controller helm-controller; do
    began="$(date +%s)"
    while true; do
      remaining="$(kubectl -n flux-system get pods -l "app=${controller}" --no-headers 2>/dev/null || true)"
      if [[ -z "${remaining}" ]]; then
        log "flux-reconciler-stopped=${controller}"
        break
      fi
      if (( $(date +%s) - began >= 90 )); then
        warn "Timed out waiting for ${controller} pods to stop."
        break
      fi
      sleep 2
    done
  done
}

# Stop the Flux reconcilers and suspend the Flux objects that manage
# Longhorn: scale kustomize-controller and helm-controller to zero (when
# present), wait for their pods to stop, suspend the flux-system, helm and
# longhorn Kustomizations plus the longhorn HelmRelease, then record the
# checkpoint.
freeze_longhorn_deadlock_automation() {
  local deployment ks
  warn "Freezing only the automation that can fight Longhorn emergency recovery."
  for deployment in kustomize-controller helm-controller; do
    if kubectl -n flux-system get deployment "${deployment}" >/dev/null 2>&1; then
      run kubectl -n flux-system scale deployment "${deployment}" --replicas=0
    fi
  done
  wait_for_flux_reconciler_pods_stopped
  for ks in flux-system helm longhorn; do
    patch_kustomization_suspend "${ks}" true
  done
  patch_helmrelease_suspend longhorn-system longhorn true
  mark_checkpoint longhorn_unlock_automation_frozen
}

# Switch Longhorn to cached-image-first pulls: merge IfNotPresent /
# if-not-present pull policies into the longhorn HelmRelease values and set
# imagePullPolicy=IfNotPresent on the longhorn-manager DaemonSet container.
# Each object is patched only when it exists.
ensure_longhorn_cache_first_policy() {
  local release_patch='{"spec":{"values":{"image":{"pullPolicy":"IfNotPresent"},"defaultSettings":{"systemManagedPodsImagePullPolicy":"if-not-present"}}}}'
  local daemonset_patch='{"spec":{"template":{"spec":{"containers":[{"name":"longhorn-manager","imagePullPolicy":"IfNotPresent"}]}}}}'

  if kubectl -n longhorn-system get helmrelease longhorn >/dev/null 2>&1; then
    run kubectl -n longhorn-system patch helmrelease longhorn --type=merge -p "${release_patch}"
  fi

  if kubectl -n longhorn-system get daemonset longhorn-manager >/dev/null 2>&1; then
    run kubectl -n longhorn-system patch daemonset longhorn-manager --type=strategic -p "${daemonset_patch}"
  fi
}

# Remove every container named pre-pull-share-manager-image from the
# longhorn-manager DaemonSet with JSON-patch "remove" operations. Container
# positions are processed highest index first so earlier removals do not
# shift the remaining ones. Logs and returns when no such container exists.
remove_longhorn_manager_prepull_sidecar() {
  local positions position
  positions="$(kubectl -n longhorn-system get daemonset longhorn-manager \
    -o jsonpath='{range .spec.template.spec.containers[*]}{.name}{"\n"}{end}' 2>/dev/null \
    | nl -v 0 -w 1 -s ' ' \
    | awk '$2=="pre-pull-share-manager-image" {print $1}' \
    | sort -rn || true)"
  if [[ -n "${positions}" ]]; then
    while IFS= read -r position; do
      [[ -n "${position}" ]] || continue
      run kubectl -n longhorn-system patch daemonset longhorn-manager --type=json \
        -p "[{\"op\":\"remove\",\"path\":\"/spec/template/spec/containers/${position}\"}]"
    done <<< "${positions}"
    return
  fi
  log "longhorn-manager-prepull-sidecar=absent"
  return 0
}

# Succeed when any longhorn-manager pod has a pre-pull-share-manager-image
# container whose waiting reason matches one of the image/runtime failure
# reasons below.
longhorn_manager_prepull_sidecar_has_pull_failures() {
  local sidecar="pre-pull-share-manager-image"
  local failure_reasons="ImagePullBackOff|ErrImagePull|CreateContainerError|RunContainerError|InvalidImageName"
  kubectl -n longhorn-system get pods -l app=longhorn-manager -o json \
    | jq -e --arg sidecar "${sidecar}" --arg reasons "${failure_reasons}" '
      [
        .items[].status.containerStatuses[]?
        | select(.name == $sidecar)
        | select((.state.waiting.reason // "") | test($reasons))
      ]
      | length > 0' >/dev/null 2>&1
}

# Remove the Longhorn manager pre-pull sidecar when either Harbor is not
# ready or the sidecar itself is stuck in an image/runtime failure;
# otherwise log that it was retained. The sidecar check only runs when
# Harbor is ready.
remove_longhorn_manager_prepull_sidecar_if_needed() {
  local reason=""
  if ! harbor_endpoint_is_ready 1; then
    reason="Removing Longhorn manager pre-pull sidecar because Harbor registry API is unhealthy."
  elif longhorn_manager_prepull_sidecar_has_pull_failures; then
    reason="Removing Longhorn manager pre-pull sidecar because it is in image/runtime failure."
  fi
  if [[ -n "${reason}" ]]; then
    warn "${reason}"
    remove_longhorn_manager_prepull_sidecar
    return 0
  fi
  log "longhorn-manager-prepull-sidecar=retained harbor=healthy pull_failures=false"
}

# Prepare LONGHORN_UNLOCK_REPLICA_SNAPSHOT_FILE for recording optional
# workload replica counts. An existing non-empty snapshot is preserved;
# otherwise the parent directory is created and the file is created empty.
# Dry-run mode only logs.
save_longhorn_unlock_optional_replica_snapshot() {
  local snapshot="${LONGHORN_UNLOCK_REPLICA_SNAPSHOT_FILE}"
  if [[ "${EXECUTE}" -eq 0 ]]; then
    log "DRY-RUN: save optional workload snapshot to ${snapshot}"
    return 0
  fi
  if [[ -s "${snapshot}" ]]; then
    log "optional-workload-snapshot=preserved path=${snapshot}"
    return 0
  fi
  mkdir -p "$(dirname "${snapshot}")"
  : > "${snapshot}"
}

# Scale workload $2/$3 in namespace $1 to zero replicas.
# In execute mode the current replica count (1 when it cannot be read) is
# appended to LONGHORN_UNLOCK_REPLICA_SNAPSHOT_FILE unless a row for the same
# namespace/kind/name already exists. Missing workloads are ignored, and a
# workload already at 0 replicas is only logged.
scale_optional_workload_for_longhorn_unlock() {
  local namespace="$1"
  local kind="$2"
  local name="$3"
  local current

  kubectl -n "${namespace}" get "${kind}" "${name}" >/dev/null 2>&1 || return 0

  current="$(kubectl -n "${namespace}" get "${kind}" "${name}" -o jsonpath='{.spec.replicas}' 2>/dev/null || true)"
  current="${current:-1}"

  # Record the original count only once so a re-run cannot overwrite it.
  if [[ "${EXECUTE}" -eq 1 ]] && ! awk -F '\t' -v ns="${namespace}" -v kind="${kind}" -v name="${name}" '$1==ns && $2==kind && $3==name {found=1} END {exit found ? 0 : 1}' "${LONGHORN_UNLOCK_REPLICA_SNAPSHOT_FILE}" 2>/dev/null; then
    printf '%s\t%s\t%s\t%s\n' "${namespace}" "${kind}" "${name}" "${current}" >> "${LONGHORN_UNLOCK_REPLICA_SNAPSHOT_FILE}"
  fi

  if [[ "${current}" == "0" ]]; then
    log "optional-workload-already-scaled-down=${namespace}/${kind}/${name}"
    return 0
  fi
  warn "Temporarily scaling optional workload ${namespace}/${kind}/${name} from ${current} to 0 for Longhorn recovery headroom."
  run kubectl -n "${namespace}" scale "${kind}" "${name}" --replicas=0
}

# Scale the fixed list of optional workloads below to zero, after preparing
# the replica snapshot file, and record the checkpoint when done.
# Each list line is "namespace kind name"; blank lines and lines whose first
# field starts with '#' are ignored.
free_longhorn_instance_manager_headroom() {
  # Loop variables are local so they do not leak into the caller's scope.
  local namespace kind name
  save_longhorn_unlock_optional_replica_snapshot
  while read -r namespace kind name; do
    [[ -z "${namespace}" || "${namespace}" == \#* ]] && continue
    scale_optional_workload_for_longhorn_unlock "${namespace}" "${kind}" "${name}"
  done <<'WORKLOADS'
game-stream deployment oauth2-proxy-wolf
logging deployment oauth2-proxy-logs
longhorn-system deployment oauth2-proxy-longhorn
maintenance deployment oauth2-proxy-metis
maintenance deployment oauth2-proxy-soteria
openclaw deployment oauth2-proxy-agent
quality deployment oauth2-proxy-sonarqube
quality deployment sonarqube-exporter
sso deployment oauth2-proxy
bstein-dev-home deployment bstein-dev-home-frontend
WORKLOADS
  mark_checkpoint longhorn_unlock_optional_workloads_scaled
}

# Scale optional workloads back to the replica counts recorded in
# LONGHORN_UNLOCK_REPLICA_SNAPSHOT_FILE (namespace, kind, name, replicas;
# tab-separated). Rows with a non-numeric or zero count, workloads whose
# current count cannot be read, and workloads already at the recorded count
# are skipped. A missing snapshot file is logged and treated as success.
restore_longhorn_unlock_optional_workloads() {
  local namespace kind name wanted actual
  if [[ ! -f "${LONGHORN_UNLOCK_REPLICA_SNAPSHOT_FILE}" ]]; then
    log "optional-workload-restore=not-needed snapshot=absent"
    return 0
  fi

  while IFS=$'\t' read -r namespace kind name wanted; do
    [[ -n "${namespace}" && -n "${kind}" && -n "${name}" && -n "${wanted}" ]] || continue
    [[ "${wanted}" =~ ^[0-9]+$ ]] || continue
    (( wanted > 0 )) || continue
    actual="$(kubectl -n "${namespace}" get "${kind}" "${name}" -o jsonpath='{.spec.replicas}' 2>/dev/null || true)"
    [[ "${actual}" =~ ^[0-9]+$ ]] || continue
    (( actual != wanted )) || continue
    warn "Restoring optional workload ${namespace}/${kind}/${name} to replicas=${wanted} after Longhorn unlock."
    run kubectl -n "${namespace}" scale "${kind}" "${name}" --replicas="${wanted}"
  done < "${LONGHORN_UNLOCK_REPLICA_SNAPSHOT_FILE}"
  mark_checkpoint longhorn_unlock_optional_workloads_restored
}

# Uncordon worker nodes that are Ready again after the Longhorn deadlock.
#
# A node is uncordoned only when it is currently unschedulable, reports
# Ready=True, carries node-role.kubernetes.io/worker=true, is not listed in
# RECOVERY_UNCORDON_DENYLIST and has no node.kubernetes.io/unreachable taint.
# Failure to list nodes is tolerated and results in nothing being uncordoned.
# The checkpoint is recorded afterwards.
restore_recovered_worker_scheduling_after_deadlock() {
  # 'unschedulable' is part of the local list so the row parser below does
  # not leak it into the caller's scope.
  local rows node unschedulable ready worker taints
  rows="$(kubectl get nodes -o json \
    | jq -r '.items[]
      | [.metadata.name,
         (.spec.unschedulable // false),
         ([.status.conditions[]? | select(.type=="Ready") | .status][0] // "Unknown"),
         (.metadata.labels["node-role.kubernetes.io/worker"] // ""),
         ((.spec.taints // []) | map(.key + ":" + .effect) | join(","))]
      | @tsv' || true)"

  while IFS=$'\t' read -r node unschedulable ready worker taints; do
    [[ -n "${node}" ]] || continue
    [[ "${unschedulable}" == "true" ]] || continue
    [[ "${ready}" == "True" ]] || continue
    [[ "${worker}" == "true" ]] || continue
    if csv_has_value "${RECOVERY_UNCORDON_DENYLIST}" "${node}"; then
      warn "Leaving recovered worker ${node} cordoned because it is in RECOVERY_UNCORDON_DENYLIST."
      continue
    fi
    if [[ "${taints}" == *"node.kubernetes.io/unreachable:"* ]]; then
      warn "Leaving worker ${node} cordoned because it still has an unreachable taint."
      continue
    fi
    warn "Restoring scheduling on recovered Ready worker ${node}."
    run kubectl uncordon "${node}"
  done <<< "${rows}"
  mark_checkpoint longhorn_unlock_worker_scheduling_restored
}

# Delete pods in phase Failed across all namespaces, except those in the
# longhorn-system, postgres, vault, gitea and harbor namespaces. Deletions
# are issued without waiting.
delete_failed_nonstorage_pods_for_headroom() {
  local failed namespace name
  failed="$(kubectl get pods -A --field-selector=status.phase=Failed \
    -o jsonpath='{range .items[*]}{.metadata.namespace}{" "}{.metadata.name}{"\n"}{end}' 2>/dev/null || true)"
  while read -r namespace name; do
    [[ -n "${namespace}" && -n "${name}" ]] || continue
    # Pods in these namespaces are never touched by this sweep.
    if [[ "${namespace}" =~ ^(longhorn-system|postgres|vault|gitea|harbor)$ ]]; then
      continue
    fi
    run kubectl -n "${namespace}" delete pod "${name}" --ignore-not-found --wait=false
  done <<< "${failed}"
}

# Clean up stale pods in the postgres, vault, gitea and harbor namespaces.
#
# Pass 1 deletes pods in phase Failed or Unknown that have an owner
# reference; pods without one are only reported.
# Pass 2 force-deletes pods that are already terminating, are Failed or
# Unknown, have no finalizers, have an owner reference, and whose containers
# are all terminated.
restart_stale_critical_pods_after_longhorn_unlock() {
  require_cmd jq
  local rows ns pod phase owner_count

  rows="$(kubectl get pods -A -o json \
    | jq -r '.items[]
      | select(.metadata.namespace | test("^(postgres|vault|gitea|harbor)$"))
      | select(.status.phase == "Failed" or .status.phase == "Unknown")
      | [.metadata.namespace, .metadata.name, .status.phase, ((.metadata.ownerReferences // []) | length)] | @tsv' || true)"
  while IFS=$'\t' read -r ns pod phase owner_count; do
    [[ -n "${ns}" && -n "${pod}" ]] || continue
    if [[ "${owner_count}" == "0" ]]; then
      warn "Skipping stale critical pod without controller owner: ${ns}/${pod} phase=${phase}"
      continue
    fi
    warn "Deleting stale controller-owned critical pod ${ns}/${pod} phase=${phase} so its controller can recreate it."
    run kubectl -n "${ns}" delete pod "${pod}" --ignore-not-found --wait=false
  done <<< "${rows}"

  rows="$(kubectl get pods -A -o json \
    | jq -r '.items[]
      | select(.metadata.namespace | test("^(postgres|vault|gitea|harbor)$"))
      | select(.metadata.deletionTimestamp != null)
      | select(.status.phase == "Failed" or .status.phase == "Unknown")
      | select(((.metadata.finalizers // []) | length) == 0)
      | select(((.metadata.ownerReferences // []) | length) > 0)
      | select(([(.status.containerStatuses[]? | select(.state.terminated != null))] | length) == ((.status.containerStatuses // []) | length))
      | [.metadata.namespace, .metadata.name, .status.phase] | @tsv' || true)"
  while IFS=$'\t' read -r ns pod phase; do
    [[ -n "${ns}" && -n "${pod}" ]] || continue
    warn "Force-deleting stale terminating critical pod object ${ns}/${pod} phase=${phase}; containers are already terminated and no finalizers are set."
    run kubectl -n "${ns}" delete pod "${pod}" --ignore-not-found --wait=false --force --grace-period=0
  done <<< "${rows}"
}

# Wait until postgres-service has at least one endpoint address and
# pg_isready succeeds inside postgres-0, polling every 5 seconds.
# Arguments: $1 - timeout in seconds (default 240).
# Returns: 0 when ready (or in dry-run), 1 on timeout.
wait_for_postgres_dependency_ready() {
  local timeout_seconds="${1:-240}"
  local began addresses
  if [[ "${EXECUTE}" -eq 0 ]]; then
    log "DRY-RUN: wait for postgres/postgres-service endpoints and pg_isready"
    return 0
  fi

  began="$(date +%s)"
  while true; do
    addresses="$(kubectl -n postgres get endpoints postgres-service -o jsonpath='{.subsets[*].addresses[*].ip}' 2>/dev/null || true)"
    if [[ -n "${addresses//[[:space:]]/}" ]]; then
      if kubectl -n postgres exec postgres-0 -c postgres -- sh -ceu 'pg_isready -h 127.0.0.1 -p 5432 >/dev/null' >/dev/null 2>&1; then
        log "postgres-dependency=ready endpoints=${addresses}"
        return 0
      fi
    fi

    if (( $(date +%s) - began >= timeout_seconds )); then
      warn "Timed out waiting for Postgres to become ready for Harbor."
      return 1
    fi
    sleep 5
  done
}

# Restart unhealthy Harbor core/jobservice pods once Postgres is back.
#
# Nothing happens when Harbor already reports ready. Otherwise the function
# waits for Postgres (up to 240 seconds), deletes controller-owned
# harbor-core-*/harbor-jobservice-* pods whose core/jobservice container is
# not ready, is waiting in a pull/crash backoff, or last terminated with
# "Error", and in execute mode waits for both rollouts and re-checks Harbor.
# Returns: 1 when Postgres never becomes ready, when no pod qualifies for a
# restart, or when Harbor is still not ready afterwards.
restart_harbor_after_postgres_recovery() {
  require_cmd jq
  local targets pod

  if harbor_endpoint_is_ready 1; then
    log "harbor-postgres-recovery=not-needed"
    return 0
  fi

  wait_for_postgres_dependency_ready 240 || return 1

  targets="$(kubectl -n harbor get pods -o json \
    | jq -r '.items[]
      | select(.metadata.name | test("^harbor-(core|jobservice)-"))
      | select(((.metadata.ownerReferences // []) | length) > 0)
      | select(([
          .status.containerStatuses[]?
          | select(.name == "core" or .name == "jobservice")
          | select((.ready != true)
            or (((.state.waiting.reason // "") | test("CrashLoopBackOff|ImagePullBackOff|ErrImagePull")))
            or ((.lastState.terminated.reason // "") == "Error"))
        ] | length) > 0)
      | .metadata.name' \
    | sort -u || true)"

  if [[ -z "${targets}" ]]; then
    warn "Harbor registry API is unhealthy, but no controller-owned core/jobservice pod needs restart."
    return 1
  fi

  while IFS= read -r pod; do
    [[ -n "${pod}" ]] || continue
    warn "Restarting controller-owned Harbor pod ${pod} after Postgres recovery."
    run kubectl -n harbor delete pod "${pod}" --ignore-not-found --wait=false
  done <<< "${targets}"

  if [[ "${EXECUTE}" -eq 1 ]]; then
    kubectl -n harbor rollout status deployment/harbor-core --timeout=6m || warn "harbor-core did not become Ready after Postgres recovery restart."
    kubectl -n harbor rollout status deployment/harbor-jobservice --timeout=6m || warn "harbor-jobservice did not become Ready after Postgres recovery restart."
    harbor_endpoint_is_ready 0 || return 1
  fi
  mark_checkpoint longhorn_unlock_harbor_postgres_recovered
}

# Force-delete ReplicaSet-owned pods outside longhorn-system that are stuck
# terminating: no finalizers, no running or ready (init) containers, and a
# deletionTimestamp at least STALE_TERMINATING_POD_SECONDS old. The
# checkpoint is recorded afterwards.
delete_safe_stale_terminating_replicaset_pods_after_deadlock() {
  require_cmd jq
  local candidates namespace name deleted_at deleted_epoch reference_time
  reference_time="$(date +%s)"
  candidates="$(kubectl get pods -A -o json \
    | jq -r '.items[]
      | select(.metadata.namespace != "longhorn-system")
      | select(.metadata.deletionTimestamp != null)
      | select(((.metadata.finalizers // []) | length) == 0)
      | select(((.metadata.ownerReferences // []) | map(select(.kind=="ReplicaSet")) | length) > 0)
      | ([(.status.initContainerStatuses[]?, .status.containerStatuses[]?) | select(.state.running != null)] | length) as $running
      | ([(.status.initContainerStatuses[]?, .status.containerStatuses[]?) | select(.ready == true)] | length) as $ready
      | select($running == 0 and $ready == 0)
      | [.metadata.namespace, .metadata.name, .metadata.deletionTimestamp] | @tsv' || true)"

  while IFS=$'\t' read -r namespace name deleted_at; do
    [[ -n "${namespace}" && -n "${name}" && -n "${deleted_at}" ]] || continue
    # Rows whose timestamp cannot be parsed are skipped.
    deleted_epoch="$(date -d "${deleted_at}" +%s 2>/dev/null || true)"
    [[ "${deleted_epoch}" =~ ^[0-9]+$ ]] || continue
    (( reference_time - deleted_epoch >= STALE_TERMINATING_POD_SECONDS )) || continue
    warn "Force-deleting stale terminating ReplicaSet pod ${namespace}/${name}; no containers are running and no finalizers are set."
    run kubectl -n "${namespace}" delete pod "${name}" --ignore-not-found --wait=false --force --grace-period=0
  done <<< "${candidates}"
  mark_checkpoint longhorn_unlock_stale_replicaset_pods_cleared
}

# After Harbor is healthy again, delete ReplicaSet-owned pods outside
# longhorn-system that have a container or init container waiting in an
# image-pull or container-start failure, so they are recreated.
# Returns: 1 when Harbor is still not ready, 0 otherwise. The checkpoint is
# recorded only when at least one pod was selected.
restart_image_pull_backoff_pods_after_harbor_recovery() {
  require_cmd jq
  local stuck namespace name
  if ! harbor_endpoint_is_ready 1; then
    warn "Skipping image-pull recovery sweep because Harbor registry API is still unhealthy."
    return 1
  fi

  stuck="$(kubectl get pods -A -o json \
    | jq -r '.items[]
      | select(.metadata.namespace != "longhorn-system")
      | select(((.metadata.ownerReferences // []) | map(select(.kind=="ReplicaSet")) | length) > 0)
      | select(([
          (.status.containerStatuses[]?, .status.initContainerStatuses[]?)
          | select(((.state.waiting.reason // "") | test("ImagePullBackOff|ErrImagePull|CreateContainerError|RunContainerError|InvalidImageName")))
        ] | length) > 0)
      | [.metadata.namespace, .metadata.name] | @tsv' \
    | sort -u || true)"

  if [[ -n "${stuck}" ]]; then
    while IFS=$'\t' read -r namespace name; do
      [[ -n "${namespace}" && -n "${name}" ]] || continue
      warn "Restarting controller-owned pod ${namespace}/${name} after Harbor recovery to clear image-pull backoff."
      run kubectl -n "${namespace}" delete pod "${name}" --ignore-not-found --wait=false
    done <<< "${stuck}"
    mark_checkpoint longhorn_unlock_image_pull_backoff_restarted
    return
  fi

  log "image-pull-recovery=not-needed"
  return 0
}

# Bring Flux automation back after the core services have recovered.
#
# Preconditions: Harbor must report ready and the gitea service must have at
# least one endpoint address; otherwise the reconcilers stay stopped and the
# function returns 1.
#
# Steps: run the critical-thaw preparation, restart helm-controller, apply
# the flux-system Kustomization when the flux CLI is available and
# RECOVERY_FLUX_APPLY_BOOTSTRAP_KUSTOMIZATION is enabled, unsuspend the
# critical Kustomizations, reassert the bootstrap/optional suspends, request
# reconciliation of the critical Kustomizations, verify the suspend state and
# record the checkpoint.
resume_deadlock_automation_after_core_recovery() {
  local gitea_endpoints
  if ! harbor_endpoint_is_ready 1; then
    warn "Keeping Flux reconcilers stopped because Harbor registry API is not healthy."
    return 1
  fi
  gitea_endpoints="$(kubectl -n gitea get endpoints gitea -o jsonpath='{.subsets[*].addresses[*].ip}' 2>/dev/null || true)"
  if [[ -z "${gitea_endpoints//[[:space:]]/}" ]]; then
    warn "Keeping Flux reconcilers stopped because Gitea has no ready endpoints."
    return 1
  fi

  prepare_recovery_flux_critical_thaw
  if kubectl -n flux-system get deployment helm-controller >/dev/null 2>&1; then
    run kubectl -n flux-system scale deployment helm-controller --replicas=1
  fi
  if command -v flux >/dev/null 2>&1; then
    if [[ "${RECOVERY_FLUX_APPLY_BOOTSTRAP_KUSTOMIZATION}" == "1" || "${RECOVERY_FLUX_APPLY_BOOTSTRAP_KUSTOMIZATION}" == "true" ]]; then
      patch_kustomization_suspend flux-system false
      run flux reconcile kustomization flux-system -n flux-system --timeout="${RECOVERY_FLUX_ROOT_APPLY_TIMEOUT}" || warn "flux-system Kustomization did not apply the recovery source revision before final suspension."
    fi
  fi
  patch_recovery_critical_flux_suspend false
  if [[ "${RECOVERY_FLUX_SUSPEND_BOOTSTRAP_KUSTOMIZATION}" == "1" || "${RECOVERY_FLUX_SUSPEND_BOOTSTRAP_KUSTOMIZATION}" == "true" ]]; then
    patch_kustomization_suspend flux-system true
  fi
  # Reassert the optional suspends WITHOUT taking a new snapshot: the thaw
  # preparation above already snapshotted the original state and suspended
  # these Kustomizations, so a second snapshot here would overwrite the
  # original values with the recovery-time state.
  patch_recovery_optional_flux_suspend_without_snapshot true
  annotate_flux_kustomizations "${RECOVERY_FLUX_CRITICAL_KUSTOMIZATIONS}" || true
  stabilize_recovery_flux_suspend || true
  mark_checkpoint longhorn_unlock_automation_resumed
}

# Delete Longhorn manager, driver-deployer and UI pods whose containers are
# waiting in ImagePullBackOff/ErrImagePull so that they get recreated.
# Discovery is best-effort: a failing kubectl/jq pipeline simply yields no work.
restart_longhorn_image_pull_backoff_pods() {
  require_cmd jq
  local stuck_pods pod_ns pod_name pod_filter
  pod_filter='.items[]
      | select(([.status.containerStatuses[]?.state.waiting.reason] | map(select(. == "ImagePullBackOff" or . == "ErrImagePull")) | length) > 0)
      | select(.metadata.name | test("^(longhorn-manager-|longhorn-driver-deployer-|longhorn-ui-)"))
      | [.metadata.namespace, .metadata.name] | @tsv'
  stuck_pods="$(kubectl -n longhorn-system get pods -o json | jq -r "${pod_filter}" || true)"
  while IFS=$'\t' read -r pod_ns pod_name; do
    # Blank or partial rows carry nothing to delete.
    if [[ -z "${pod_ns}" || -z "${pod_name}" ]]; then
      continue
    fi
    run kubectl -n "${pod_ns}" delete pod "${pod_name}" --ignore-not-found --wait=false
  done <<< "${stuck_pods}"
}

# Print "namespace<TAB>name<TAB>running-count" for pods on the given node that
# carry a deletionTimestamp at least STALE_TERMINATING_POD_SECONDS old, have no
# finalizers, and still report one or more running (init) containers.
# Arguments: $1 - node name
# Always returns 0; pipeline errors are silenced and yield no output.
terminating_running_pods_for_node() {
  local node="$1"
  local epoch_now stale_filter
  epoch_now="$(date +%s)"
  stale_filter='
      .items[]
      | select(.spec.nodeName == $node)
      | select(.metadata.deletionTimestamp != null)
      | select(((.metadata.finalizers // []) | length) == 0)
      | (.metadata.deletionTimestamp | fromdateiso8601) as $deleted
      | select(($now - $deleted) >= $min_age)
      | ([(.status.initContainerStatuses[]?, .status.containerStatuses[]?) | select(.state.running != null)] | length) as $running
      | select($running > 0)
      | [.metadata.namespace, .metadata.name, ($running|tostring)] | @tsv'
  kubectl get pods -A -o json \
    | jq -r \
      --arg node "${node}" \
      --argjson now "${epoch_now}" \
      --argjson min_age "${STALE_TERMINATING_POD_SECONDS}" \
      "${stale_filter}" 2>/dev/null \
    || true
}

# Print the sorted, de-duplicated names of nodes hosting pods that carry a
# deletionTimestamp at least STALE_TERMINATING_POD_SECONDS old, have no
# finalizers, and still report running (init) containers.
# The pipeline status is returned as-is; jq diagnostics are discarded.
stuck_terminating_runtime_cleanup_nodes() {
  local epoch_now node_filter
  epoch_now="$(date +%s)"
  node_filter='
      .items[]
      | select(.spec.nodeName != null)
      | select(.metadata.deletionTimestamp != null)
      | select(((.metadata.finalizers // []) | length) == 0)
      | (.metadata.deletionTimestamp | fromdateiso8601) as $deleted
      | select(($now - $deleted) >= $min_age)
      | select(([(.status.initContainerStatuses[]?, .status.containerStatuses[]?) | select(.state.running != null)] | length) > 0)
      | .spec.nodeName'
  kubectl get pods -A -o json \
    | jq -r \
      --argjson now "${epoch_now}" \
      --argjson min_age "${STALE_TERMINATING_POD_SECONDS}" \
      "${node_filter}" 2>/dev/null \
    | sort -u
}

# Poll a node's Ready condition every 5 seconds until it reports "True".
# Arguments: $1 - node name, $2 - timeout in seconds
# Returns:   0 when Ready (or in dry-run mode), 1 after the timeout elapses.
wait_for_node_ready() {
  local node="$1"
  local timeout_seconds="$2"
  local began current condition
  if [[ "${EXECUTE}" -eq 0 ]]; then
    log "DRY-RUN: wait for node ${node} Ready"
    return 0
  fi
  began="$(date +%s)"
  until
    # A failed lookup leaves the condition empty and counts as "not ready yet".
    condition="$(kubectl get node "${node}" -o jsonpath='{range .status.conditions[?(@.type=="Ready")]}{.status}{end}' 2>/dev/null || true)"
    [[ "${condition}" == "True" ]]
  do
    current="$(date +%s)"
    if (( current - began >= timeout_seconds )); then
      warn "Timed out waiting for node ${node} to return Ready after runtime restart."
      return 1
    fi
    sleep 5
  done
  log "node-ready=${node}"
  return 0
}

# Poll every 5 seconds until terminating_running_pods_for_node reports nothing
# for the given node.
# Arguments: $1 - node name, $2 - timeout in seconds
# Returns:   0 when the list is empty (or in dry-run mode); 1 after the timeout,
#            having warned once per remaining pod row.
wait_for_terminating_running_pods_to_clear() {
  local node="$1"
  local timeout_seconds="$2"
  # `line` is declared local so the reporting loop cannot clobber a variable of
  # the same name in a caller or in the global scope.
  local start now pods line
  if [[ "${EXECUTE}" -eq 0 ]]; then
    log "DRY-RUN: wait for stuck terminating running pods to clear on ${node}"
    return 0
  fi
  start="$(date +%s)"
  while true; do
    pods="$(terminating_running_pods_for_node "${node}")"
    if [[ -z "${pods}" ]]; then
      log "stuck-terminating-runtime-pods-cleared=${node}"
      return 0
    fi
    now="$(date +%s)"
    if (( now - start >= timeout_seconds )); then
      warn "Stuck terminating pods with running containers remain on ${node}:"
      while IFS= read -r line; do
        [[ -n "${line}" ]] || continue
        warn "  ${line}"
      done <<< "${pods}"
      return 1
    fi
    sleep 5
  done
}

# Run a command in the host namespaces of a node through a Running
# app=k3s-agent-restart pod on that node.
# Arguments: $1 - node name, $2 - host command line
# Returns:   1 when no such pod is found; 0 in dry-run mode; otherwise the
#            status of the wrapped kubectl exec.
run_host_command_via_agent_restart_pod() {
  local node="$1"
  local host_command="$2"
  local helper_pod payload remote_script
  helper_pod="$(kubectl -n "${NODE_HELPER_NAMESPACE}" get pods -l app=k3s-agent-restart --field-selector "spec.nodeName=${node},status.phase=Running" -o jsonpath='{.items[0].metadata.name}' 2>/dev/null || true)"
  [[ -n "${helper_pod}" ]] || return 1
  # The command travels base64-encoded so its quoting survives the remote shell.
  payload="$(printf '%s' "${host_command}" | base64 -w0)"
  if [[ "${EXECUTE}" -eq 0 ]]; then
    log "DRY-RUN: k3s-agent-restart exec via ${helper_pod} on ${node}"
    return 0
  fi
  remote_script="HOST_COMMAND=\$(printf '%s' '${payload}' | base64 -d); nsenter --target 1 --mount --uts --ipc --net --pid /bin/sh -ceu \"\${HOST_COMMAND}\""
  run kubectl -n "${NODE_HELPER_NAMESPACE}" exec "${helper_pod}" -- /bin/sh -ceu "${remote_script}"
}

# Schedule a delayed `systemctl restart <service>` on a node's host through a
# transient systemd-run unit. Transports are tried in order and the first one
# that succeeds wins: k3s-agent-restart pod, prewarm pod, one-shot helper pod.
# Arguments: $1 - node name, $2 - systemd service name, $3 - delay in seconds
# Returns:   0 as soon as one transport succeeds, else the helper pod's status.
schedule_host_service_restart_via_helper() {
  local node="$1"
  local service_name="$2"
  local delay_seconds="$3"
  local transient_unit restart_cmd
  # The epoch suffix gives each scheduled restart its own unit name.
  transient_unit="ananke-restart-${service_name}-$(date +%s)"
  restart_cmd="/usr/bin/systemd-run --unit ${transient_unit} --on-active=${delay_seconds}s /bin/sh -lc '/usr/bin/systemctl restart ${service_name} || /bin/systemctl restart ${service_name}'"
  run_host_command_via_agent_restart_pod "${node}" "${restart_cmd}" \
    || run_host_command_via_prewarm_pod "${node}" "${restart_cmd}" \
    || run_host_command_via_helper "${node}" "restart-${node}-${service_name}" 120 "${restart_cmd}"
}

# Clear stale terminating pods by cordoning affected worker nodes and scheduling
# a k3s-agent restart on each, then waiting for the nodes to recover.
#
# Globals read:
#   RECOVERY_NODE_RUNTIME_RESTART_ENABLED     must be "1" or "true" to proceed
#   RECOVERY_NODE_RUNTIME_RESTART_MAX_NODES   cap on nodes handled per call
#   RECOVERY_NODE_RUNTIME_RESTART_DENYLIST    checked through csv_has_value
#   RECOVERY_NODE_RUNTIME_RESTART_WAIT_SECONDS timeout for each post-restart wait
# Returns:
#   0 when disabled, when no node is reported, or when no node is eligible;
#   otherwise the status of the final mark_checkpoint call.
recover_stuck_terminating_node_runtime_pods_after_deadlock() {
  require_cmd jq
  if [[ "${RECOVERY_NODE_RUNTIME_RESTART_ENABLED}" != "1" && "${RECOVERY_NODE_RUNTIME_RESTART_ENABLED}" != "true" ]]; then
    warn "Skipping node runtime cleanup because RECOVERY_NODE_RUNTIME_RESTART_ENABLED=${RECOVERY_NODE_RUNTIME_RESTART_ENABLED}."
    return 0
  fi

  local nodes node ready worker control_plane restarted max_nodes restarted_nodes
  # Discovery failures are tolerated and treated like an empty node list.
  nodes="$(stuck_terminating_runtime_cleanup_nodes || true)"
  if [[ -z "${nodes}" ]]; then
    log "node-runtime-cleanup=not-needed"
    return 0
  fi

  max_nodes="${RECOVERY_NODE_RUNTIME_RESTART_MAX_NODES}"
  # A non-numeric limit falls back to a single node per pass.
  [[ "${max_nodes}" =~ ^[0-9]+$ ]] || max_nodes=1
  restarted=0
  restarted_nodes=""
  while IFS= read -r node; do
    [[ -n "${node}" ]] || continue
    if (( restarted >= max_nodes )); then
      warn "Node runtime cleanup limit reached (${max_nodes}); leaving remaining stuck nodes for a later Ananke pass."
      break
    fi
    if csv_has_value "${RECOVERY_NODE_RUNTIME_RESTART_DENYLIST}" "${node}"; then
      warn "Skipping node runtime cleanup on denylisted node ${node}."
      continue
    fi
    # Eligibility: Ready must be "True", the worker role label must be "true",
    # and the control-plane role label must be absent. Failed lookups yield
    # empty strings, which makes the node ineligible.
    ready="$(kubectl get node "${node}" -o jsonpath='{range .status.conditions[?(@.type=="Ready")]}{.status}{end}' 2>/dev/null || true)"
    worker="$(kubectl get node "${node}" -o jsonpath='{.metadata.labels.node-role\.kubernetes\.io/worker}' 2>/dev/null || true)"
    control_plane="$(kubectl get node "${node}" -o jsonpath='{.metadata.labels.node-role\.kubernetes\.io/control-plane}' 2>/dev/null || true)"
    if [[ "${ready}" != "True" || "${worker}" != "true" || -n "${control_plane}" ]]; then
      warn "Skipping node runtime cleanup on ${node}; ready=${ready:-unknown} worker=${worker:-false} control_plane=${control_plane:-false}."
      continue
    fi

    warn "Cordoning ${node} and restarting only k3s-agent to clear stale terminating pods. Longhorn data-plane objects are not modified."
    run kubectl cordon "${node}"
    # A scheduling failure only warns; the node is still counted against the
    # limit and still waited on below.
    schedule_host_service_restart_via_helper "${node}" k3s-agent 5 || warn "Failed to schedule k3s-agent restart on ${node}."
    restarted=$((restarted + 1))
    restarted_nodes="${restarted_nodes}${node}"$'\n'
  done <<< "${nodes}"

  if (( restarted == 0 )); then
    log "node-runtime-cleanup=no-eligible-nodes"
    return 0
  fi

  # NOTE(review): the fixed 15s pause presumably lets the restart scheduled with
  # a 5s delay fire before polling starts; confirm.
  sleep 15
  while IFS= read -r node; do
    [[ -n "${node}" ]] || continue
    # Both waits are best-effort; a timeout does not abort the pass.
    wait_for_node_ready "${node}" "${RECOVERY_NODE_RUNTIME_RESTART_WAIT_SECONDS}" || true
    wait_for_terminating_running_pods_to_clear "${node}" "${RECOVERY_NODE_RUNTIME_RESTART_WAIT_SECONDS}" || true
  done <<< "${restarted_nodes}"
  mark_checkpoint longhorn_unlock_node_runtime_cleanup
}

# Poll every 5 seconds until the named Endpoints object in longhorn-system
# lists at least one address.
# Arguments: $1 - endpoints name, $2 - timeout in seconds
# Returns:   0 once an address is listed (or in dry-run mode), 1 on timeout.
wait_for_longhorn_endpoint() {
  local endpoint="$1"
  local timeout_seconds="$2"
  local began current backend_ips
  if [[ "${EXECUTE}" -eq 0 ]]; then
    log "DRY-RUN: wait for Longhorn endpoint ${endpoint}"
    return 0
  fi
  began="$(date +%s)"
  until
    # A failed lookup leaves the address list empty and counts as "not ready".
    backend_ips="$(kubectl -n longhorn-system get endpoints "${endpoint}" -o jsonpath='{.subsets[*].addresses[*].ip}' 2>/dev/null || true)"
    [[ -n "${backend_ips}" ]]
  do
    current="$(date +%s)"
    if (( current - began >= timeout_seconds )); then
      warn "Timed out waiting for Longhorn endpoint ${endpoint}."
      return 1
    fi
    sleep 5
  done
  log "longhorn-endpoint-${endpoint}=ready"
  return 0
}

# Wait (up to 180s each) for every Longhorn control endpoint. All endpoints are
# always checked; the result is 1 if any of them timed out, else 0.
wait_for_longhorn_control_endpoints() {
  local endpoint
  local status=0
  for endpoint in \
    longhorn-admission-webhook \
    longhorn-conversion-webhook \
    longhorn-backend \
    longhorn-recovery-backend; do
    if ! wait_for_longhorn_endpoint "${endpoint}" 180; then
      status=1
    fi
  done
  return "${status}"
}

# Print a best-effort Longhorn status report: manager DaemonSet, manager pods,
# instance managers, and a volume count grouped by "state/robustness".
# Every query is tolerated on failure, so the function always returns 0.
report_longhorn_unlock_status() {
  # The column specs contain `[*]`, which the shell treats as a glob pattern when
  # left unquoted (the argument disappears under nullglob and aborts under
  # failglob). They are kept in quoted variables so kubectl always receives
  # them verbatim.
  local daemonset_columns manager_pod_columns instance_manager_columns
  daemonset_columns='custom-columns=NAME:.metadata.name,DESIRED:.status.desiredNumberScheduled,CURRENT:.status.currentNumberScheduled,READY:.status.numberReady,UPDATED:.status.updatedNumberScheduled,AVAILABLE:.status.numberAvailable'
  manager_pod_columns='custom-columns=NAME:.metadata.name,READY:.status.containerStatuses[*].ready,STATUS:.status.phase,WAIT:.status.containerStatuses[*].state.waiting.reason,NODE:.spec.nodeName'
  instance_manager_columns='custom-columns=NAME:.metadata.name,STATE:.status.currentState,NODE:.spec.nodeID,IMAGE:.spec.image,TYPE:.spec.type'

  log "Longhorn manager DaemonSet:"
  kubectl -n longhorn-system get daemonset longhorn-manager \
    -o "${daemonset_columns}" || true
  log "Longhorn manager pods:"
  kubectl -n longhorn-system get pods -l app=longhorn-manager \
    -o "${manager_pod_columns}" --sort-by=.spec.nodeName || true
  log "Longhorn instance managers:"
  kubectl -n longhorn-system get instancemanagers.longhorn.io \
    -o "${instance_manager_columns}" --sort-by=.spec.nodeID || true
  log "Longhorn volume summary:"
  # Group volumes by "state/robustness" (robustness defaults to "none") and
  # print one "<group><TAB><count>" row per group.
  kubectl -n longhorn-system get volumes.longhorn.io -o json \
    | jq -r '.items | group_by(.status.state + "/" + (.status.robustness // "none"))[] | [(.[0].status.state + "/" + (.[0].status.robustness // "none")), length] | @tsv' 2>/dev/null \
    | sort || true
}

# Succeed when the namespace matches SHUTDOWN_NAMESPACE_EXCLUDES_REGEX.
# Arguments: $1 - namespace name
shutdown_namespace_excluded() {
  local ns="$1"
  local exclude_pattern="${SHUTDOWN_NAMESPACE_EXCLUDES_REGEX}"
  # The pattern is left unquoted on purpose so it is applied as a regex.
  [[ "${ns}" =~ ${exclude_pattern} ]]
}

# Succeed when the namespace matches STARTUP_WORKLOAD_NAMESPACE_EXCLUDES_REGEX.
# Arguments: $1 - namespace name
startup_workload_namespace_excluded() {
  local ns="$1"
  local exclude_pattern="${STARTUP_WORKLOAD_NAMESPACE_EXCLUDES_REGEX}"
  # The pattern is left unquoted on purpose so it is applied as a regex.
  [[ "${ns}" =~ ${exclude_pattern} ]]
}

# Scale every Deployment and StatefulSet to zero in each namespace that is not
# excluded by shutdown_namespace_excluded. Each scale command tolerates its own
# failure (`|| true` inside the shell string).
best_effort_scale_down_apps() {
  local namespaces namespace workload_kind
  namespaces="$(kubectl get ns -o jsonpath='{range .items[*]}{.metadata.name}{"\n"}{end}')"
  while IFS= read -r namespace; do
    if [[ -z "${namespace}" ]]; then
      continue
    fi
    if shutdown_namespace_excluded "${namespace}"; then
      continue
    fi
    for workload_kind in deployment statefulset; do
      run_shell "kubectl -n ${namespace} scale ${workload_kind} --all --replicas=0 || true"
    done
  done <<< "${namespaces}"
}

# Record the replica count of every Deployment and StatefulSet that currently
# has replicas > 0 and lives outside the shutdown-excluded namespaces.
# Output file: REPLICA_SNAPSHOT_FILE, truncated first, one
#   "namespace<TAB>kind<TAB>name<TAB>replicas" row per workload.
# In dry-run mode only the intent is logged.
save_workload_replica_snapshot() {
  local rows ns kind name replicas workload_kind
  if [[ "${EXECUTE}" -eq 0 ]]; then
    log "DRY-RUN: save workload replica snapshot to ${REPLICA_SNAPSHOT_FILE}"
    return 0
  fi
  # Both listings are best-effort; a failing kubectl just contributes no rows.
  rows="$(
    for workload_kind in deployment statefulset; do
      kubectl get "${workload_kind}" -A \
        -o 'jsonpath={range .items[*]}{.metadata.namespace}{"\t'"${workload_kind}"'\t"}{.metadata.name}{"\t"}{.spec.replicas}{"\n"}{end}' 2>/dev/null || true
    done | sed '/^[[:space:]]*$/d'
  )"
  mkdir -p "$(dirname "${REPLICA_SNAPSHOT_FILE}")"
  : > "${REPLICA_SNAPSHOT_FILE}"
  while IFS=$'\t' read -r ns kind name replicas; do
    if [[ -z "${ns}" || -z "${kind}" || -z "${name}" || -z "${replicas}" ]]; then
      continue
    fi
    if shutdown_namespace_excluded "${ns}"; then
      continue
    fi
    if [[ ! "${replicas}" =~ ^[0-9]+$ ]]; then
      continue
    fi
    if ! (( replicas > 0 )); then
      continue
    fi
    printf '%s\t%s\t%s\t%s\n' "${ns}" "${kind}" "${name}" "${replicas}" >> "${REPLICA_SNAPSHOT_FILE}"
  done <<< "${rows}"
  log "replica-snapshot-file=${REPLICA_SNAPSHOT_FILE}"
  log "replica-snapshot-count=$(replica_snapshot_count)"
}

# Print the number of lines in REPLICA_SNAPSHOT_FILE, or 0 when the file does
# not exist.
replica_snapshot_count() {
  if [[ ! -f "${REPLICA_SNAPSHOT_FILE}" ]]; then
    printf '0'
    return
  fi
  # Strip the padding some wc implementations put before the count.
  wc -l < "${REPLICA_SNAPSHOT_FILE}" | tr -d ' '
}

# Scale workloads back to the replica counts recorded in REPLICA_SNAPSHOT_FILE.
# Skipped entirely unless RECOVERY_PENDING is 1 and the snapshot file exists.
# Rows are "namespace<TAB>kind<TAB>name<TAB>replicas"; a workload that can no
# longer be read, or that already has the desired count, is left alone.
restore_workload_replica_snapshot() {
  local snap_ns snap_kind snap_name wanted live
  if [[ "${RECOVERY_PENDING}" -ne 1 ]]; then
    log "Skipping replica restore because recovery_pending=0."
    return 0
  fi
  if [[ ! -f "${REPLICA_SNAPSHOT_FILE}" ]]; then
    warn "Replica snapshot file not found at ${REPLICA_SNAPSHOT_FILE}; skipping replica restore."
    return 0
  fi
  while IFS=$'\t' read -r snap_ns snap_kind snap_name wanted; do
    if [[ -z "${snap_ns}" || -z "${snap_kind}" || -z "${snap_name}" || -z "${wanted}" ]]; then
      continue
    fi
    if [[ ! "${wanted}" =~ ^[0-9]+$ ]]; then
      continue
    fi
    if ! (( wanted > 0 )); then
      continue
    fi
    live="$(kubectl -n "${snap_ns}" get "${snap_kind}" "${snap_name}" -o jsonpath='{.spec.replicas}' 2>/dev/null || true)"
    # No answer at all: the workload could not be read, so skip it.
    if [[ -z "${live}" ]]; then
      continue
    fi
    # A non-numeric answer is treated as zero replicas.
    if [[ ! "${live}" =~ ^[0-9]+$ ]]; then
      live=0
    fi
    if (( live == wanted )); then
      continue
    fi
    run kubectl -n "${snap_ns}" scale "${snap_kind}" "${snap_name}" --replicas="${wanted}"
  done < "${REPLICA_SNAPSHOT_FILE}"
  mark_checkpoint startup_replicas_restored
}

# Auto-heal: scale back to one replica every Deployment/StatefulSet that sits at
# zero replicas and carries a meta.helm.sh/release-name annotation, unless its
# namespace is startup-excluded or "namespace/name" matches
# STARTUP_IGNORE_WORKLOADS_REGEX. Records a checkpoint when anything was scaled.
restore_zero_scaled_helm_workloads() {
  local candidates ns kind name workload_kind
  local healed=0
  local columns='custom-columns=NS:.metadata.namespace,NAME:.metadata.name,REPLICAS:.spec.replicas,HELM:.metadata.annotations.meta\.helm\.sh/release-name'
  # One listing per kind; awk keeps the rows with replicas == 0 and a release
  # name, and emits "namespace<TAB>kind<TAB>name".
  candidates="$(
    for workload_kind in deployment statefulset; do
      kubectl get "${workload_kind}" -A -o "${columns}" --no-headers 2>/dev/null \
        | awk -v kind="${workload_kind}" '$3 ~ /^[0-9]+$/ && $3 == 0 && $4 != "<none>" {printf "%s\t%s\t%s\n", $1, kind, $2}'
    done | sed '/^[[:space:]]*$/d'
  )"
  while IFS=$'\t' read -r ns kind name; do
    if [[ -z "${ns}" || -z "${kind}" || -z "${name}" ]]; then
      continue
    fi
    if startup_workload_namespace_excluded "${ns}"; then
      continue
    fi
    if [[ -n "${STARTUP_IGNORE_WORKLOADS_REGEX}" ]] && [[ "${ns}/${name}" =~ ${STARTUP_IGNORE_WORKLOADS_REGEX} ]]; then
      continue
    fi
    warn "Auto-heal: restoring zero-scaled Helm workload ${ns}/${kind}/${name} to replicas=1."
    run kubectl -n "${ns}" scale "${kind}" "${name}" --replicas=1
    healed=$((healed + 1))
  done <<< "${candidates}"
  if (( healed == 0 )); then
    log "Auto-heal: no zero-scaled Helm workloads detected."
  else
    log "Auto-heal: restored ${healed} zero-scaled Helm workloads."
    mark_checkpoint startup_zero_scaled_helm_restored
  fi
}

# Print one line per Deployment/StatefulSet whose ready (and, for Deployments,
# available) replica count is below the desired count.
#
# Output format:
#   <ns>/deployment/<name>|ready=<n> available=<n> desired=<n>
#   <ns>/statefulset/<name>|ready=<n> desired=<n>
# Skipped: startup-excluded namespaces, "ns/name" matching
# STARTUP_IGNORE_WORKLOADS_REGEX, and workloads with desired replicas of 0.
# Non-numeric columns (e.g. "<none>") count as 0.
list_unhealthy_workloads() {
  local rows ns name desired ready available rest
  rows="$(kubectl get deployment -A -o custom-columns=NS:.metadata.namespace,NAME:.metadata.name,DESIRED:.spec.replicas,READY:.status.readyReplicas,AVAILABLE:.status.availableReplicas --no-headers 2>/dev/null || true)"
  # Split each row with a single `read` instead of forking awk once per column.
  # `rest` absorbs any unexpected trailing columns.
  while IFS=$' \t' read -r ns name desired ready available rest; do
    [[ -n "${ns}" ]] || continue
    startup_workload_namespace_excluded "${ns}" && continue
    [[ -n "${STARTUP_IGNORE_WORKLOADS_REGEX}" && "${ns}/${name}" =~ ${STARTUP_IGNORE_WORKLOADS_REGEX} ]] && continue
    [[ "${desired}" =~ ^[0-9]+$ ]] || desired=0
    [[ "${ready}" =~ ^[0-9]+$ ]] || ready=0
    [[ "${available}" =~ ^[0-9]+$ ]] || available=0
    (( desired > 0 )) || continue
    if (( ready < desired || available < desired )); then
      printf '%s/deployment/%s|ready=%s available=%s desired=%s\n' "${ns}" "${name}" "${ready}" "${available}" "${desired}"
    fi
  done <<< "${rows}"

  rows="$(kubectl get statefulset -A -o custom-columns=NS:.metadata.namespace,NAME:.metadata.name,DESIRED:.spec.replicas,READY:.status.readyReplicas --no-headers 2>/dev/null || true)"
  while IFS=$' \t' read -r ns name desired ready rest; do
    [[ -n "${ns}" ]] || continue
    startup_workload_namespace_excluded "${ns}" && continue
    [[ -n "${STARTUP_IGNORE_WORKLOADS_REGEX}" && "${ns}/${name}" =~ ${STARTUP_IGNORE_WORKLOADS_REGEX} ]] && continue
    [[ "${desired}" =~ ^[0-9]+$ ]] || desired=0
    [[ "${ready}" =~ ^[0-9]+$ ]] || ready=0
    (( desired > 0 )) || continue
    if (( ready < desired )); then
      printf '%s/statefulset/%s|ready=%s desired=%s\n' "${ns}" "${name}" "${ready}" "${desired}"
    fi
  done <<< "${rows}"
}

# Poll list_unhealthy_workloads until it reports nothing.
# Globals read: EXECUTE, STARTUP_WORKLOAD_TIMEOUT_SECONDS,
#               STARTUP_WORKLOAD_POLL_SECONDS
# Returns 0 when all workloads are ready (or in dry-run mode); calls die once
# the timeout has elapsed with workloads still unhealthy.
wait_for_startup_workloads_ready() {
  if [[ "${EXECUTE}" -eq 0 ]]; then
    log "DRY-RUN: skipping startup workload readiness checks"
    return 0
  fi
  # `line` is declared local so the reporting loop cannot clobber a variable of
  # the same name in a caller or in the global scope.
  local start now unhealthy line
  start="$(date +%s)"
  while true; do
    unhealthy="$(list_unhealthy_workloads || true)"
    if [[ -z "${unhealthy}" ]]; then
      log "startup-workloads=all-ready"
      return 0
    fi
    # Report the current offenders on every poll, not only on timeout.
    warn "startup-workloads-not-ready:"
    while IFS= read -r line; do
      [[ -n "${line}" ]] || continue
      warn "  ${line}"
    done <<< "${unhealthy}"
    now="$(date +%s)"
    if (( now - start >= STARTUP_WORKLOAD_TIMEOUT_SECONDS )); then
      die "Timed out waiting for startup workloads Ready after ${STARTUP_WORKLOAD_TIMEOUT_SECONDS}s."
    fi
    sleep "${STARTUP_WORKLOAD_POLL_SECONDS}"
  done
}

# Print a comma-separated list of Ready nodes that carry neither the
# control-plane nor the master role label.
discover_workers_csv() {
  local node_columns
  node_columns='custom-columns=NAME:.metadata.name,CP:.metadata.labels.node-role\.kubernetes\.io/control-plane,MASTER:.metadata.labels.node-role\.kubernetes\.io/master,READY:.status.conditions[?(@.type=="Ready")].status'
  # kubectl prints "<none>" for an absent label, which is what the awk filter
  # keys on for the two role columns.
  kubectl get nodes -o "${node_columns}" --no-headers \
    | awk '$2=="<none>" && $3=="<none>" && $4=="True" {print $1}' \
    | paste -sd, -
}

# Succeed when the named node's Ready condition is "True".
# Arguments: $1 - node name (an empty name is never ready)
node_is_ready() {
  local node="$1"
  local condition
  if [[ -z "${node}" ]]; then
    return 1
  fi
  # A failed lookup leaves the condition empty, which reads as "not ready".
  condition="$(kubectl get node "${node}" -o jsonpath='{range .status.conditions[?(@.type=="Ready")]}{.status}{end}' 2>/dev/null || true)"
  [[ "${condition}" == "True" ]]
}

# Print the name of one Ready arm64 worker node, preferring hardware=rpi5, then
# hardware=rpi4, then any hardware label. Within a tier the first listed node
# wins. Returns 1 when the listing is empty or nothing qualifies.
select_ready_arm64_worker() {
  local rows node hardware
  rows="$(kubectl get nodes -o 'custom-columns=NAME:.metadata.name,ARCH:.metadata.labels.kubernetes\.io/arch,WORKER:.metadata.labels.node-role\.kubernetes\.io/worker,HARDWARE:.metadata.labels.hardware,READY:.status.conditions[?(@.type=="Ready")].status' --no-headers 2>/dev/null || true)"
  if [[ -z "${rows}" ]]; then
    return 1
  fi
  # An empty preference means "any hardware label".
  for hardware in rpi5 rpi4 ""; do
    node="$(printf '%s\n' "${rows}" \
      | awk -v hw="${hardware}" '$2=="arm64" && $3=="true" && (hw=="" || $4==hw) && $5=="True" {print $1; exit}')"
    if [[ -n "${node}" ]]; then
      printf '%s' "${node}"
      return 0
    fi
  done
  return 1
}

# Print the alphabetically first kubernetes.io/hostname value found in any
# nodeSelector under the harbor HelmRelease's spec.values; prints nothing when
# there is none.
discover_harbor_pinned_node() {
  local hostname_path
  hostname_path='jsonpath={range .spec.values..nodeSelector}{.kubernetes\.io/hostname}{"\n"}{end}'
  # Drop blank rows (selectors without a hostname) before de-duplicating.
  kubectl -n harbor get helmrelease harbor -o "${hostname_path}" 2>/dev/null \
    | sed '/^[[:space:]]*$/d' \
    | sort -u \
    | head -n 1
}

# Make sure HARBOR_TARGET_NODE names a Ready node.
# Resolution order: the configured node, then the node pinned in the live
# Harbor HelmRelease, then any Ready arm64 worker. Dies when none is available.
# Globals written: HARBOR_TARGET_NODE
ensure_harbor_target_node() {
  local live_pin substitute
  if node_is_ready "${HARBOR_TARGET_NODE}"; then
    return 0
  fi

  live_pin="$(discover_harbor_pinned_node || true)"
  if node_is_ready "${live_pin}"; then
    # Warn only when an explicitly configured node is being overridden.
    if [[ -z "${HARBOR_TARGET_NODE}" ]]; then
      log "harbor-target-node discovered from live HelmRelease: ${live_pin}"
    else
      warn "Configured harbor target node '${HARBOR_TARGET_NODE}' is not Ready; using live Harbor pin '${live_pin}' instead."
    fi
    HARBOR_TARGET_NODE="${live_pin}"
    return 0
  fi

  substitute="$(select_ready_arm64_worker || true)"
  if [[ -z "${substitute}" ]]; then
    die "No Ready arm64 worker available for Harbor bootstrap target."
  fi
  if [[ -z "${HARBOR_TARGET_NODE}" ]]; then
    log "harbor-target-node auto-selected: ${substitute}"
  else
    warn "Configured harbor target node '${HARBOR_TARGET_NODE}' is not Ready; using '${substitute}' instead."
  fi
  HARBOR_TARGET_NODE="${substitute}"
}

# Move the "<HARBOR_HOST_LABEL_KEY>=true" label onto HARBOR_TARGET_NODE: the
# label is removed from every other node that carries it, then (re)applied to
# the target with --overwrite. Dies when no target node is set.
ensure_harbor_host_label() {
  if [[ -z "${HARBOR_TARGET_NODE}" ]]; then
    die "Harbor target node is not set."
  fi
  local current_holders holder
  current_holders="$(kubectl get nodes -l "${HARBOR_HOST_LABEL_KEY}=true" -o jsonpath='{range .items[*]}{.metadata.name}{"\n"}{end}' 2>/dev/null || true)"
  while IFS= read -r holder; do
    if [[ -z "${holder}" ]]; then
      continue
    fi
    if [[ "${holder}" == "${HARBOR_TARGET_NODE}" ]]; then
      continue
    fi
    # A trailing "-" after the key tells kubectl to remove the label.
    run kubectl label node "${holder}" "${HARBOR_HOST_LABEL_KEY}-"
  done <<< "${current_holders}"
  run kubectl label node "${HARBOR_TARGET_NODE}" "${HARBOR_HOST_LABEL_KEY}=true" --overwrite
}

# Split a comma-separated string into the array variable named by the caller.
# Arguments:
#   $1 - CSV text (an empty string yields an empty array)
#   $2 - name of the array variable to fill; must be a valid identifier
# Returns: 0 on success, 2 when $2 is not a valid variable name.
#
# The array is filled directly by `read -a`, so no `eval` runs on the
# caller-supplied name and no helper global is left behind. The `IFS=','`
# prefix applies to that single `read` only, so the shell's IFS is untouched.
# Names that collide with this function's locals (csv, out_var) cannot be used.
as_array_from_csv() {
  local csv="$1"
  local out_var="$2"
  if [[ ! "${out_var}" =~ ^[A-Za-z_][A-Za-z0-9_]*$ ]]; then
    printf 'as_array_from_csv: invalid array name: %s\n' "${out_var}" >&2
    return 2
  fi
  IFS=',' read -r -a "${out_var}" <<< "${csv}"
}

# Cordon and drain each worker, escalating per node only when the previous
# attempt failed: plain drain, then --force, then --force --disable-eviction
# (the final attempt tolerates its own failure inside the shell string).
# Arguments: $1 - per-attempt drain timeout in seconds, $2.. - node names
best_effort_drain_workers() {
  local timeout_seconds="$1"
  shift || true
  local workers=("$@")
  local node drain_cmd
  for node in "${workers[@]}"; do
    if [[ -z "${node}" ]]; then
      continue
    fi
    run kubectl cordon "${node}"
    drain_cmd="kubectl drain ${node} --ignore-daemonsets --delete-emptydir-data --grace-period=30 --timeout=${timeout_seconds}s"
    run_shell "${drain_cmd}" && continue
    warn "Gentle drain timed out for ${node}; retrying with --force."
    run_shell "${drain_cmd} --force" && continue
    warn "Force drain timed out for ${node}; final attempt with --disable-eviction."
    run_shell "${drain_cmd} --force --disable-eviction || true"
  done
}

# Block on `kubectl rollout status` for one workload.
# Arguments: $1 - namespace, $2 - kind, $3 - name, $4 - kubectl timeout (e.g. 5m)
# Returns:   0 in dry-run mode (command is only logged), else kubectl's status.
wait_for_rollout() {
  local namespace="$1" kind="$2" name="$3" timeout="$4"
  local target="${kind}/${name}"
  if [[ "${EXECUTE}" -eq 0 ]]; then
    log "DRY-RUN: kubectl -n ${namespace} rollout status ${target} --timeout=${timeout}"
    return 0
  fi
  kubectl -n "${namespace}" rollout status "${target}" --timeout="${timeout}"
}

# Verify the ingress stack: the traefik IngressClass must exist and the traefik
# deployment must finish rolling out within 5 minutes.
# Returns non-zero as soon as either step fails. The explicit `|| return` keeps
# that true when the function is called from a conditional context (`if`,
# `&&`, `||`), where `set -e` does not stop on the first failing step.
check_ingress_stack() {
  kubectl get ingressclass traefik >/dev/null || return
  wait_for_rollout traefik deployment traefik 5m
}

# Verify the Longhorn stack: the longhorn-manager DaemonSet and the longhorn-ui
# deployment must each finish rolling out within 10 minutes.
# Returns non-zero as soon as a rollout fails. The explicit `|| return` keeps
# that true when the function is called from a conditional context, where
# `set -e` does not stop on the first failing step.
check_longhorn_stack() {
  wait_for_rollout longhorn-system daemonset longhorn-manager 10m || return
  wait_for_rollout longhorn-system deployment longhorn-ui 10m
}

# Verify the Vault stack: the vault StatefulSet must roll out within 10 minutes
# and, when executing for real, `vault status` must succeed inside vault-0.
# Returns non-zero when the rollout or the status probe fails. The explicit
# `|| return` stops a failed rollout from being masked by the later steps when
# the function is called from a conditional context.
check_vault_stack() {
  wait_for_rollout vault statefulset vault 10m || return
  if [[ "${EXECUTE}" -eq 1 ]]; then
    kubectl -n vault exec vault-0 -- sh -ceu 'VAULT_ADDR=http://127.0.0.1:8200 vault status >/dev/null'
  fi
}

# Verify the Postgres stack: the postgres StatefulSet must roll out within 10
# minutes and, when executing for real, `pg_isready` must succeed inside the
# postgres container of postgres-0.
# Returns non-zero when the rollout or the readiness probe fails. The explicit
# `|| return` stops a failed rollout from being masked by the later steps when
# the function is called from a conditional context.
check_postgres_stack() {
  wait_for_rollout postgres statefulset postgres 10m || return
  if [[ "${EXECUTE}" -eq 1 ]]; then
    kubectl -n postgres exec postgres-0 -c postgres -- sh -ceu 'pg_isready -h 127.0.0.1 -p 5432 >/dev/null'
  fi
}

# Verify the Gitea stack: the gitea deployment must finish rolling out within
# 10 minutes. Returns the status of that rollout wait.
check_gitea_stack() {
  local -r rollout_timeout=10m
  wait_for_rollout gitea deployment gitea "${rollout_timeout}"
}

# Verify the Harbor stack: redis (StatefulSet) plus the core, jobservice, portal
# and registry deployments must each finish rolling out within 10 minutes.
# Returns non-zero as soon as one rollout fails. Without the explicit
# `|| return`, a call from a conditional context (where `set -e` is inactive)
# would report only the status of the last rollout.
check_harbor_stack() {
  wait_for_rollout harbor statefulset harbor-redis 10m || return
  wait_for_rollout harbor deployment harbor-core 10m || return
  wait_for_rollout harbor deployment harbor-jobservice 10m || return
  wait_for_rollout harbor deployment harbor-portal 10m || return
  wait_for_rollout harbor deployment harbor-registry 10m
}

# Decide whether a captured HTTP response looks like a registry API answer
# rather than an HTML page served in its place.
# Arguments:
#   $1 - HTTP status code
#   $2 - file holding the response headers
#   $3 - file holding the response body
# Returns 0 when the status is 200/401, the content type is not text/html, and
# either a Docker-Distribution-Api-Version header is present or a 401 body
# mentions "unauthorized"/"authentication required". Returns 1 otherwise.
harbor_registry_response_valid() {
  local code="$1"
  local headers_file="$2"
  local body_file="$3"
  local content_type
  case "${code}" in
    200|401) ;;
    *) return 1 ;;
  esac
  # Match the header name case-insensitively via tolower(). IGNORECASE is a
  # gawk-only extension; other awks ignore it, so a "Content-Type: text/html"
  # header would go unnoticed and an HTML 401 page could pass the checks below.
  content_type="$(awk 'tolower($0) ~ /^content-type:/ {print tolower($0); exit}' "${headers_file}" 2>/dev/null || true)"
  if [[ "${content_type}" == *"text/html"* ]]; then
    return 1
  fi
  if grep -Eiq '^docker-distribution-api-version:' "${headers_file}" 2>/dev/null; then
    return 0
  fi
  if [[ "${code}" == "401" ]] && grep -Eiq 'unauthorized|authentication required' "${body_file}" 2>/dev/null; then
    return 0
  fi
  return 1
}

# Probe https://registry.bstein.dev/v2/ and decide whether the registry API is
# answering.
# Arguments: $1 - "1" to suppress the success/failure message (default 0)
# Globals read: EXECUTE, STARTUP_SERVICE_CHECK_TIMEOUT_SECONDS
# Returns: 0 in dry-run mode or when curl succeeds and
#          harbor_registry_response_valid accepts the response; 1 otherwise.
harbor_endpoint_is_ready() {
  local quiet="${1:-0}"
  if [[ "${EXECUTE}" -eq 0 ]]; then
    log "DRY-RUN: curl -ksS -o /dev/null -w '%{http_code}' https://registry.bstein.dev/v2/"
    return 0
  fi
  local headers_file body_file code content_type
  local rc=0
  local result=1
  headers_file="$(mktemp)"
  body_file="$(mktemp)"
  # Capture curl's exit status outside the command substitution: an assignment
  # made inside $(...) runs in a subshell and never reaches this shell, which
  # would leave rc at 0 even when curl failed.
  code="$(curl -ksS --max-time "${STARTUP_SERVICE_CHECK_TIMEOUT_SECONDS}" -D "${headers_file}" -o "${body_file}" -w '%{http_code}' https://registry.bstein.dev/v2/)" || rc=$?
  # Used for the diagnostic message only. tolower() keeps the match
  # case-insensitive on every awk (IGNORECASE is a gawk-only extension).
  content_type="$(awk 'tolower($0) ~ /^content-type:/ {print tolower($0); exit}' "${headers_file}" 2>/dev/null || true)"
  if (( rc == 0 )) && harbor_registry_response_valid "${code}" "${headers_file}" "${body_file}"; then
    [[ "${quiet}" == "1" ]] || log "harbor-endpoint=http-${code} registry-api=true"
    result=0
  else
    [[ "${quiet}" == "1" ]] || warn "Harbor registry API check failed: http=${code:-unknown} content-type=${content_type:-unknown} rc=${rc}"
  fi
  rm -f "${headers_file}" "${body_file}"
  return "${result}"
}

# Abort via die unless the Harbor registry endpoint passes the (non-quiet)
# readiness probe.
check_harbor_endpoint() {
  harbor_endpoint_is_ready 0 \
    || die "Harbor endpoint is not serving the registry API."
}

# Poll a pod's status.phase every 2 seconds.
# Arguments: $1 - namespace, $2 - pod name, $3 - expected phase,
#            $4 - timeout in seconds
# Returns:   0 once the phase equals the expected one; 1 if the pod reports
#            Failed (and Failed was not the expected phase) or on timeout.
wait_for_pod_phase() {
  local namespace="$1"
  local pod="$2"
  local expected_phase="$3"
  local timeout_seconds="$4"
  local began current observed
  began="$(date +%s)"
  while true; do
    observed="$(kubectl -n "${namespace}" get pod "${pod}" -o jsonpath='{.status.phase}' 2>/dev/null || true)"
    # The expected phase is tested first, so waiting for "Failed" succeeds.
    case "${observed}" in
      "${expected_phase}") return 0 ;;
      Failed) return 1 ;;
    esac
    current="$(date +%s)"
    if (( current - began >= timeout_seconds )); then
      return 1
    fi
    sleep 2
  done
}

# Succeed when the four Harbor deployments (core, jobservice, portal, registry)
# exist and the quiet registry endpoint probe passes.
harbor_is_ready() {
  if ! kubectl -n harbor get deploy harbor-core harbor-jobservice harbor-portal harbor-registry >/dev/null 2>&1; then
    return 1
  fi
  harbor_endpoint_is_ready 1
}

# Prove that a node can pull from Harbor by running a one-shot pod that uses
# HARBOR_CANARY_IMAGE with imagePullPolicy Always.
#
# Globals read:    HARBOR_CANARY_IMAGE, NODE_HELPER_NAMESPACE,
#                  REGISTRY_PULL_SECRET, EXECUTE
# Globals written: HARBOR_CANARY_NODE (replaced when the configured node is not
#                  Ready); HARBOR_TARGET_NODE may change via
#                  ensure_harbor_target_node
# Returns: 0 in dry-run mode; 1 when the pod does not reach Succeeded within
#          180s (after dumping describe/logs to stderr).
run_harbor_pull_canary() {
  local pod="ananke-harbor-canary"
  local canary_node="${HARBOR_CANARY_NODE}"
  # Fall back to the Harbor target node when the configured canary node is not
  # Ready; the warning is only emitted if a canary node had been configured.
  if ! node_is_ready "${canary_node}"; then
    ensure_harbor_target_node
    canary_node="${HARBOR_TARGET_NODE}"
    if [[ -n "${HARBOR_CANARY_NODE}" ]]; then
      warn "Configured harbor canary node '${HARBOR_CANARY_NODE}' is not Ready; using '${canary_node}'."
    fi
    HARBOR_CANARY_NODE="${canary_node}"
  fi
  if [[ "${EXECUTE}" -eq 0 ]]; then
    log "DRY-RUN: create Harbor pull canary pod with ${HARBOR_CANARY_IMAGE} on ${canary_node}"
    return 0
  fi
  # Best-effort removal of a pod left over from an earlier run; the delete is
  # not waited on (--wait=false) and is capped at 20 seconds.
  timeout 20 kubectl -n "${NODE_HELPER_NAMESPACE}" delete pod "${pod}" --ignore-not-found --wait=false >/dev/null 2>&1 || true
  # The manifest sets spec.nodeName directly and tolerates every taint.
  cat <<CANARY | kubectl apply -f -
apiVersion: v1
kind: Pod
metadata:
  name: ${pod}
  namespace: ${NODE_HELPER_NAMESPACE}
spec:
  nodeName: ${canary_node}
  restartPolicy: Never
  imagePullSecrets:
    - name: ${REGISTRY_PULL_SECRET}
  tolerations:
    - operator: Exists
  containers:
    - name: canary
      image: ${HARBOR_CANARY_IMAGE}
      imagePullPolicy: Always
      command: ["sh", "-ceu", "echo harbor-canary-ok"]
CANARY
  # Failure path: dump diagnostics to stderr, clean up, and report failure.
  if ! wait_for_pod_phase "${NODE_HELPER_NAMESPACE}" "${pod}" Succeeded 180; then
    kubectl -n "${NODE_HELPER_NAMESPACE}" describe pod "${pod}" >&2 || true
    timeout 20 kubectl -n "${NODE_HELPER_NAMESPACE}" logs "${pod}" >&2 || true
    timeout 20 kubectl -n "${NODE_HELPER_NAMESPACE}" delete pod "${pod}" --ignore-not-found --wait=false >/dev/null 2>&1 || true
    return 1
  fi
  # Success path: print the pod's logs, then clean up (both best-effort).
  timeout 20 kubectl -n "${NODE_HELPER_NAMESPACE}" logs "${pod}" || true
  timeout 20 kubectl -n "${NODE_HELPER_NAMESPACE}" delete pod "${pod}" --ignore-not-found --wait=false >/dev/null 2>&1 || true
}

# Run a script on a specific node inside a one-shot privileged helper pod
# (hostNetwork + hostPID, tolerating every taint).
#
# Arguments:
#   $1 - node name (set as spec.nodeName)
#   $2 - purpose; passed through sanitize_name to build the pod name
#   $3 - seconds to wait for the pod to reach Succeeded
#   $4 - script text to run inside the pod
# Globals read: NODE_HELPER_NAMESPACE, NODE_HELPER_SERVICE_ACCOUNT,
#               REGISTRY_PULL_SECRET, NODE_HELPER_IMAGE, EXECUTE
# Returns: 0 in dry-run mode; 1 when the pod does not reach Succeeded in time
#          (after dumping describe/logs to stderr).
run_helper_pod() {
  local node="$1"
  local purpose="$2"
  local timeout_seconds="$3"
  local script_content="$4"
  # The pod name carries a HHMMSS suffix taken from the current time.
  local pod="ananke-$(sanitize_name "${purpose}")-$(date +%H%M%S)"
  local encoded_script
  # The script is embedded base64-encoded so its content cannot break the YAML
  # manifest or the shell quoting inside it.
  encoded_script="$(printf '%s' "${script_content}" | base64 -w0)"

  if [[ "${EXECUTE}" -eq 0 ]]; then
    log "DRY-RUN: helper pod ${pod} on ${node} for ${purpose}"
    return 0
  fi

  # The container decodes the script to /tmp/ananke-step.sh and executes it.
  cat <<POD | kubectl apply -f -
apiVersion: v1
kind: Pod
metadata:
  name: ${pod}
  namespace: ${NODE_HELPER_NAMESPACE}
spec:
  nodeName: ${node}
  restartPolicy: Never
  serviceAccountName: ${NODE_HELPER_SERVICE_ACCOUNT}
  imagePullSecrets:
    - name: ${REGISTRY_PULL_SECRET}
  hostNetwork: true
  hostPID: true
  tolerations:
    - operator: Exists
  containers:
    - name: helper
      image: ${NODE_HELPER_IMAGE}
      imagePullPolicy: IfNotPresent
      securityContext:
        privileged: true
      command: ["/bin/bash", "-ceu"]
      args:
        - |
          printf '%s' '${encoded_script}' | base64 -d >/tmp/ananke-step.sh
          chmod +x /tmp/ananke-step.sh
          /tmp/ananke-step.sh
POD

  # Failure path: dump diagnostics to stderr, clean up, and report failure.
  if ! wait_for_pod_phase "${NODE_HELPER_NAMESPACE}" "${pod}" Succeeded "${timeout_seconds}"; then
    kubectl -n "${NODE_HELPER_NAMESPACE}" describe pod "${pod}" >&2 || true
    timeout 20 kubectl -n "${NODE_HELPER_NAMESPACE}" logs "${pod}" >&2 || true
    timeout 20 kubectl -n "${NODE_HELPER_NAMESPACE}" delete pod "${pod}" --ignore-not-found --wait=false >/dev/null 2>&1 || true
    return 1
  fi
  # Success path: print the pod's logs, then clean up (both best-effort).
  timeout 20 kubectl -n "${NODE_HELPER_NAMESPACE}" logs "${pod}" || true
  timeout 20 kubectl -n "${NODE_HELPER_NAMESPACE}" delete pod "${pod}" --ignore-not-found --wait=false >/dev/null 2>&1 || true
}

# Print the name of the first Running app=node-image-sweeper pod on the given
# node, or nothing when there is none. Lookup errors are swallowed, so the
# function always returns 0.
# Arguments: $1 - node name
hostroot_pod_for_node() {
  local node="$1"
  local selector="spec.nodeName=${node},status.phase=Running"
  kubectl -n "${NODE_HELPER_NAMESPACE}" get pods -l app=node-image-sweeper --field-selector "${selector}" -o jsonpath='{.items[0].metadata.name}' 2>/dev/null \
    || true
}

# Run a script chrooted into /host through the node's hostroot pod (the pod
# returned by hostroot_pod_for_node).
# Arguments: $1 - node name, $2 - purpose (used in the dry-run log only),
#            $3 - exec timeout in seconds, $4 - script text
# Returns:   1 when the node has no hostroot pod; 0 in dry-run mode; otherwise
#            the status of the time-limited kubectl exec.
run_hostroot_pod_script() {
  local node="$1"
  local purpose="$2"
  local timeout_seconds="$3"
  local script_content="$4"
  local hostroot_pod payload remote_script
  hostroot_pod="$(hostroot_pod_for_node "${node}")"
  if [[ -z "${hostroot_pod}" ]]; then
    return 1
  fi
  # Ship the script base64-encoded so its quoting survives the remote shell.
  payload="$(printf '%s' "${script_content}" | base64 -w0)"
  if [[ "${EXECUTE}" -eq 0 ]]; then
    log "DRY-RUN: hostroot pod ${hostroot_pod} on ${node} for ${purpose}"
    return 0
  fi
  remote_script="printf '%s' '${payload}' | base64 -d | chroot /host /bin/sh -seu"
  timeout "${timeout_seconds}" kubectl -n "${NODE_HELPER_NAMESPACE}" exec "${hostroot_pod}" -- /bin/sh -ceu "${remote_script}"
}

# Import the zstd-compressed image bundle HARBOR_BUNDLE_FILE into a node's k3s
# containerd store through the node's hostroot pod, then check that every
# expected image reference is listed.
#
# Arguments:
#   $1 - node name
#   $2 - timeout in seconds for the streaming import
#   $3 - newline-separated image references to (optionally) remove and to verify
# Globals read: NODE_HELPER_NAMESPACE, HARBOR_BUNDLE_FILE,
#               REFRESH_BOOTSTRAP_IMAGE_ALIASES, EXECUTE
# Returns: 1 when the node has no hostroot pod; 0 in dry-run mode; otherwise the
#          status of the final verification exec.
run_hostroot_pod_bundle_import() {
  local node="$1"
  local timeout_seconds="$2"
  local images_text="$3"
  local pod refresh_script verify_script encoded_script
  pod="$(hostroot_pod_for_node "${node}")"
  [[ -n "${pod}" ]] || return 1
  if [[ "${EXECUTE}" -eq 0 ]]; then
    log "DRY-RUN: stream ${HARBOR_BUNDLE_FILE} through hostroot pod ${pod} on ${node}"
    return 0
  fi
  # Optional pre-step: remove each listed image reference before importing.
  # Individual removals are best-effort (`|| true` inside the script).
  if [[ "${REFRESH_BOOTSTRAP_IMAGE_ALIASES}" == "1" ]]; then
    # Outer heredoc is unquoted: ${images_text} is expanded here, while the
    # escaped \${image} is left for the remote shell.
    refresh_script=$(cat <<SCRIPT
set -eu
while IFS= read -r image; do
  [ -z "\${image}" ] && continue
  /usr/local/bin/k3s ctr images rm "\${image}" >/dev/null 2>&1 || true
done <<'IMAGES'
${images_text}
IMAGES
SCRIPT
)
    encoded_script="$(printf '%s' "${refresh_script}" | base64 -w0)"
    timeout 120 kubectl -n "${NODE_HELPER_NAMESPACE}" exec "${pod}" -- /bin/sh -ceu "printf '%s' '${encoded_script}' | base64 -d | chroot /host /bin/sh -seu"
  fi
  # Stream the bundle over the exec's stdin; it is decompressed and imported
  # inside the host chroot.
  timeout "${timeout_seconds}" kubectl -n "${NODE_HELPER_NAMESPACE}" exec -i "${pod}" -- \
    chroot /host /bin/sh -ceu '/usr/bin/zstd -dc | /usr/local/bin/k3s ctr images import -' < "${HARBOR_BUNDLE_FILE}"
  # Verification: each listed reference must appear verbatim (grep -Fx) in the
  # first column of `k3s ctr images ls`.
  # NOTE(review): the script runs under `set -eu`, so a missing image
  # presumably aborts it with a non-zero status; confirm.
  verify_script=$(cat <<SCRIPT
set -eu
while IFS= read -r image; do
  [ -z "\${image}" ] && continue
  /usr/local/bin/k3s ctr images ls | awk '{print \$1}' | grep -Fx "\${image}" >/dev/null
done <<'IMAGES'
${images_text}
IMAGES
SCRIPT
)
  encoded_script="$(printf '%s' "${verify_script}" | base64 -w0)"
  timeout 120 kubectl -n "${NODE_HELPER_NAMESPACE}" exec "${pod}" -- /bin/sh -ceu "printf '%s' '${encoded_script}' | base64 -d | chroot /host /bin/sh -seu"
}

# Wrap a host command in a small nsenter script and hand it to run_helper_pod.
# Arguments: $1 node name, $2 purpose label, $3 timeout in seconds,
#            $4 command line to run in the namespaces of PID 1.
# The command is embedded base64-encoded and decoded inside the generated
# script, so its own quoting does not have to survive the wrapper.
run_host_command_via_helper() {
  local node="$1"
  local purpose="$2"
  local timeout_seconds="$3"
  local host_command="$4"
  local encoded_command script_content
  local -a script_lines
  encoded_command="$(printf '%s' "${host_command}" | base64 -w0)"
  # Single-quoted entries are deliberate: they must reach the helper unexpanded.
  script_lines=(
    'set -euo pipefail'
    "HOST_COMMAND=\"\$(printf '%s' '${encoded_command}' | base64 -d)\""
    'nsenter --target 1 --mount --uts --ipc --net --pid /bin/sh -ceu "${HOST_COMMAND}"'
  )
  script_content="$(printf '%s\n' "${script_lines[@]}")"
  run_helper_pod "${node}" "${purpose}" "${timeout_seconds}" "${script_content}"
}

# Run a host command by exec'ing into the prewarm DaemonSet pod found on a node.
# Arguments: $1 node name, $2 command line to run in the namespaces of PID 1.
# Returns 1 when no prewarm pod is found on the node, which lets callers fall
# back to another execution path. Dry-run only logs and returns 0.
run_host_command_via_prewarm_pod() {
  local node="$1"
  local host_command="$2"
  local prewarm_pod command_b64 exec_snippet
  prewarm_pod="$(
    kubectl -n "${NODE_HELPER_NAMESPACE}" get pods \
      -l app="${NODE_HELPER_PREWARM_DS}" \
      --field-selector "spec.nodeName=${node}" \
      -o jsonpath='{.items[0].metadata.name}' 2>/dev/null || true
  )"
  [[ -n "${prewarm_pod}" ]] || return 1
  command_b64="$(printf '%s' "${host_command}" | base64 -w0)"
  if [[ "${EXECUTE}" -eq 0 ]]; then
    log "DRY-RUN: helper exec via ${prewarm_pod} on ${node}"
    return 0
  fi
  # Decoded inside the pod, then executed on the host through nsenter.
  exec_snippet="HOST_COMMAND=\$(printf '%s' '${command_b64}' | base64 -d); nsenter --target 1 --mount --uts --ipc --net --pid /bin/sh -ceu \"\${HOST_COMMAND}\""
  run kubectl -n "${NODE_HELPER_NAMESPACE}" exec "${prewarm_pod}" -- /bin/bash -ceu "${exec_snippet}"
}

# Schedule a delayed "stop service, then power off" on a node's host.
# Arguments: $1 node name, $2 systemd service to stop first, $3 delay (seconds).
# The action is registered with systemd-run as a transient timer unit. The
# prewarm pod path is tried first; the dedicated helper path is the fallback.
schedule_host_shutdown_via_helper() {
  local node="$1"
  local service_name="$2"
  local delay_seconds="$3"
  local unit_name="ananke-shutdown-${service_name}"
  local delayed_action="/usr/bin/systemctl stop ${service_name} || true; /usr/bin/systemctl poweroff || true"
  local host_command="/usr/bin/systemd-run --unit ${unit_name} --on-active=${delay_seconds}s /bin/sh -lc '${delayed_action}'"
  run_host_command_via_prewarm_pod "${node}" "${host_command}" && return 0
  run_host_command_via_helper "${node}" "shutdown-${node}-${service_name}" 120 "${host_command}"
}

# Schedule a delayed stop of a systemd service on a node's host, leaving the
# host powered on.
# Arguments: $1 node name, $2 systemd service to stop, $3 delay (seconds).
# The prewarm pod path is tried first; the dedicated helper path is the fallback.
schedule_host_service_stop_via_helper() {
  local node="$1"
  local service_name="$2"
  local delay_seconds="$3"
  local unit_name="ananke-stop-${service_name}"
  local delayed_action="/usr/bin/systemctl stop ${service_name} || true"
  local host_command="/usr/bin/systemd-run --unit ${unit_name} --on-active=${delay_seconds}s /bin/sh -lc '${delayed_action}'"
  run_host_command_via_prewarm_pod "${node}" "${host_command}" && return 0
  run_host_command_via_helper "${node}" "stop-${node}-${service_name}" 120 "${host_command}"
}

# Pre-pull NODE_HELPER_IMAGE by applying a DaemonSet named after
# NODE_HELPER_PREWARM_DS and waiting until every scheduled pod is Ready.
# Globals read: NODE_HELPER_PREWARM_DS, NODE_HELPER_NAMESPACE, NODE_HELPER_IMAGE,
#               REGISTRY_PULL_SECRET, KEEP_PREWARM_DAEMONSET, EXECUTE.
# Dry-run only logs. When Ready nodes can be listed, scheduling is pinned to
# them through node affinity; otherwise the DaemonSet is applied without it.
# On success the DaemonSet is deleted unless KEEP_PREWARM_DAEMONSET is non-zero.
# After 90 polls (2 seconds apart) without convergence it prints diagnostics to
# stderr, deletes the DaemonSet and calls die.
prewarm_node_helper_image() {
  local name="${NODE_HELPER_PREWARM_DS}"
  local ready_nodes ready_node
  local node_affinity_block=""
  local -a ready_node_list
  local attempt desired ready
  if [[ "${EXECUTE}" -eq 0 ]]; then
    log "DRY-RUN: prewarm ${NODE_HELPER_IMAGE} via temporary DaemonSet"
    return 0
  fi
  ready_nodes="$(
    kubectl get nodes \
      -o 'custom-columns=NAME:.metadata.name,READY:.status.conditions[?(@.type=="Ready")].status' \
      --no-headers 2>/dev/null \
      | awk '$2=="True" {print $1}' || true
  )"
  if [[ -z "${ready_nodes}" ]]; then
    warn "Unable to detect Ready nodes for prewarm targeting; continuing without node affinity."
  else
    node_affinity_block=$'      affinity:\n        nodeAffinity:\n          requiredDuringSchedulingIgnoredDuringExecution:\n            nodeSelectorTerms:\n            - matchExpressions:\n              - key: kubernetes.io/hostname\n                operator: In\n                values:'
    mapfile -t ready_node_list <<< "${ready_nodes}"
    for ready_node in "${ready_node_list[@]}"; do
      if [[ -n "${ready_node}" ]]; then
        node_affinity_block+=$'\n'"                - ${ready_node}"
      fi
    done
    log "node-helper-prewarm-targets=$(printf '%s' "${ready_nodes}" | paste -sd, -)"
  fi
  cat <<DS | kubectl apply -f -
apiVersion: apps/v1
kind: DaemonSet
metadata:
  name: ${name}
  namespace: ${NODE_HELPER_NAMESPACE}
spec:
  selector:
    matchLabels:
      app: ${name}
  template:
    metadata:
      labels:
        app: ${name}
    spec:
      imagePullSecrets:
        - name: ${REGISTRY_PULL_SECRET}
${node_affinity_block}
      tolerations:
        - operator: Exists
      containers:
        - name: helper
          image: ${NODE_HELPER_IMAGE}
          imagePullPolicy: IfNotPresent
          command: ["/bin/sh", "-ceu", "sleep 300"]
DS
  for ((attempt = 1; attempt <= 90; attempt++)); do
    desired="$(kubectl -n "${NODE_HELPER_NAMESPACE}" get ds "${name}" -o jsonpath='{.status.desiredNumberScheduled}' 2>/dev/null || echo 0)"
    ready="$(kubectl -n "${NODE_HELPER_NAMESPACE}" get ds "${name}" -o jsonpath='{.status.numberReady}' 2>/dev/null || echo 0)"
    desired="${desired:-0}"
    ready="${ready:-0}"
    # Keep polling until at least one pod is desired and all of them are Ready.
    if [[ "${desired}" == "0" || "${desired}" != "${ready}" ]]; then
      sleep 2
      continue
    fi
    log "node-helper-prewarm=${ready}/${desired}"
    if [[ "${KEEP_PREWARM_DAEMONSET}" -eq 0 ]]; then
      kubectl -n "${NODE_HELPER_NAMESPACE}" delete ds "${name}" --ignore-not-found >/dev/null 2>&1 || true
    else
      log "Keeping ${name} DaemonSet running for shutdown helper exec path."
    fi
    return 0
  done
  kubectl -n "${NODE_HELPER_NAMESPACE}" describe ds "${name}" >&2 || true
  kubectl -n "${NODE_HELPER_NAMESPACE}" get pods -l app="${name}" >&2 || true
  kubectl -n "${NODE_HELPER_NAMESPACE}" delete ds "${name}" --ignore-not-found >/dev/null 2>&1 || true
  die "Timed out prewarming node helper image ${NODE_HELPER_IMAGE}"
}

# Delete the prewarm DaemonSet if it exists. Deletion is best effort: output is
# discarded and failures are ignored. Dry-run only logs the intent.
cleanup_prewarm_daemonset() {
  local ds_name="${NODE_HELPER_PREWARM_DS}"
  if [[ "${EXECUTE}" -eq 0 ]]; then
    log "DRY-RUN: cleanup ${ds_name} DaemonSet"
    return 0
  fi
  kubectl -n "${NODE_HELPER_NAMESPACE}" delete ds "${ds_name}" --ignore-not-found \
    >/dev/null 2>&1 || true
}

# Serve the directory holding HARBOR_BUNDLE_FILE over HTTP on BUNDLE_HTTP_PORT
# using python3's http.server, running in the background.
# Records the server PID in BUNDLE_SERVER_PID and probes the bundle URL on
# localhost up to 20 times (1 second apart) before giving up via die.
# Dry-run only logs what would be served.
start_bundle_server() {
  if [[ ! -f "${HARBOR_BUNDLE_FILE}" ]]; then
    die "Bootstrap bundle not found at ${HARBOR_BUNDLE_FILE}"
  fi
  require_cmd python3
  local serve_dir served_file probe
  serve_dir="$(dirname "${HARBOR_BUNDLE_FILE}")"
  served_file="$(basename "${HARBOR_BUNDLE_FILE}")"
  if [[ "${EXECUTE}" -eq 0 ]]; then
    log "DRY-RUN: serve ${served_file} from ${serve_dir} on port ${BUNDLE_HTTP_PORT}"
    return 0
  fi
  python3 -m http.server "${BUNDLE_HTTP_PORT}" --bind 0.0.0.0 --directory "${serve_dir}" \
    </dev/null >/tmp/ananke-bundle-server.log 2>&1 &
  BUNDLE_SERVER_PID=$!
  for ((probe = 1; probe <= 20; probe++)); do
    if curl -fsS "http://127.0.0.1:${BUNDLE_HTTP_PORT}/${served_file}" >/dev/null 2>&1; then
      return 0
    fi
    sleep 1
  done
  die "Temporary bundle server did not become ready; see /tmp/ananke-bundle-server.log"
}

# Stop the background bundle HTTP server recorded in BUNDLE_SERVER_PID, if any.
# Sends the default kill signal, waits up to 10 seconds for the process to go
# away, then clears BUNDLE_SERVER_PID so repeated calls are no-ops.
# This function is also the EXIT trap, so it must be safe on every exit path:
# BUNDLE_SERVER_PID is read with a default so that `set -u` does not turn an
# early exit (before the variable was ever assigned) into an "unbound
# variable" failure inside the trap.
stop_bundle_server() {
  local server_pid="${BUNDLE_SERVER_PID:-}"
  local wait_round
  if [[ -z "${server_pid}" ]]; then
    return 0
  fi
  kill "${server_pid}" >/dev/null 2>&1 || true
  for ((wait_round = 1; wait_round <= 10; wait_round++)); do
    kill -0 "${server_pid}" >/dev/null 2>&1 || break
    sleep 1
  done
  BUNDLE_SERVER_PID=""
}
trap stop_bundle_server EXIT

# Print one IPv4 address of the control host (the machine running this script).
# Tries `hostname -I` first (first address listed), then the source address the
# kernel would use to reach 1.1.1.1 according to `ip route get`.
# Calls die when neither method yields an address.
# Each probe pipeline ends in `|| true`: with `set -o pipefail` a failing
# `hostname -I` / `ip` would otherwise make the assignment fail and, under
# `set -e`, abort before the next method (or the die message) is reached.
control_host_ip() {
  local ip_addr
  if command -v hostname >/dev/null 2>&1; then
    ip_addr="$(hostname -I 2>/dev/null | awk '{print $1}' || true)"
    if [[ -n "${ip_addr}" ]]; then
      printf '%s\n' "${ip_addr}"
      return 0
    fi
  fi
  if command -v ip >/dev/null 2>&1; then
    ip_addr="$(ip -4 route get 1.1.1.1 2>/dev/null | awk '{for (i=1; i<=NF; i++) if ($i=="src") {print $(i+1); exit}}' || true)"
    if [[ -n "${ip_addr}" ]]; then
      printf '%s\n' "${ip_addr}"
      return 0
    fi
  fi
  die "Unable to determine control host IP; install hostname or iproute2."
}

# Print the entries of BOOTSTRAP_IMAGES_FILE with comment lines (first
# non-blank character is '#') and blank lines removed. Calls die when the file
# does not exist.
bootstrap_images_text() {
  if [[ ! -f "${BOOTSTRAP_IMAGES_FILE}" ]]; then
    die "Bootstrap image list not found at ${BOOTSTRAP_IMAGES_FILE}"
  fi
  sed -e '/^[[:space:]]*#/d' -e '/^[[:space:]]*$/d' "${BOOTSTRAP_IMAGES_FILE}"
}

# Print the entries of LONGHORN_UNLOCK_IMAGES_FILE with comment lines (first
# non-blank character is '#') and blank lines removed. Calls die when the file
# does not exist.
longhorn_unlock_images_text() {
  if [[ ! -f "${LONGHORN_UNLOCK_IMAGES_FILE}" ]]; then
    die "Longhorn unlock image list not found at ${LONGHORN_UNLOCK_IMAGES_FILE}"
  fi
  sed -e '/^[[:space:]]*#/d' -e '/^[[:space:]]*$/d' "${LONGHORN_UNLOCK_IMAGES_FILE}"
}

# Map a Kubernetes node name to the host name used for SSH/SCP.
# titan-23 is addressed as "oceanus"; every other node uses its own name.
ssh_host_for_node() {
  local node="$1"
  local ssh_host="${node}"
  if [[ "${node}" == "titan-23" ]]; then
    ssh_host="oceanus"
  fi
  printf '%s\n' "${ssh_host}"
}

# Print the SSH/SCP options used by the recovery paths, one token per line, so
# callers can load them into an array with mapfile.
# The known-hosts file comes from LONGHORN_UNLOCK_SSH_KNOWN_HOSTS.
ssh_recovery_opts() {
  local -a recovery_opts=(
    -o BatchMode=yes
    -o ConnectTimeout=10
    -o StrictHostKeyChecking=accept-new
    -o "UserKnownHostsFile=${LONGHORN_UNLOCK_SSH_KNOWN_HOSTS}"
  )
  printf '%s\n' "${recovery_opts[@]}"
}

# Stage an image bundle on a node over SCP and import it into k3s containerd.
# Arguments: $1 node name, $2 local bundle file (zstd-compressed),
#            $3 newline-separated image references contained in the bundle.
# The bundle is copied to /tmp/<basename> on the node's SSH host. The import
# itself is executed through run_hostroot_pod_script, not over SSH: the
# generated script removes the listed images, imports the bundle for
# linux/BOOTSTRAP_BUNDLE_ARCH, re-tags digest references back to the listed
# names where found, and verifies every listed image is present.
# Calls die when the bundle file is missing; returns 1 (after a warning) when
# the hostroot import fails. Dry-run only logs.
run_ssh_longhorn_bundle_import() {
  local node="$1"
  local bundle_file="$2"
  local images_text="$3"
  local ssh_target remote_bundle import_script
  local -a scp_opts
  if [[ ! -f "${bundle_file}" ]]; then
    die "Longhorn unlock bundle not found at ${bundle_file}"
  fi
  ssh_target="$(ssh_host_for_node "${node}")"
  remote_bundle="/tmp/$(basename "${bundle_file}")"
  if [[ "${EXECUTE}" -eq 0 ]]; then
    log "DRY-RUN: scp ${bundle_file} to ${ssh_target}:${remote_bundle} and import into k3s containerd"
    return 0
  fi
  mapfile -t scp_opts < <(ssh_recovery_opts)
  log "ssh-image-seed-node=${node} host=${ssh_target} bundle=$(basename "${bundle_file}")"
  scp "${scp_opts[@]}" "${bundle_file}" "${ssh_target}:${remote_bundle}"
  import_script=$(cat <<SCRIPT
set -eu
bundle='${remote_bundle}'
if [ ! -s "\${bundle}" ]; then
  echo "bundle missing or empty: \${bundle}" >&2
  exit 1
fi
while IFS= read -r image; do
  [ -z "\${image}" ] && continue
  /usr/bin/timeout 60 /usr/local/bin/k3s crictl rmi "\${image}" >/dev/null 2>&1 || true
  /usr/bin/timeout 60 /usr/local/bin/k3s ctr -n k8s.io images rm "\${image}" >/dev/null 2>&1 || true
done <<'IMAGES'
${images_text}
IMAGES
/usr/bin/zstd -dc "\${bundle}" | /usr/bin/timeout 1800 /usr/local/bin/k3s ctr -n k8s.io images import --platform linux/${BOOTSTRAP_BUNDLE_ARCH} -
while IFS= read -r image; do
  [ -z "\${image}" ] && continue
  repo="\${image%:*}"
  digest_ref="\$(/usr/bin/timeout 60 /usr/local/bin/k3s ctr -n k8s.io images ls -q | grep -E "^\${repo}@sha256:" | head -n 1 || true)"
  if [ -n "\${digest_ref}" ]; then
    /usr/bin/timeout 60 /usr/local/bin/k3s ctr -n k8s.io images tag --force "\${digest_ref}" "\${image}" >/dev/null 2>&1 || true
  fi
  /usr/bin/timeout 60 /usr/local/bin/k3s ctr -n k8s.io images ls -q | grep -Fx "\${image}" >/dev/null
done <<'IMAGES'
${images_text}
IMAGES
SCRIPT
)
  run_hostroot_pod_script "${node}" "longhorn-unlock-import-${node}" 1800 "${import_script}" && return 0
  warn "Hostroot import failed on ${node}; SSH staging succeeded but no sudo-capable remote import was attempted."
  return 1
}

# Print the unique, sorted names of nodes on which a longhorn-manager pod is
# scheduled, has a container waiting with ImagePullBackOff or ErrImagePull, and
# uses exactly LONGHORN_MANAGER_IMAGE as one of its container images.
longhorn_manager_image_pull_nodes() {
  local pull_failure_filter
  pull_failure_filter='.items[]
      | select(.spec.nodeName != null)
      | select([.status.containerStatuses[]?.state.waiting.reason]
          | map(select(. == "ImagePullBackOff" or . == "ErrImagePull")) | length > 0)
      | select([.spec.containers[]?.image] | index($image))
      | .spec.nodeName'
  kubectl -n longhorn-system get pods -l app=longhorn-manager -o json \
    | jq -r --arg image "${LONGHORN_MANAGER_IMAGE}" "${pull_failure_filter}" 2>/dev/null \
    | sort -u
}

# Re-seed LONGHORN_MANAGER_IMAGE into one node's k3s containerd from
# LONGHORN_MANAGER_CACHE_BUNDLE_FILE.
# Arguments: $1 node name.
# The bundle is copied to /tmp/<basename> on the node's SSH host via scp; the
# import script is then executed through run_hostroot_pod_script. It removes
# the cached image, imports the bundle for linux/BOOTSTRAP_BUNDLE_ARCH,
# re-tags a digest reference back to the image name when one is found, warns
# if CRI cannot inspect the image, and finally requires the image to be listed.
# Calls die when the bundle file is missing. Dry-run only logs.
repair_longhorn_manager_cache_node() {
  local node="$1"
  local ssh_target remote_bundle repair_script
  local -a scp_opts
  if [[ ! -f "${LONGHORN_MANAGER_CACHE_BUNDLE_FILE}" ]]; then
    die "Longhorn manager cache bundle missing at ${LONGHORN_MANAGER_CACHE_BUNDLE_FILE}."
  fi
  ssh_target="$(ssh_host_for_node "${node}")"
  remote_bundle="/tmp/$(basename "${LONGHORN_MANAGER_CACHE_BUNDLE_FILE}")"
  if [[ "${EXECUTE}" -eq 0 ]]; then
    log "DRY-RUN: repair ${LONGHORN_MANAGER_IMAGE} cache on ${node} using ${LONGHORN_MANAGER_CACHE_BUNDLE_FILE}"
    return 0
  fi
  mapfile -t scp_opts < <(ssh_recovery_opts)
  log "longhorn-manager-cache-repair-node=${node} host=${ssh_target}"
  scp "${scp_opts[@]}" "${LONGHORN_MANAGER_CACHE_BUNDLE_FILE}" "${ssh_target}:${remote_bundle}"
  repair_script=$(cat <<SCRIPT
set -eu
image='${LONGHORN_MANAGER_IMAGE}'
bundle='${remote_bundle}'
if [ ! -s "\${bundle}" ]; then
  echo "manager cache bundle missing or empty: \${bundle}" >&2
  exit 1
fi
/usr/bin/timeout 60 /usr/local/bin/k3s crictl rmi "\${image}" >/dev/null 2>&1 || true
/usr/bin/timeout 60 /usr/local/bin/k3s ctr -n k8s.io images rm "\${image}" >/dev/null 2>&1 || true
/usr/bin/timeout 600 /usr/local/bin/k3s ctr -n k8s.io images import --platform linux/${BOOTSTRAP_BUNDLE_ARCH} "\${bundle}"
repo="\${image%:*}"
digest_ref="\$(/usr/bin/timeout 60 /usr/local/bin/k3s ctr -n k8s.io images ls -q | grep -E "^\${repo}@sha256:" | head -n 1 || true)"
if [ -n "\${digest_ref}" ]; then
  /usr/bin/timeout 60 /usr/local/bin/k3s ctr -n k8s.io images tag --force "\${digest_ref}" "\${image}" >/dev/null 2>&1 || true
fi
if ! /usr/bin/timeout 60 /usr/local/bin/k3s crictl inspecti "\${image}" >/dev/null 2>&1; then
  echo "warning: CRI inspect did not see \${image}; kubelet will be verified by pod state" >&2
fi
/usr/bin/timeout 60 /usr/local/bin/k3s ctr -n k8s.io images ls -q | grep -Fx "\${image}" >/dev/null
SCRIPT
)
  run_hostroot_pod_script "${node}" "longhorn-manager-cache-repair-${node}" 900 "${repair_script}"
}

# Repair the longhorn-manager image cache on every node reported by
# longhorn_manager_image_pull_nodes.
# Returns 0 when no node needs repair, 1 when the cache bundle file is absent,
# otherwise the exit status of the last per-node repair that failed (0 if all
# succeeded). A failure on one node does not stop the remaining nodes.
repair_longhorn_manager_cache_deadlock() {
  local affected_nodes affected_node last_failure=0
  affected_nodes="$(longhorn_manager_image_pull_nodes || true)"
  if [[ -z "${affected_nodes}" ]]; then
    log "longhorn-manager-cache-repair=not-needed"
    return 0
  fi
  if [[ ! -f "${LONGHORN_MANAGER_CACHE_BUNDLE_FILE}" ]]; then
    warn "Longhorn manager cache bundle not found at ${LONGHORN_MANAGER_CACHE_BUNDLE_FILE}; skipping surgical manager cache repair."
    return 1
  fi
  while IFS= read -r affected_node; do
    if [[ -n "${affected_node}" ]]; then
      repair_longhorn_manager_cache_node "${affected_node}" || last_failure=$?
    fi
  done <<< "${affected_nodes}"
  return "${last_failure}"
}

# Seed the Longhorn unlock bundle onto every Ready Longhorn node whose
# architecture matches BOOTSTRAP_BUNDLE_ARCH, one node at a time.
# Calls die when the bundle is missing, the image list is empty, or no node
# qualifies. Stops at the first node whose import fails and returns that
# import's exit status; returns 0 when every node succeeds.
seed_longhorn_unlock_images_ssh() {
  local images_text seed_nodes seed_node import_rc=0
  if [[ ! -f "${LONGHORN_UNLOCK_BUNDLE_FILE}" ]]; then
    die "Longhorn unlock bundle missing at ${LONGHORN_UNLOCK_BUNDLE_FILE}."
  fi
  images_text="$(longhorn_unlock_images_text)"
  if [[ -z "${images_text}" ]]; then
    die "No Longhorn unlock images listed in ${LONGHORN_UNLOCK_IMAGES_FILE}"
  fi
  seed_nodes="$(list_ready_longhorn_seed_nodes)"
  if [[ -z "${seed_nodes}" ]]; then
    die "No Ready Longhorn nodes match architecture ${BOOTSTRAP_BUNDLE_ARCH}."
  fi
  while IFS= read -r seed_node; do
    if [[ -z "${seed_node}" ]]; then
      continue
    fi
    run_ssh_longhorn_bundle_import "${seed_node}" "${LONGHORN_UNLOCK_BUNDLE_FILE}" "${images_text}" || {
      import_rc=$?
      warn "SSH image import failed on ${seed_node}."
      break
    }
  done <<< "${seed_nodes}"
  return "${import_rc}"
}

# Print the names of nodes labelled longhorn-host=true that are Ready and whose
# kubernetes.io/arch label equals BOOTSTRAP_BUNDLE_ARCH, one per line.
list_ready_longhorn_seed_nodes() {
  local columns='custom-columns=NAME:.metadata.name,READY:.status.conditions[?(@.type=="Ready")].status,ARCH:.metadata.labels.kubernetes\.io/arch'
  kubectl get nodes -l longhorn-host=true -o "${columns}" --no-headers 2>/dev/null \
    | awk -v arch="${BOOTSTRAP_BUNDLE_ARCH}" '{ if ($2 == "True" && $3 == arch) print $1 }'
}

# Print the de-duplicated, sorted set of nodes that should receive the
# bootstrap image bundle: the Ready Longhorn seed nodes plus HARBOR_TARGET_NODE
# when it is set and Ready. A failure to list Longhorn nodes is tolerated.
list_bootstrap_seed_nodes() {
  local seed_nodes
  seed_nodes="$(list_ready_longhorn_seed_nodes || true)"
  if [[ -n "${HARBOR_TARGET_NODE}" ]] && node_is_ready "${HARBOR_TARGET_NODE}"; then
    seed_nodes+=$'\n'"${HARBOR_TARGET_NODE}"
  fi
  printf '%s\n' "${seed_nodes}" \
    | sed -e '/^[[:space:]]*$/d' \
    | sort -u
}

# Seed the bootstrap image bundle onto every node from list_bootstrap_seed_nodes.
# For each node the hostroot pod import is tried first; if it is unavailable or
# fails, a dedicated helper pod downloads the bundle from the temporary HTTP
# server on the control host and imports it through nsenter.
# Calls die when the bundle, the image list or the node list is missing/empty.
# Stops at the first node whose helper-pod fallback fails and returns that
# status (after stopping the bundle server). On full success it sets
# BOOTSTRAP_IMAGES_SEEDED=1 and records the startup_bootstrap_images_seeded
# checkpoint.
seed_bootstrap_images() {
  local images_text control_ip bundle_name fallback_script fallback_rc=0 seed_node seed_nodes
  if [[ ! -f "${HARBOR_BUNDLE_FILE}" ]]; then
    die "Bootstrap bundle not found at ${HARBOR_BUNDLE_FILE}"
  fi
  ensure_harbor_target_node
  ensure_harbor_host_label
  images_text="$(bootstrap_images_text)"
  if [[ -z "${images_text}" ]]; then
    die "No bootstrap images listed in ${BOOTSTRAP_IMAGES_FILE}"
  fi
  seed_nodes="$(list_bootstrap_seed_nodes)"
  if [[ -z "${seed_nodes}" ]]; then
    die "No Ready Longhorn or Harbor bootstrap nodes available for image seed."
  fi
  bundle_name="$(basename "${HARBOR_BUNDLE_FILE}")"
  start_bundle_server
  control_ip="$(control_host_ip)"
  # REFRESH_BOOTSTRAP_IMAGE_ALIASES is expanded now, so the generated script
  # carries a constant comparison rather than a runtime lookup.
  fallback_script=$(cat <<SCRIPT
set -euo pipefail
if [[ "${REFRESH_BOOTSTRAP_IMAGE_ALIASES}" == "1" ]]; then
  while IFS= read -r image; do
    [[ -z "\${image}" ]] && continue
    nsenter --target 1 --mount --uts --ipc --net --pid /usr/local/bin/k3s ctr images rm "\${image}" >/dev/null 2>&1 || true
  done <<'IMAGES'
${images_text}
IMAGES
fi
curl -fsSL "http://${control_ip}:${BUNDLE_HTTP_PORT}/${bundle_name}" \
  | zstd -dc \
  | nsenter --target 1 --mount --uts --ipc --net --pid /usr/local/bin/k3s ctr images import -
while IFS= read -r image; do
  [[ -z "\${image}" ]] && continue
  nsenter --target 1 --mount --uts --ipc --net --pid /usr/local/bin/k3s ctr images ls | awk '{print \$1}' | grep -Fx "\${image}" >/dev/null
done <<'IMAGES'
${images_text}
IMAGES
SCRIPT
)
  while IFS= read -r seed_node; do
    if [[ -z "${seed_node}" ]]; then
      continue
    fi
    log "bootstrap-image-seed-node=${seed_node}"
    run_hostroot_pod_bundle_import "${seed_node}" 1800 "${images_text}" && continue
    warn "Hostroot seed pod unavailable or failed on ${seed_node}; falling back to dedicated helper pod."
    run_helper_pod "${seed_node}" "bootstrap-seed-${seed_node}" 1800 "${fallback_script}" || {
      fallback_rc=$?
      break
    }
  done <<< "${seed_nodes}"
  stop_bundle_server
  if [[ "${fallback_rc}" -ne 0 ]]; then
    return "${fallback_rc}"
  fi
  BOOTSTRAP_IMAGES_SEEDED=1
  mark_checkpoint startup_bootstrap_images_seeded
}

# Seed bootstrap images unless it is unnecessary or disabled.
# Skips (returning 0) when images were already seeded in this run, when Harbor
# reports ready, or when SKIP_HARBOR_SEED is non-zero - checked in that order.
# Otherwise prewarms the node helper image (unless SKIP_HELPER_PREWARM is set)
# and runs seed_bootstrap_images.
seed_bootstrap_images_if_needed() {
  if [[ "${BOOTSTRAP_IMAGES_SEEDED}" -eq 1 ]]; then
    log "Bootstrap images already seeded during this run."
    return 0
  elif harbor_is_ready; then
    log "Harbor registry API is healthy; skipping bootstrap image seed."
    return 0
  elif [[ "${SKIP_HARBOR_SEED}" -ne 0 ]]; then
    warn "Skipping bootstrap image seed/import by request."
    return 0
  fi
  if [[ "${SKIP_HELPER_PREWARM}" -eq 0 ]]; then
    prewarm_node_helper_image
  fi
  seed_bootstrap_images
}

# Alias for seed_bootstrap_images; takes no arguments and returns its status.
seed_harbor_images() {
  seed_bootstrap_images
}

# Apply the minimal set of kustomizations for a local bootstrap, in dependency
# order: core infrastructure first, then Vault, Postgres and Gitea.
bootstrap_local_minimal() {
  local kustomization_path
  local -a bootstrap_order=(
    infrastructure/core
    infrastructure/sources/helm
    infrastructure/longhorn/core
    infrastructure/metallb
    infrastructure/traefik
    infrastructure/vault-csi
    infrastructure/vault-injector
    services/vault
    infrastructure/postgres
    services/gitea
  )
  for kustomization_path in "${bootstrap_order[@]}"; do
    apply_kustomization "${kustomization_path}"
  done
}

# Apply the Harbor kustomization (services/harbor) as part of local bootstrap.
bootstrap_local_harbor() {
  apply_kustomization services/harbor
}

# Reconcile one Flux kustomization, retrying once after a self-heal attempt
# when the failure output points at an immutable Job/template.
# Arguments: $1 kustomization name (in the flux-system namespace).
# Dry-run delegates to `run` and returns 0. In execute mode the flux output is
# captured: printed to stdout on success and to stderr on failure. Only the
# first failure can trigger the heal-and-retry path; any other failure returns
# flux's exit status.
reconcile_kustomization_with_self_heal() {
  local item="$1"
  local -a reconcile_cmd=(
    flux reconcile kustomization "${item}" -n flux-system --with-source --timeout=15m
  )
  if [[ "${EXECUTE}" -eq 0 ]]; then
    run "${reconcile_cmd[@]}"
    return 0
  fi
  local attempt=1 output rc
  while (( attempt <= 2 )); do
    # errexit is suspended so a failing reconcile can be inspected.
    set +e
    output="$("${reconcile_cmd[@]}" 2>&1)"
    rc=$?
    set -e
    if (( rc != 0 )); then
      if [[ -n "${output}" ]]; then
        printf '%s\n' "${output}" >&2
      fi
      if (( attempt > 1 )) || ! grep -Eqi 'immutable|field is immutable|cannot patch.*Job|Job.*invalid' <<< "${output}"; then
        return "${rc}"
      fi
      warn "Flux reconcile for '${item}' failed due immutable Job/template signal. Attempting self-heal."
      heal_failed_flux_jobs || true
      trigger_flux_reconcile_all || true
      sleep 5
      attempt=$((attempt + 1))
      continue
    fi
    if [[ -n "${output}" ]]; then
      printf '%s\n' "${output}"
    fi
    return 0
  done
}

# Reconcile a named stage made of one or more Flux kustomizations.
# Arguments: $1 stage name (used for the checkpoint), $2.. kustomization names.
# With the flux CLI available, each kustomization is reconciled in order and a
# "reconciled_<stage>" checkpoint is recorded. Without it, every Kustomization
# in flux-system is annotated with a fresh reconcile request instead, and no
# checkpoint is recorded.
reconcile_stage() {
  local stage_name="$1"
  shift
  local requested_at kustomization
  if command -v flux >/dev/null 2>&1; then
    for kustomization in "$@"; do
      reconcile_kustomization_with_self_heal "${kustomization}"
    done
    mark_checkpoint "reconciled_${stage_name}"
    return
  fi
  requested_at="$(date --iso-8601=seconds)"
  run kubectl -n flux-system annotate kustomizations.kustomize.toolkit.fluxcd.io --all \
    reconcile.fluxcd.io/requestedAt="${requested_at}" --overwrite
  return 0
}

# Un-suspend Flux and reconcile the cluster in three stages - core, stateful,
# registry - running the matching health checks after each stage.
# The Git source is reconciled first when the flux CLI is available.
resume_flux_and_reconcile() {
  local check
  patch_flux_suspend_all false
  if command -v flux >/dev/null 2>&1; then
    run flux reconcile source git flux-system -n flux-system --timeout=3m
  fi

  reconcile_stage core core helm longhorn metallb traefik vault-csi vault-injector
  for check in check_ingress_stack check_longhorn_stack; do
    "${check}"
  done

  reconcile_stage stateful vault postgres gitea
  for check in check_vault_stack check_postgres_stack check_gitea_stack; do
    "${check}"
  done

  reconcile_stage registry harbor
  for check in check_harbor_stack check_harbor_endpoint run_harbor_pull_canary; do
    "${check}"
  done
}

# Print a key=value status report on stdout covering: recovery configuration
# and state, UPS battery, Flux source (expected vs. actual URL/branch, drift
# flags, readiness), Harbor target/canary nodes, mail startup safeguards, the
# Harbor registry HTTP status code, and the presence of the core workloads.
# Every probe is best effort: failures are reported as "unknown", "none" or
# "false" rather than aborting the report.
status_report() {
  local battery flux_ready flux_url flux_branch harbor_code workers ingress_hosts_count
  local flux_url_drift=false
  local flux_branch_drift=false
  local mail_safeguards_ok=false
  local effective_target effective_canary labeled_nodes
  local bundle_present=false images_file_present=false snapshot_present=false

  battery="$(read_ups_battery || true)"
  flux_ready="$(kubectl -n flux-system get gitrepository flux-system -o jsonpath='{.status.conditions[?(@.type=="Ready")].status}' 2>/dev/null || true)"
  flux_url="$(kubectl -n flux-system get gitrepository flux-system -o jsonpath='{.spec.url}' 2>/dev/null || true)"
  flux_branch="$(kubectl -n flux-system get gitrepository flux-system -o jsonpath='{.spec.ref.branch}' 2>/dev/null || true)"
  # Drift is only reported when both the expected and the actual value are known.
  if [[ -n "${EXPECTED_FLUX_URL}" && -n "${flux_url}" ]]; then
    [[ "${flux_url}" == "${EXPECTED_FLUX_URL}" ]] || flux_url_drift=true
  fi
  if [[ -n "${EXPECTED_FLUX_BRANCH}" && -n "${flux_branch}" ]]; then
    [[ "${flux_branch}" == "${EXPECTED_FLUX_BRANCH}" ]] || flux_branch_drift=true
  fi
  ingress_hosts_count="$(list_ingress_hosts | sed '/^[[:space:]]*$/d' | wc -l | tr -d ' ')"
  harbor_code="$(curl -ksS -o /dev/null -w '%{http_code}' https://registry.bstein.dev/v2/ || true)"
  workers="$(discover_workers_csv 2>/dev/null || true)"

  # Fall back to a Ready arm64 worker when the configured Harbor target is not
  # Ready, and to that target when the configured canary node is not Ready.
  effective_target="${HARBOR_TARGET_NODE}"
  node_is_ready "${effective_target}" || effective_target="$(select_ready_arm64_worker || true)"
  effective_canary="${HARBOR_CANARY_NODE}"
  node_is_ready "${effective_canary}" || effective_canary="${effective_target}"

  [[ -f "${HARBOR_BUNDLE_FILE}" ]] && bundle_present=true
  [[ -f "${BOOTSTRAP_IMAGES_FILE}" ]] && images_file_present=true
  [[ -f "${REPLICA_SNAPSHOT_FILE}" ]] && snapshot_present=true

  printf '%s\n' \
    "mode=status" \
    "shutdown_mode=${SHUTDOWN_MODE}" \
    "bundle_file=${HARBOR_BUNDLE_FILE}" \
    "bundle_present=${bundle_present}" \
    "bootstrap_images_file=${BOOTSTRAP_IMAGES_FILE}" \
    "bootstrap_images_file_present=${images_file_present}" \
    "bootstrap_bundle_arch=${BOOTSTRAP_BUNDLE_ARCH}" \
    "replica_snapshot_file=${REPLICA_SNAPSHOT_FILE}" \
    "replica_snapshot_present=${snapshot_present}" \
    "node_helper_image=${NODE_HELPER_IMAGE}" \
    "harbor_target_node=${effective_target:-unknown}" \
    "harbor_canary_node=${effective_canary:-unknown}"

  labeled_nodes="$(kubectl get nodes -l "${HARBOR_HOST_LABEL_KEY}=true" -o jsonpath='{range .items[*]}{.metadata.name}{","}{end}' 2>/dev/null || true)"
  labeled_nodes="${labeled_nodes%,}"

  printf '%s\n' \
    "harbor_host_label_key=${HARBOR_HOST_LABEL_KEY}" \
    "harbor_host_label_nodes=${labeled_nodes:-none}" \
    "workers=${workers}" \
    "recovery_pending=${RECOVERY_PENDING}" \
    "startup_attempted=${STARTUP_ATTEMPTED_DURING_OUTAGE}" \
    "last_checkpoint=${LAST_CHECKPOINT}" \
    "ups_host=${UPS_HOST_IN_USE:-${UPS_HOST}}" \
    "ups_battery=${battery:-unknown}" \
    "flux_source_expected_url=${EXPECTED_FLUX_URL}" \
    "flux_source_expected_branch=${EXPECTED_FLUX_BRANCH}" \
    "flux_source_actual_url=${flux_url:-unknown}" \
    "flux_source_actual_branch=${flux_branch:-unknown}" \
    "flux_source_url_drift=${flux_url_drift}" \
    "flux_source_branch_drift=${flux_branch_drift}" \
    "flux_source_ready=${flux_ready:-unknown}" \
    "ingress_hosts_count=${ingress_hosts_count}"

  if check_mail_safeguards_once 1; then
    mail_safeguards_ok=true
  fi

  printf '%s\n' \
    "mail_startup_safeguards_required=${STARTUP_REQUIRE_MAIL_SAFEGUARDS}" \
    "mail_startup_safeguards_ok=${mail_safeguards_ok}" \
    "mail_startup_host=${MAIL_STARTUP_HOST}" \
    "mail_startup_ports=${MAIL_STARTUP_TCP_PORTS}" \
    "harbor_http=${harbor_code:-unknown}"

  if kubectl get ingressclass traefik >/dev/null 2>&1; then
    printf '%s\n' "traefik_ingressclass=true"
  else
    printf '%s\n' "traefik_ingressclass=false"
  fi
  if kubectl -n traefik get deploy traefik >/dev/null 2>&1; then
    printf '%s\n' "traefik_deploy=true"
  else
    printf '%s\n' "traefik_deploy=false"
  fi
  if kubectl -n longhorn-system get ds longhorn-manager >/dev/null 2>&1; then
    printf '%s\n' "longhorn_manager=true"
  else
    printf '%s\n' "longhorn_manager=false"
  fi
  if kubectl -n vault get sts vault >/dev/null 2>&1; then
    printf '%s\n' "vault_statefulset=true"
  else
    printf '%s\n' "vault_statefulset=false"
  fi
  if kubectl -n postgres get sts postgres >/dev/null 2>&1; then
    printf '%s\n' "postgres_statefulset=true"
  else
    printf '%s\n' "postgres_statefulset=false"
  fi
  if kubectl -n gitea get deploy gitea >/dev/null 2>&1; then
    printf '%s\n' "gitea_deploy=true"
  else
    printf '%s\n' "gitea_deploy=false"
  fi
  if kubectl -n harbor get deploy harbor-core >/dev/null 2>&1; then
    printf '%s\n' "harbor_deploy=true"
  else
    printf '%s\n' "harbor_deploy=false"
  fi
}

# Orchestrate an orderly cluster shutdown.
# Sequence: record recovery state -> (optional) prewarm the node helper and
# keep its DaemonSet for exec -> (optional) etcd snapshot on the first control
# plane node -> snapshot workload replicas -> suspend Flux and scale apps down
# -> (optional) drain workers -> schedule service stop (and, unless
# SHUTDOWN_MODE is cluster-only, host poweroff) on workers, then control planes.
# Globals written: WORKER_NODES, CONTROL_PLANE_NODES, RECOVERY_PENDING,
#                  STARTUP_ATTEMPTED_DURING_OUTAGE, KEEP_PREWARM_DAEMONSET.
# Honors SKIP_HELPER_PREWARM, SKIP_ETCD_SNAPSHOT, SKIP_DRAIN and
# REQUIRE_NONEMPTY_REPLICA_SNAPSHOT.
planned_shutdown() {
  local workers_csv
  workers_csv="$(discover_workers_csv 2>/dev/null || true)"
  as_array_from_csv "${workers_csv}" WORKER_NODES
  # The control-plane node names are fixed here rather than discovered.
  as_array_from_csv "titan-0a,titan-0b,titan-0c" CONTROL_PLANE_NODES

  RECOVERY_PENDING=1
  STARTUP_ATTEMPTED_DURING_OUTAGE=0
  save_recovery_state 1 0 shutdown_started

  if [[ "${SKIP_HELPER_PREWARM}" -eq 0 ]]; then
    # Keep the prewarm DaemonSet alive so its pods can be used for host exec
    # when the shutdown commands are scheduled below.
    KEEP_PREWARM_DAEMONSET=1
    prewarm_node_helper_image
    mark_checkpoint shutdown_helper_prewarmed
  fi

  if [[ "${SKIP_ETCD_SNAPSHOT}" -eq 0 ]]; then
    local ts
    ts="$(date +%Y%m%d-%H%M%S)"
    run_host_command_via_helper "${CONTROL_PLANE_NODES[0]}" "etcd-snapshot" 300 "/usr/local/bin/k3s etcd-snapshot save --name pre-shutdown-${ts}"
    mark_checkpoint shutdown_snapshot_complete
  else
    warn "Skipping etcd snapshot by request."
  fi

  save_workload_replica_snapshot
  if [[ "${REQUIRE_NONEMPTY_REPLICA_SNAPSHOT}" == "1" || "${REQUIRE_NONEMPTY_REPLICA_SNAPSHOT}" == "true" ]]; then
    local replica_count
    replica_count="$(replica_snapshot_count)"
    # Treat anything that is not a plain non-negative integer as zero.
    if [[ ! "${replica_count}" =~ ^[0-9]+$ ]]; then
      replica_count=0
    fi
    if (( replica_count == 0 )); then
      die "Replica snapshot is empty at ${REPLICA_SNAPSHOT_FILE}; refusing shutdown to avoid startup restore deadlock."
    fi
  fi
  mark_checkpoint shutdown_replicas_snapshot

  patch_flux_suspend_all true
  best_effort_scale_down_apps
  mark_checkpoint shutdown_apps_scaled_down

  if [[ "${SKIP_DRAIN}" -eq 0 ]]; then
    best_effort_drain_workers "${DRAIN_TIMEOUT_SECONDS}" "${WORKER_NODES[@]}"
    mark_checkpoint shutdown_workers_drained
  else
    warn "Skipping worker drain by request."
  fi

  local node
  if [[ "${SHUTDOWN_MODE}" == "cluster-only" ]]; then
    warn "shutdown-mode=cluster-only: stopping k3s services only; host poweroff is disabled."
  else
    log "shutdown-mode=host-poweroff: scheduling host poweroff after service stop."
  fi

  # Workers are scheduled with a 20s delay, control planes with 45s.
  for node in "${WORKER_NODES[@]}"; do
    [[ -z "${node}" ]] && continue
    if [[ "${SHUTDOWN_MODE}" == "cluster-only" ]]; then
      schedule_host_service_stop_via_helper "${node}" k3s-agent 20
    else
      schedule_host_shutdown_via_helper "${node}" k3s-agent 20
    fi
  done
  mark_checkpoint shutdown_workers_scheduled

  for node in "${CONTROL_PLANE_NODES[@]}"; do
    [[ -z "${node}" ]] && continue
    if [[ "${SHUTDOWN_MODE}" == "cluster-only" ]]; then
      schedule_host_service_stop_via_helper "${node}" k3s 45
    else
      schedule_host_shutdown_via_helper "${node}" k3s 45
    fi
  done
  # The delayed actions were registered on the hosts via systemd-run above, so
  # the prewarm DaemonSet is removed here.
  if [[ "${SKIP_HELPER_PREWARM}" -eq 0 ]]; then
    cleanup_prewarm_daemonset
  fi
  mark_checkpoint shutdown_control_planes_scheduled
  if [[ "${SHUTDOWN_MODE}" == "cluster-only" ]]; then
    log "Cluster-only shutdown actions scheduled (hosts remain powered on)."
  else
    log "Shutdown + host poweroff actions scheduled on hosts."
  fi
}

# Emergency path used when startup cannot proceed after an outage.
# Does a best-effort Flux suspend, app scale-down and worker drain (with the
# shorter EMERGENCY_DRAIN_TIMEOUT_SECONDS) whose failures are ignored, then
# hands over to planned_shutdown for the regular shutdown sequence.
emergency_shutdown_after_outage() {
  local discovered_workers
  warn "Entering outage-aware emergency shutdown path due insufficient startup budget."
  patch_flux_suspend_all true || true
  best_effort_scale_down_apps || true
  discovered_workers="$(discover_workers_csv 2>/dev/null || true)"
  as_array_from_csv "${discovered_workers}" WORKER_NODES
  best_effort_drain_workers "${EMERGENCY_DRAIN_TIMEOUT_SECONDS}" "${WORKER_NODES[@]}" || true
  planned_shutdown
}

# Escape a value for embedding inside a double-quoted JSON string literal.
# Only backslashes and double quotes are escaped (backslashes first, so the
# escapes added for quotes are not doubled again).
_startup_json_escape() {
  local value="$1"
  value=${value//\\/\\\\}
  value=${value//\"/\\\"}
  printf '%s' "${value}"
}

# Full cluster startup / recovery sequence.
# 1. If a recovery is pending, require the minimum UPS battery level: on the
#    first low-battery attempt the startup is deferred (exit 1); on a repeated
#    one the emergency shutdown path is taken (exit 1).
# 2. Wait for the Kubernetes API, label the Harbor host node.
# 3. Optionally force the Flux GitRepository URL/branch (breakglass), then
#    assert the Flux source matches expectations and seed bootstrap images.
# 4. Unless SKIP_LOCAL_BOOTSTRAP is set, and only when the Flux source is not
#    Ready, apply the local bootstrap fallback (plus Harbor unless
#    SKIP_HARBOR_BOOTSTRAP is set or Harbor is already healthy).
# 5. Resume Flux, reconcile, restore workloads, wait for readiness/stability,
#    optionally prewarm the node helper, and clear the recovery state.
# FORCE_FLUX_URL / FORCE_FLUX_BRANCH are JSON-escaped before being placed in
# the merge patch so quotes or backslashes cannot break or alter the payload.
startup_flow() {
  local flux_url_json flux_branch_json
  if [[ "${RECOVERY_PENDING}" -eq 1 ]]; then
    if ! ensure_minimum_battery_for_bootstrap; then
      if [[ "${STARTUP_ATTEMPTED_DURING_OUTAGE}" -eq 1 ]]; then
        emergency_shutdown_after_outage
        exit 1
      fi
      warn "Startup deferred due low battery after recent outage; marking for second-outage fallback."
      save_recovery_state 1 1 deferred_low_battery
      exit 1
    fi
    STARTUP_ATTEMPTED_DURING_OUTAGE=1
    save_recovery_state 1 1 waiting_for_api
  fi

  if ! wait_for_api; then
    die "Kubernetes API did not become reachable in time."
  fi
  mark_checkpoint startup_api_ready

  ensure_harbor_target_node
  ensure_harbor_host_label
  mark_checkpoint startup_harbor_host_labeled

  if [[ -n "${FORCE_FLUX_URL}" ]]; then
    warn "Breakglass: forcing Flux source URL to '${FORCE_FLUX_URL}'."
    flux_url_json="$(_startup_json_escape "${FORCE_FLUX_URL}")"
    run kubectl -n flux-system patch gitrepository flux-system --type=merge -p "{\"spec\":{\"url\":\"${flux_url_json}\"}}"
    mark_checkpoint startup_flux_url_forced
  fi

  if [[ -n "${FORCE_FLUX_BRANCH}" ]]; then
    flux_branch_json="$(_startup_json_escape "${FORCE_FLUX_BRANCH}")"
    run kubectl -n flux-system patch gitrepository flux-system --type=merge -p "{\"spec\":{\"ref\":{\"branch\":\"${flux_branch_json}\"}}}"
    mark_checkpoint startup_flux_branch_forced
  fi

  assert_flux_source_expected
  seed_bootstrap_images_if_needed

  if [[ "${SKIP_LOCAL_BOOTSTRAP}" -eq 0 ]]; then
    # The local fallback only runs when Flux cannot serve its own source yet.
    if ! kubectl -n flux-system get gitrepository flux-system -o jsonpath='{.status.conditions[?(@.type=="Ready")].status}' 2>/dev/null | grep -q True; then
      warn "Flux source not Ready; executing local bootstrap fallback path."
      bootstrap_local_minimal
      mark_checkpoint startup_local_bootstrap_complete
      check_ingress_stack
      check_longhorn_stack
      check_vault_stack
      check_postgres_stack
      check_gitea_stack

      if [[ "${SKIP_HARBOR_BOOTSTRAP}" -eq 0 ]]; then
        if harbor_is_ready; then
          log "Harbor already healthy; skipping Harbor seed/bootstrap."
        else
          seed_bootstrap_images_if_needed
          bootstrap_local_harbor
          mark_checkpoint startup_local_harbor_applied
          check_harbor_stack
          check_harbor_endpoint
        fi
      else
        warn "Skipping Harbor bootstrap fallback by request."
      fi
    fi
  else
    warn "Skipping local bootstrap fallback by request."
  fi

  resume_flux_and_reconcile
  wait_for_flux_kustomizations_ready
  restore_workload_replica_snapshot
  restore_zero_scaled_helm_workloads
  wait_for_startup_workloads_ready
  wait_for_startup_service_checklist
  wait_for_startup_stability_window
  if [[ "${SKIP_HELPER_PREWARM}" -eq 0 ]]; then
    prewarm_node_helper_image
    mark_checkpoint startup_helper_prewarmed
  fi
  clear_recovery_state
  log "Startup flow complete."
}

# "prepare" mode: verify the bootstrap bundle and image list exist, make sure
# the Harbor host node is selected and labelled, and prewarm the node helper
# image unless SKIP_HELPER_PREWARM is set. Calls die when a file is missing.
prepare_flow() {
  if [[ ! -f "${HARBOR_BUNDLE_FILE}" ]]; then
    die "Bootstrap bundle missing at ${HARBOR_BUNDLE_FILE}. Build and copy it to the canonical control host first."
  fi
  if [[ ! -f "${BOOTSTRAP_IMAGES_FILE}" ]]; then
    die "Bootstrap image list missing at ${BOOTSTRAP_IMAGES_FILE}."
  fi

  ensure_harbor_target_node
  ensure_harbor_host_label
  mark_checkpoint prepare_harbor_host_labeled

  if [[ "${SKIP_HELPER_PREWARM}" -eq 0 ]]; then
    prewarm_node_helper_image
    mark_checkpoint prepare_helper_prewarmed
  fi

  log "Prepare flow complete."
}

# Bootstrap seed mode: verify the bundle and image list exist, prewarm the node
# helper image unless SKIP_HELPER_PREWARM is set, seed the bootstrap images,
# then check the Harbor endpoint and run the pull canary.
# Calls die when a required file is missing.
harbor_seed_flow() {
  if [[ ! -f "${HARBOR_BUNDLE_FILE}" ]]; then
    die "Bootstrap bundle missing at ${HARBOR_BUNDLE_FILE}. Build and copy it to the canonical control host first."
  fi
  if [[ ! -f "${BOOTSTRAP_IMAGES_FILE}" ]]; then
    die "Bootstrap image list missing at ${BOOTSTRAP_IMAGES_FILE}."
  fi

  if [[ "${SKIP_HELPER_PREWARM}" -eq 0 ]]; then
    prewarm_node_helper_image
    mark_checkpoint harbor_seed_helper_prewarmed
  fi

  seed_bootstrap_images
  check_harbor_endpoint
  run_harbor_pull_canary
  log "Bootstrap seed flow complete."
}

# "longhorn-unlock" mode: break an image-pull deadlock around Longhorn by
# re-seeding images from local bundles, then restart the affected pods and
# bring dependent services back.
# Requires jq, the bootstrap bundle and the bootstrap image list; calls die if
# the Kubernetes API does not become reachable.
# Steps marked `|| warn` / `|| true` are best effort and do not abort the flow.
longhorn_unlock_flow() {
  require_cmd jq
  [[ -f "${HARBOR_BUNDLE_FILE}" ]] || die "Bootstrap bundle missing at ${HARBOR_BUNDLE_FILE}. Build and copy it to the canonical control host first."
  [[ -f "${BOOTSTRAP_IMAGES_FILE}" ]] || die "Bootstrap image list missing at ${BOOTSTRAP_IMAGES_FILE}."
  if ! wait_for_api; then
    die "Kubernetes API did not become reachable in time."
  fi

  warn "Longhorn unlock mode will not mutate Longhorn volumes, replicas, engines, disks, PVs, or PVCs."
  if ! harbor_endpoint_is_ready 1; then
    warn "Harbor registry API is unhealthy; using local bootstrap image cache path."
  fi

  # Force image aliases to be removed before any bundle import in this mode.
  REFRESH_BOOTSTRAP_IMAGE_ALIASES=1
  freeze_longhorn_deadlock_automation
  ensure_longhorn_cache_first_policy
  remove_longhorn_manager_prepull_sidecar_if_needed
  free_longhorn_instance_manager_headroom
  delete_failed_nonstorage_pods_for_headroom
  repair_longhorn_manager_cache_deadlock || warn "Surgical Longhorn manager cache repair did not complete on every affected node."
  if [[ "${SKIP_LONGHORN_UNLOCK_BUNDLE_SEED}" -eq 0 ]]; then
    seed_longhorn_unlock_images_ssh
  else
    warn "Skipping full Longhorn unlock bundle seed by operator request."
  fi
  restart_longhorn_image_pull_backoff_pods
  recover_stuck_terminating_node_runtime_pods_after_deadlock

  # Only wait on the real rollout in execute mode; a slow rollout is a warning,
  # followed by a fixed 30 second pause.
  if [[ "${EXECUTE}" -eq 1 ]]; then
    kubectl -n longhorn-system rollout status daemonset/longhorn-manager --timeout=5m || warn "longhorn-manager DaemonSet did not fully roll out yet."
    sleep 30
  fi
  wait_for_longhorn_control_endpoints || true
  restart_stale_critical_pods_after_longhorn_unlock
  restart_harbor_after_postgres_recovery || warn "Harbor did not fully recover after Postgres became ready."
  # The remaining restore steps run only once the Harbor endpoint reports ready.
  if harbor_endpoint_is_ready 1; then
    run_harbor_pull_canary || warn "Harbor pull canary failed after registry recovery."
    restore_recovered_worker_scheduling_after_deadlock
    restore_longhorn_unlock_optional_workloads
    delete_safe_stale_terminating_replicaset_pods_after_deadlock
    restart_image_pull_backoff_pods_after_harbor_recovery || true
    resume_deadlock_automation_after_core_recovery || true
  fi
  report_longhorn_unlock_status
  mark_checkpoint longhorn_unlock_complete
  log "Longhorn unlock flow complete."
}

# ---------------------------------------------------------------------------
# Script entry: load persisted recovery state, log the effective settings,
# report the Flux source state, then dispatch on MODE.
# ---------------------------------------------------------------------------
load_recovery_state

# Effective-configuration banner. Optional settings fall back to a readable
# placeholder ("auto" / "none") when unset or empty.
log "mode=${MODE} execute=${EXECUTE}"
log "shutdown-mode=${SHUTDOWN_MODE}"
log "recovery-state-file=${RECOVERY_STATE_FILE}"
log "bundle-file=${HARBOR_BUNDLE_FILE}"
log "bootstrap-images-file=${BOOTSTRAP_IMAGES_FILE}"
log "bootstrap-bundle-arch=${BOOTSTRAP_BUNDLE_ARCH}"
log "node-helper-image=${NODE_HELPER_IMAGE}"
log "harbor-target-node-config=${HARBOR_TARGET_NODE:-auto}"
log "harbor-canary-node-config=${HARBOR_CANARY_NODE:-auto}"
log "harbor-host-label-key=${HARBOR_HOST_LABEL_KEY}"
log "expected-flux-url=${EXPECTED_FLUX_URL}"
log "expected-flux-branch=${EXPECTED_FLUX_BRANCH}"
log "startup-optional-kustomizations=${STARTUP_OPTIONAL_KUSTOMIZATIONS:-none}"
report_flux_source_state

# Mode dispatch. The three *-seed names are aliases for the same seed flow.
case "${MODE}" in
  status)
    status_report
    ;;
  prepare)
    prepare_flow
    ;;
  bootstrap-seed|harbor-seed|longhorn-seed)
    harbor_seed_flow
    ;;
  longhorn-unlock)
    longhorn_unlock_flow
    ;;
  shutdown)
    planned_shutdown
    ;;
  startup)
    startup_flow
    ;;
  *)
    # Without this arm an unrecognized mode would fall through and exit 0
    # without doing anything, which looks like a successful run.
    die "Unsupported mode '${MODE}'. Expected one of: prepare, status, bootstrap-seed, harbor-seed, longhorn-seed, longhorn-unlock, shutdown, startup."
    ;;
esac