From d880fac673fe51451d9eb34e555c037258bda963 Mon Sep 17 00:00:00 2001
From: Brad Stein <Brad.Stein@gmail.com>
Date: Mon, 6 Apr 2026 04:47:05 -0300
Subject: [PATCH] hecate: harden titan-24 cleanup and ups telemetry

---
 scripts/bootstrap/recovery-config.env |   14 +
 scripts/cluster_power_recovery.sh     | 1045 ++++++++++++++++++-------
 2 files changed, 775 insertions(+), 284 deletions(-)
 create mode 100644 scripts/bootstrap/recovery-config.env
diff --git a/scripts/bootstrap/recovery-config.env b/scripts/bootstrap/recovery-config.env
new file mode 100644
index 00000000..c2f789d9
--- /dev/null
+++ b/scripts/bootstrap/recovery-config.env
@@ -0,0 +1,14 @@
+CANONICAL_CONTROL_HOST="titan-db"
+DEFAULT_FLUX_BRANCH="main"
+STATE_SUBDIR=".local/share/hecate"
+HARBOR_BUNDLE_BASENAME="harbor-bootstrap-v2.14.1-arm64.tar.zst"
+HARBOR_TARGET_NODE="titan-05"
+HARBOR_CANARY_NODE="titan-04"
+HARBOR_CANARY_IMAGE="registry.bstein.dev/bstein/kubectl:1.35.0"
+NODE_HELPER_IMAGE="registry.bstein.dev/bstein/hecate-node-helper:0.1.0"
+NODE_HELPER_NAMESPACE="maintenance"
+NODE_HELPER_SERVICE_ACCOUNT="default"
+REGISTRY_PULL_SECRET="harbor-regcred"
+BUNDLE_HTTP_PORT="8877"
+UPS_HOST="pyrphoros@localhost"
+UPS_BATTERY_KEY="battery.charge"
diff --git a/scripts/cluster_power_recovery.sh b/scripts/cluster_power_recovery.sh
index c6c806ff..9efafff1 100755
--- a/scripts/cluster_power_recovery.sh
+++ b/scripts/cluster_power_recovery.sh
@@ -1,27 +1,43 @@
 #!/usr/bin/env bash
 set -euo pipefail
 
+SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+REPO_DIR="${HECATE_REPO_DIR:-$(cd "${SCRIPT_DIR}/.." && pwd)}"
+BOOTSTRAP_DIR="${SCRIPT_DIR}/bootstrap"
+CONFIG_FILE="${BOOTSTRAP_DIR}/recovery-config.env"
+if [[ -f "${CONFIG_FILE}" ]]; then
+  # shellcheck disable=SC1090
+  source "${CONFIG_FILE}"
+fi
+if [[ -z "${KUBECONFIG:-}" && -f "${SCRIPT_DIR}/kubeconfig" ]]; then
+  export KUBECONFIG="${SCRIPT_DIR}/kubeconfig"
+fi
+
 usage() {
-  cat <<'USAGE'
+  cat <<USAGE
 Usage:
-  scripts/cluster_power_recovery.sh shutdown [options]
-  scripts/cluster_power_recovery.sh startup  [options]
+  scripts/cluster_power_recovery.sh <prepare|status|harbor-seed|shutdown|startup> [options]
 
 Options:
   --execute                     Actually run commands (default is dry-run)
-  --ssh-user <user>             SSH user for node commands (default: current SSH config user)
-  --control-planes <csv>        Control plane hosts (default: titan-0a,titan-0b,titan-0c)
-  --workers <csv>               Worker hosts (default: static atlas inventory, with API discovery when available)
-  --expected-flux-branch <name> Expected Flux source branch during startup checks (default: main)
-  --skip-etcd-snapshot          Skip etcd snapshot before shutdown
-  --skip-drain                  Skip worker drain during shutdown
+  --expected-flux-branch <name> Expected Flux source branch during startup checks (default: ${DEFAULT_FLUX_BRANCH:-main})
+  --force-flux-branch <name>    Startup: patch flux-system GitRepository branch to this value
+  --skip-etcd-snapshot          Shutdown: skip etcd snapshot before shutdown
+  --skip-drain                  Shutdown: skip worker drain during shutdown
   --skip-local-bootstrap        Startup: skip local bootstrap fallback applies
   --skip-harbor-bootstrap       Startup: skip Harbor recovery bootstrap stage
-  --force-flux-branch <name>    Startup: patch flux-system GitRepository branch to this value
+  --skip-harbor-seed            Startup: skip Harbor image seed/import stage
+  --skip-helper-prewarm         Prepare/Shutdown/Startup: skip node-helper prewarm
   --min-startup-battery <pct>   Minimum UPS percent required before bootstrap (default: 35)
   --ups-host <name>             UPS identifier for upsc (default: ups@localhost)
   --ups-battery-key <key>       UPS battery key for upsc (default: battery.charge)
-  --recovery-state-file <path>  Recovery state file for second-outage detection
+  --recovery-state-file <path>  Recovery state file for outage-aware restart logic
+  --harbor-bundle-file <path>   Harbor bootstrap bundle on the control host
+  --harbor-target-node <name>   Node that should host Harbor during bootstrap (default: ${HARBOR_TARGET_NODE:-titan-05})
+  --harbor-canary-image <image> Harbor-backed image used for pull canary (default: ${HARBOR_CANARY_IMAGE:-registry.bstein.dev/bstein/kubectl:1.35.0})
+  --node-helper-image <image>   Privileged helper image used for host operations (default: ${NODE_HELPER_IMAGE:-registry.bstein.dev/bstein/hecate-node-helper:0.1.0})
+  --bundle-http-port <port>     Temporary HTTP port used to serve bootstrap bundles (default: ${BUNDLE_HTTP_PORT:-8877})
+  --api-wait-timeout <seconds>  Startup: Kubernetes API wait timeout (default: 600)
   --drain-timeout <seconds>     Worker drain timeout for normal shutdown (default: 180)
   --emergency-drain-timeout <seconds>
                                Worker drain timeout for emergency fallback (default: 45)
@@ -29,6 +45,9 @@ Options:
   -h, --help                    Show help
 
 Examples:
+  scripts/cluster_power_recovery.sh prepare --execute
+  scripts/cluster_power_recovery.sh harbor-seed --execute
+  scripts/cluster_power_recovery.sh status
   scripts/cluster_power_recovery.sh shutdown --execute
   scripts/cluster_power_recovery.sh startup --execute --force-flux-branch main
 USAGE
@@ -41,38 +60,48 @@ if [[ -z "${MODE}" || "${MODE}" == "-h" || "${MODE}" == "--help" ]]; then
 fi
 shift || true
 
-SCRIPT_DIR="$(cd -- "$(dirname -- "${BASH_SOURCE[0]}")" && pwd)"
-if [[ -z "${KUBECONFIG:-}" && -f "${SCRIPT_DIR}/kubeconfig" ]]; then
-  export KUBECONFIG="${SCRIPT_DIR}/kubeconfig"
-fi
-
-if [[ "${MODE}" != "shutdown" && "${MODE}" != "startup" ]]; then
-  echo "Unknown mode: ${MODE}" >&2
-  usage
-  exit 1
-fi
+case "${MODE}" in
+  prepare|status|harbor-seed|shutdown|startup) ;;
+  *)
+    echo "Unknown mode: ${MODE}" >&2
+    usage
+    exit 1
+    ;;
+esac
 
 EXECUTE=0
-SSH_USER=""
-CONTROL_PLANES="titan-0a,titan-0b,titan-0c"
-WORKERS=""
-DEFAULT_WORKERS="titan-04,titan-05,titan-06,titan-07,titan-08,titan-09,titan-10,titan-11,titan-12,titan-13,titan-14,titan-15,titan-17,titan-18,titan-19,titan-20,titan-21,titan-22,titan-24"
-EXPECTED_FLUX_BRANCH="main"
+EXPECTED_FLUX_BRANCH="${DEFAULT_FLUX_BRANCH:-main}"
+FORCE_FLUX_BRANCH=""
 SKIP_ETCD_SNAPSHOT=0
 SKIP_DRAIN=0
 SKIP_LOCAL_BOOTSTRAP=0
 SKIP_HARBOR_BOOTSTRAP=0
-FORCE_FLUX_BRANCH=""
-UPS_HOST="ups@localhost"
-UPS_BATTERY_KEY="battery.charge"
-RECOVERY_STATE_FILE="${HOME}/.local/state/cluster_power_recovery.state"
-MIN_STARTUP_BATTERY=35
+SKIP_HARBOR_SEED=0
+SKIP_HELPER_PREWARM=0
+UPS_HOST="${UPS_HOST:-ups@localhost}"
+UPS_BATTERY_KEY="${UPS_BATTERY_KEY:-battery.charge}"
+MIN_STARTUP_BATTERY="${MIN_STARTUP_BATTERY:-35}"
+REQUIRE_UPS_BATTERY="${REQUIRE_UPS_BATTERY:-0}"
 DRAIN_TIMEOUT_SECONDS=180
 EMERGENCY_DRAIN_TIMEOUT_SECONDS=45
-REQUIRE_UPS_BATTERY=0
+API_WAIT_TIMEOUT_SECONDS=600
+BUNDLE_HTTP_PORT="${BUNDLE_HTTP_PORT:-8877}"
+STATE_ROOT="${HOME}/${STATE_SUBDIR:-.local/share/hecate}"
+RECOVERY_STATE_FILE="${STATE_ROOT}/cluster_power_recovery.state"
+HARBOR_BUNDLE_FILE="${STATE_ROOT}/bundles/${HARBOR_BUNDLE_BASENAME:-harbor-bootstrap-v2.14.1-arm64.tar.zst}"
+HARBOR_TARGET_NODE="${HARBOR_TARGET_NODE:-titan-05}"
+HARBOR_CANARY_NODE="${HARBOR_CANARY_NODE:-titan-04}"
+HARBOR_CANARY_IMAGE="${HARBOR_CANARY_IMAGE:-registry.bstein.dev/bstein/kubectl:1.35.0}"
+NODE_HELPER_IMAGE="${NODE_HELPER_IMAGE:-registry.bstein.dev/bstein/hecate-node-helper:0.1.0}"
+NODE_HELPER_NAMESPACE="${NODE_HELPER_NAMESPACE:-maintenance}"
+NODE_HELPER_SERVICE_ACCOUNT="${NODE_HELPER_SERVICE_ACCOUNT:-default}"
+REGISTRY_PULL_SECRET="${REGISTRY_PULL_SECRET:-harbor-regcred}"
 
 RECOVERY_PENDING=0
 STARTUP_ATTEMPTED_DURING_OUTAGE=0
+LAST_CHECKPOINT="none"
+BUNDLE_SERVER_PID=""
+UPS_HOST_IN_USE=""
 
 while [[ $# -gt 0 ]]; do
   case "$1" in
@@ -80,20 +109,12 @@ while [[ $# -gt 0 ]]; do
       EXECUTE=1
       shift
       ;;
-    --ssh-user)
-      SSH_USER="${2:-}"
-      shift 2
-      ;;
-    --control-planes)
-      CONTROL_PLANES="${2:-}"
-      shift 2
-      ;;
-    --workers)
-      WORKERS="${2:-}"
-      shift 2
-      ;;
     --expected-flux-branch)
-      EXPECTED_FLUX_BRANCH="${2:-}"
+      EXPECTED_FLUX_BRANCH="${2:?missing branch}"
+      shift 2
+      ;;
+    --force-flux-branch)
+      FORCE_FLUX_BRANCH="${2:?missing branch}"
       shift 2
       ;;
     --skip-etcd-snapshot)
@@ -112,38 +133,66 @@ while [[ $# -gt 0 ]]; do
       SKIP_HARBOR_BOOTSTRAP=1
       shift
       ;;
-    --force-flux-branch)
-      FORCE_FLUX_BRANCH="${2:-}"
-      shift 2
+    --skip-harbor-seed)
+      SKIP_HARBOR_SEED=1
+      shift
+      ;;
+    --skip-helper-prewarm)
+      SKIP_HELPER_PREWARM=1
+      shift
       ;;
     --ups-host)
-      UPS_HOST="${2:-}"
+      UPS_HOST="${2:?missing ups host}"
       shift 2
       ;;
     --ups-battery-key)
-      UPS_BATTERY_KEY="${2:-}"
+      UPS_BATTERY_KEY="${2:?missing ups key}"
       shift 2
       ;;
     --min-startup-battery)
-      MIN_STARTUP_BATTERY="${2:-}"
-      shift 2
-      ;;
-    --recovery-state-file)
-      RECOVERY_STATE_FILE="${2:-}"
-      shift 2
-      ;;
-    --drain-timeout)
-      DRAIN_TIMEOUT_SECONDS="${2:-}"
-      shift 2
-      ;;
-    --emergency-drain-timeout)
-      EMERGENCY_DRAIN_TIMEOUT_SECONDS="${2:-}"
+      MIN_STARTUP_BATTERY="${2:?missing battery threshold}"
       shift 2
       ;;
     --require-ups-battery)
       REQUIRE_UPS_BATTERY=1
       shift
       ;;
+    --recovery-state-file)
+      RECOVERY_STATE_FILE="${2:?missing state file path}"
+      shift 2
+      ;;
+    --harbor-bundle-file)
+      HARBOR_BUNDLE_FILE="${2:?missing bundle file path}"
+      shift 2
+      ;;
+    --harbor-target-node)
+      HARBOR_TARGET_NODE="${2:?missing harbor target node}"
+      shift 2
+      ;;
+    --harbor-canary-image)
+      HARBOR_CANARY_IMAGE="${2:?missing canary image}"
+      shift 2
+      ;;
+    --node-helper-image)
+      NODE_HELPER_IMAGE="${2:?missing node helper image}"
+      shift 2
+      ;;
+    --bundle-http-port)
+      BUNDLE_HTTP_PORT="${2:?missing bundle http port}"
+      shift 2
+      ;;
+    --api-wait-timeout)
+      API_WAIT_TIMEOUT_SECONDS="${2:?missing api wait timeout}"
+      shift 2
+      ;;
+    --drain-timeout)
+      DRAIN_TIMEOUT_SECONDS="${2:?missing drain timeout}"
+      shift 2
+      ;;
+    --emergency-drain-timeout)
+      EMERGENCY_DRAIN_TIMEOUT_SECONDS="${2:?missing emergency drain timeout}"
+      shift 2
+      ;;
     -h|--help)
       usage
       exit 0
@@ -165,10 +214,13 @@ require_cmd() {
 }
 
 require_cmd kubectl
-require_cmd ssh
+require_cmd bash
+require_cmd base64
+require_cmd curl
 
 log() { echo "[cluster-power] $*"; }
 warn() { echo "[cluster-power][warn] $*" >&2; }
+die() { echo "[cluster-power][error] $*" >&2; exit 1; }
 
 run() {
   if [[ "${EXECUTE}" -eq 1 ]]; then
@@ -188,81 +240,101 @@ run_shell() {
   fi
 }
 
-as_array_from_csv() {
-  local csv="$1"
-  local out_var="$2"
-  local old_ifs="${IFS}"
-  IFS=',' read -r -a _tmp <<< "${csv}"
-  IFS="${old_ifs}"
-  eval "${out_var}"'=( "${_tmp[@]}" )'
-}
-
-ssh_target() {
-  local node="$1"
-  if [[ -n "${SSH_USER}" ]]; then
-    printf "%s@%s" "${SSH_USER}" "${node}"
+apply_kustomization() {
+  local path="$1"
+  local full_path="${REPO_DIR}/${path}"
+  if [[ "${EXECUTE}" -eq 1 ]]; then
+    log "EXEC: kubectl kustomize ${full_path} --load-restrictor=LoadRestrictionsNone | kubectl apply -f -"
+    kubectl kustomize "${full_path}" --load-restrictor=LoadRestrictionsNone | kubectl apply -f -
   else
-    printf "%s" "${node}"
+    log "DRY-RUN: kubectl kustomize ${full_path} --load-restrictor=LoadRestrictionsNone | kubectl apply -f -"
   fi
 }
 
-discover_workers_csv() {
-  # Include every non-control-plane node by default (workers + accelerators).
-  kubectl get nodes \
-    -o custom-columns=NAME:.metadata.name,CP:.metadata.labels.node-role\\.kubernetes\\.io/control-plane,MASTER:.metadata.labels.node-role\\.kubernetes\\.io/master \
-    --no-headers \
-    | awk '$2=="<none>" && $3=="<none>" {print $1}' \
-  | paste -sd, -
+sanitize_name() {
+  printf '%s' "$1" | tr '[:upper:]' '[:lower:]' | tr -cs 'a-z0-9-' '-'
+}
+
+state_dir() {
+  dirname "${RECOVERY_STATE_FILE}"
 }
 
 load_recovery_state() {
-  if [[ ! -f "${RECOVERY_STATE_FILE}" ]]; then
-    RECOVERY_PENDING=0
-    STARTUP_ATTEMPTED_DURING_OUTAGE=0
-    return 0
-  fi
-
+  RECOVERY_PENDING=0
+  STARTUP_ATTEMPTED_DURING_OUTAGE=0
+  LAST_CHECKPOINT="none"
+  [[ -f "${RECOVERY_STATE_FILE}" ]] || return 0
   while IFS='=' read -r key value; do
     case "${key}" in
-      recovery_pending)
-        RECOVERY_PENDING="${value}"
-        ;;
-      startup_attempted)
-        STARTUP_ATTEMPTED_DURING_OUTAGE="${value}"
-        ;;
+      recovery_pending) RECOVERY_PENDING="${value}" ;;
+      startup_attempted) STARTUP_ATTEMPTED_DURING_OUTAGE="${value}" ;;
+      last_checkpoint) LAST_CHECKPOINT="${value}" ;;
     esac
   done < "${RECOVERY_STATE_FILE}"
 }
 
 save_recovery_state() {
-  mkdir -p "$(dirname "${RECOVERY_STATE_FILE}")"
-  cat > "${RECOVERY_STATE_FILE}" <<EOF
+  [[ "${EXECUTE}" -eq 1 ]] || return 0  # dry-run never writes the state file
+  mkdir -p "$(state_dir)"
+  cat > "${RECOVERY_STATE_FILE}" <<STATE
 recovery_pending=${1}
 startup_attempted=${2}
-EOF
+last_checkpoint=${3:-${LAST_CHECKPOINT:-none}}
+STATE
+}
+
+mark_checkpoint() {
+  LAST_CHECKPOINT="$1"
+  save_recovery_state "${RECOVERY_PENDING}" "${STARTUP_ATTEMPTED_DURING_OUTAGE}" "${LAST_CHECKPOINT}"
 }
 
 clear_recovery_state() {
-  if [[ -f "${RECOVERY_STATE_FILE}" ]]; then
-    rm -f "${RECOVERY_STATE_FILE}"
+  [[ "${EXECUTE}" -eq 1 ]] || return 0
+  rm -f "${RECOVERY_STATE_FILE}" 2>/dev/null || true
+  LAST_CHECKPOINT="none"
+}
+
+sanitize_battery_percent() {  # $1: raw upsc output; prints an integer percent, or returns 1 if none can be parsed
+  local raw="$1"
+  raw="${raw##*:}"  # drop everything up to the last ':' (e.g. a "key:" prefix)
+  raw="${raw//[[:space:]]/}"
+  raw="${raw%%.*}"  # truncate any decimal part
+  [[ "${raw}" =~ ^[0-9]+$ ]] || return 1
+  printf '%s' "$((10#${raw}))"  # force base 10 so "08"/"09" do not break later (( )) comparisons as invalid octal
+}
+
+candidate_ups_hosts() {
+  local candidate name
+  local -A seen=()
+  if [[ -n "${UPS_HOST}" ]]; then
+    seen["${UPS_HOST}"]=1
+    echo "${UPS_HOST}"
   fi
+  while IFS= read -r name; do
+    [[ -n "${name}" ]] || continue
+    for candidate in "${name}@localhost" "${name}"; do
+      [[ -n "${seen[${candidate}]+x}" ]] && continue
+      seen["${candidate}"]=1
+      echo "${candidate}"
+    done
+  done < <(upsc -l 2>/dev/null || true)
 }
 
 read_ups_battery() {
   if ! command -v upsc >/dev/null 2>&1; then
     return 1
   fi
-  local raw
-  raw="$(upsc "${UPS_HOST}" "${UPS_BATTERY_KEY}" 2>/dev/null || true)"
-  if [[ -z "${raw}" ]]; then
-    return 1
-  fi
-  # battery.charge can include units/decimals in some setups; normalize.
-  raw="${raw%%.*}"
-  if ! [[ "${raw}" =~ ^[0-9]+$ ]]; then
-    return 1
-  fi
-  echo "${raw}"
+  local host raw parsed
+  while IFS= read -r host; do
+    raw="$(upsc "${host}" "${UPS_BATTERY_KEY}" 2>/dev/null || true)"
+    [[ -n "${raw}" ]] || continue
+    parsed="$(sanitize_battery_percent "${raw}" || true)"
+    [[ -n "${parsed}" ]] || continue
+    UPS_HOST_IN_USE="${host}"
+    printf '%s' "${parsed}"
+    return 0
+  done < <(candidate_ups_hosts)
+  return 1
 }
 
 ensure_minimum_battery_for_bootstrap() {
@@ -276,8 +348,7 @@ ensure_minimum_battery_for_bootstrap() {
     warn "Unable to read UPS battery status; continuing without hard battery gating."
     return 0
   fi
-
-  log "ups-battery=${battery}%"
+  log "ups-battery=${battery}% host=${UPS_HOST_IN_USE:-${UPS_HOST}}"
   if (( battery < MIN_STARTUP_BATTERY )); then
     warn "UPS battery ${battery}% below minimum startup threshold ${MIN_STARTUP_BATTERY}%."
     return 1
@@ -285,14 +356,36 @@ ensure_minimum_battery_for_bootstrap() {
   return 0
 }
 
-emergency_shutdown_after_outage() {
-  warn "Entering outage-aware emergency shutdown path due insufficient startup budget."
-  patch_flux_suspend_all true || true
-  best_effort_scale_down_apps
-  # Give the cluster one short chance to drain, then force progress.
-  best_effort_drain_workers "${EMERGENCY_DRAIN_TIMEOUT_SECONDS}" "${WORKER_NODES[@]}"
-  stop_workers_agents "${WORKER_NODES[@]}"
-  stop_control_planes "${CONTROL_PLANE_NODES[@]}"
+report_flux_source_state() {
+  local flux_url flux_branch
+  flux_url="$(kubectl -n flux-system get gitrepository flux-system -o jsonpath='{.spec.url}' 2>/dev/null || true)"
+  flux_branch="$(kubectl -n flux-system get gitrepository flux-system -o jsonpath='{.spec.ref.branch}' 2>/dev/null || true)"
+  [[ -n "${flux_url}" ]] && log "flux-source-url=${flux_url}"
+  if [[ -n "${flux_branch}" ]]; then
+    log "flux-source-branch=${flux_branch}"
+    if [[ "${MODE}" == "startup" && -z "${FORCE_FLUX_BRANCH}" && "${flux_branch}" != "${EXPECTED_FLUX_BRANCH}" ]]; then
+      warn "Flux source branch is '${flux_branch}'. Expected '${EXPECTED_FLUX_BRANCH}' for canonical recovery."
+    fi
+  fi
+}
+
+wait_for_api() {
+  local attempts=$(( API_WAIT_TIMEOUT_SECONDS / 5 ))
+  if (( attempts < 1 )); then
+    attempts=1
+  fi
+  if [[ "${EXECUTE}" -eq 0 ]]; then
+    log "DRY-RUN: skipping live Kubernetes API wait"
+    return 0
+  fi
+  local i
+  for i in $(seq 1 "${attempts}"); do
+    if kubectl version --request-timeout=5s >/dev/null 2>&1; then
+      return 0
+    fi
+    sleep 5
+  done
+  return 1
 }
 
 patch_flux_suspend_all() {
@@ -317,39 +410,6 @@ patch_flux_suspend_all() {
   done <<< "${hr_list}"
 }
 
-report_flux_source_state() {
-  local flux_url flux_branch
-  flux_url="$(kubectl -n flux-system get gitrepository flux-system -o jsonpath='{.spec.url}' 2>/dev/null || true)"
-  flux_branch="$(kubectl -n flux-system get gitrepository flux-system -o jsonpath='{.spec.ref.branch}' 2>/dev/null || true)"
-
-  if [[ -n "${flux_url}" ]]; then
-    log "flux-source-url=${flux_url}"
-  fi
-  if [[ -n "${flux_branch}" ]]; then
-    log "flux-source-branch=${flux_branch}"
-    if [[ "${MODE}" == "startup" && -z "${FORCE_FLUX_BRANCH}" && "${flux_branch}" != "${EXPECTED_FLUX_BRANCH}" ]]; then
-      warn "Flux source branch is '${flux_branch}'. Expected '${EXPECTED_FLUX_BRANCH}' for canonical cold-start recovery. Use --force-flux-branch ${EXPECTED_FLUX_BRANCH} if needed."
-    fi
-  fi
-}
-
-wait_for_api() {
-  local attempts="${1:-90}"
-  local sleep_s="${2:-2}"
-  if [[ "${EXECUTE}" -eq 0 ]]; then
-    log "DRY-RUN: skipping live Kubernetes API wait"
-    return 0
-  fi
-  local i
-  for i in $(seq 1 "${attempts}"); do
-    if kubectl version --request-timeout=5s >/dev/null 2>&1; then
-      return 0
-    fi
-    sleep "${sleep_s}"
-  done
-  return 1
-}
-
 best_effort_scale_down_apps() {
   local excludes='^(kube-system|kube-public|kube-node-lease|flux-system|traefik|metallb-system|cert-manager|longhorn-system|vault|postgres|maintenance)$'
   local ns_list
@@ -364,6 +424,23 @@ best_effort_scale_down_apps() {
   done <<< "${ns_list}"
 }
 
+discover_workers_csv() {
+  kubectl get nodes \
+    -o custom-columns=NAME:.metadata.name,CP:.metadata.labels.node-role\\.kubernetes\\.io/control-plane,MASTER:.metadata.labels.node-role\\.kubernetes\\.io/master \
+    --no-headers \
+    | awk '$2=="<none>" && $3=="<none>" {print $1}' \
+    | paste -sd, -
+}
+
+as_array_from_csv() {
+  local csv="$1"
+  local out_var="$2"
+  local old_ifs="${IFS}"
+  IFS=',' read -r -a _tmp <<< "${csv}"
+  IFS="${old_ifs}"
+  eval "${out_var}"'=( "${_tmp[@]}" )'
+}
+
 best_effort_drain_workers() {
   local timeout_seconds="$1"
   shift || true
@@ -384,183 +461,583 @@ best_effort_drain_workers() {
   done
 }
 
-stop_workers_agents() {
-  local workers=("$@")
-  local node target
-  for node in "${workers[@]}"; do
-    [[ -z "${node}" ]] && continue
-    target="$(ssh_target "${node}")"
-    run ssh -o BatchMode=yes -o ConnectTimeout=8 "${target}" "sudo systemctl stop k3s-agent || true"
+wait_for_rollout() {
+  local namespace="$1"
+  local kind="$2"
+  local name="$3"
+  local timeout="$4"
+  if [[ "${EXECUTE}" -eq 0 ]]; then
+    log "DRY-RUN: kubectl -n ${namespace} rollout status ${kind}/${name} --timeout=${timeout}"
+    return 0
+  fi
+  kubectl -n "${namespace}" rollout status "${kind}/${name}" --timeout="${timeout}"
+}
+
+check_ingress_stack() {
+  kubectl get ingressclass traefik >/dev/null
+  wait_for_rollout traefik deployment traefik 5m
+}
+
+check_longhorn_stack() {
+  wait_for_rollout longhorn-system daemonset longhorn-manager 10m
+  wait_for_rollout longhorn-system deployment longhorn-ui 10m
+}
+
+check_vault_stack() {
+  wait_for_rollout vault statefulset vault 10m
+  if [[ "${EXECUTE}" -eq 1 ]]; then
+    kubectl -n vault exec vault-0 -- sh -ceu 'VAULT_ADDR=http://127.0.0.1:8200 vault status >/dev/null'
+  fi
+}
+
+check_postgres_stack() {
+  wait_for_rollout postgres statefulset postgres 10m
+  if [[ "${EXECUTE}" -eq 1 ]]; then
+    kubectl -n postgres exec postgres-0 -c postgres -- sh -ceu 'pg_isready -h 127.0.0.1 -p 5432 >/dev/null'
+  fi
+}
+
+check_gitea_stack() {
+  wait_for_rollout gitea deployment gitea 10m
+}
+
+check_harbor_stack() {
+  wait_for_rollout harbor statefulset harbor-redis 10m
+  wait_for_rollout harbor deployment harbor-core 10m
+  wait_for_rollout harbor deployment harbor-jobservice 10m
+  wait_for_rollout harbor deployment harbor-portal 10m
+  wait_for_rollout harbor deployment harbor-registry 10m
+}
+
+check_harbor_endpoint() {
+  if [[ "${EXECUTE}" -eq 0 ]]; then
+    log "DRY-RUN: curl -ksS -o /dev/null -w '%{http_code}' https://registry.bstein.dev/v2/"
+    return 0
+  fi
+  local code
+  code="$(curl -ksS -o /dev/null -w '%{http_code}' https://registry.bstein.dev/v2/ || true)"
+  case "${code}" in
+    200|401)
+      log "harbor-endpoint=http-${code}"
+      ;;
+    *)
+      die "Harbor endpoint check failed with HTTP ${code:-unknown}"
+      ;;
+  esac
+}
+
+wait_for_pod_phase() {
+  local namespace="$1"
+  local pod="$2"
+  local expected_phase="$3"
+  local timeout_seconds="$4"
+  local start now phase
+  start="$(date +%s)"
+  while true; do
+    phase="$(kubectl -n "${namespace}" get pod "${pod}" -o jsonpath='{.status.phase}' 2>/dev/null || true)"
+    if [[ "${phase}" == "${expected_phase}" ]]; then
+      return 0
+    fi
+    if [[ "${phase}" == "Failed" ]]; then
+      return 1
+    fi
+    now="$(date +%s)"
+    if (( now - start >= timeout_seconds )); then
+      return 1
+    fi
+    sleep 2
   done
 }
 
-start_workers_agents() {
-  local workers=("$@")
-  local node target
-  for node in "${workers[@]}"; do
-    [[ -z "${node}" ]] && continue
-    target="$(ssh_target "${node}")"
-    run ssh -o BatchMode=yes -o ConnectTimeout=8 "${target}" "sudo systemctl start k3s-agent || true"
-  done
+harbor_is_ready() {
+  kubectl -n harbor get deploy harbor-core harbor-jobservice harbor-portal harbor-registry >/dev/null 2>&1 || return 1
+  local code
+  code="$(curl -ksS -o /dev/null -w '%{http_code}' https://registry.bstein.dev/v2/ || true)"
+  [[ "${code}" == "200" || "${code}" == "401" ]]
 }
 
-stop_control_planes() {
-  local cps=("$@")
-  local node target
-  for node in "${cps[@]}"; do
-    [[ -z "${node}" ]] && continue
-    target="$(ssh_target "${node}")"
-    run ssh -o BatchMode=yes -o ConnectTimeout=8 "${target}" "sudo systemctl stop k3s || true"
-  done
+run_harbor_pull_canary() {
+  local pod="hecate-harbor-canary"
+  if [[ "${EXECUTE}" -eq 0 ]]; then
+    log "DRY-RUN: create Harbor pull canary pod with ${HARBOR_CANARY_IMAGE} on ${HARBOR_CANARY_NODE}"
+    return 0
+  fi
+  timeout 20 kubectl -n "${NODE_HELPER_NAMESPACE}" delete pod "${pod}" --ignore-not-found --wait=false >/dev/null 2>&1 || true
+  cat <<CANARY | kubectl apply -f -
+apiVersion: v1
+kind: Pod
+metadata:
+  name: ${pod}
+  namespace: ${NODE_HELPER_NAMESPACE}
+spec:
+  nodeName: ${HARBOR_CANARY_NODE}
+  restartPolicy: Never
+  imagePullSecrets:
+    - name: ${REGISTRY_PULL_SECRET}
+  tolerations:
+    - operator: Exists
+  containers:
+    - name: canary
+      image: ${HARBOR_CANARY_IMAGE}
+      imagePullPolicy: Always
+      command: ["sh", "-ceu", "echo harbor-canary-ok"]
+CANARY
+  if ! wait_for_pod_phase "${NODE_HELPER_NAMESPACE}" "${pod}" Succeeded 180; then
+    kubectl -n "${NODE_HELPER_NAMESPACE}" describe pod "${pod}" >&2 || true
+    timeout 20 kubectl -n "${NODE_HELPER_NAMESPACE}" logs "${pod}" >&2 || true
+    timeout 20 kubectl -n "${NODE_HELPER_NAMESPACE}" delete pod "${pod}" --ignore-not-found --wait=false >/dev/null 2>&1 || true
+    return 1
+  fi
+  timeout 20 kubectl -n "${NODE_HELPER_NAMESPACE}" logs "${pod}" || true
+  timeout 20 kubectl -n "${NODE_HELPER_NAMESPACE}" delete pod "${pod}" --ignore-not-found --wait=false >/dev/null 2>&1 || true
 }
 
-start_control_planes() {
-  local cps=("$@")
-  local node target
-  for node in "${cps[@]}"; do
-    [[ -z "${node}" ]] && continue
-    target="$(ssh_target "${node}")"
-    run ssh -o BatchMode=yes -o ConnectTimeout=8 "${target}" "sudo systemctl start k3s || true"
-  done
+run_helper_pod() {
+  local node="$1"
+  local purpose="$2"
+  local timeout_seconds="$3"
+  local script_content="$4"
+  local pod="hecate-$(sanitize_name "${purpose}")-$(date +%H%M%S)"
+  local encoded_script
+  encoded_script="$(printf '%s' "${script_content}" | base64 -w0)"
+
+  if [[ "${EXECUTE}" -eq 0 ]]; then
+    log "DRY-RUN: helper pod ${pod} on ${node} for ${purpose}"
+    return 0
+  fi
+
+  cat <<POD | kubectl apply -f -
+apiVersion: v1
+kind: Pod
+metadata:
+  name: ${pod}
+  namespace: ${NODE_HELPER_NAMESPACE}
+spec:
+  nodeName: ${node}
+  restartPolicy: Never
+  serviceAccountName: ${NODE_HELPER_SERVICE_ACCOUNT}
+  imagePullSecrets:
+    - name: ${REGISTRY_PULL_SECRET}
+  hostNetwork: true
+  hostPID: true
+  tolerations:
+    - operator: Exists
+  containers:
+    - name: helper
+      image: ${NODE_HELPER_IMAGE}
+      imagePullPolicy: IfNotPresent
+      securityContext:
+        privileged: true
+      command: ["/bin/bash", "-ceu"]
+      args:
+        - |
+          printf '%s' '${encoded_script}' | base64 -d >/tmp/hecate-step.sh
+          chmod +x /tmp/hecate-step.sh
+          /tmp/hecate-step.sh
+POD
+
+  if ! wait_for_pod_phase "${NODE_HELPER_NAMESPACE}" "${pod}" Succeeded "${timeout_seconds}"; then
+    kubectl -n "${NODE_HELPER_NAMESPACE}" describe pod "${pod}" >&2 || true
+    timeout 20 kubectl -n "${NODE_HELPER_NAMESPACE}" logs "${pod}" >&2 || true
+    timeout 20 kubectl -n "${NODE_HELPER_NAMESPACE}" delete pod "${pod}" --ignore-not-found --wait=false >/dev/null 2>&1 || true
+    return 1
+  fi
+  timeout 20 kubectl -n "${NODE_HELPER_NAMESPACE}" logs "${pod}" || true
+  timeout 20 kubectl -n "${NODE_HELPER_NAMESPACE}" delete pod "${pod}" --ignore-not-found --wait=false >/dev/null 2>&1 || true
 }
 
-take_etcd_snapshot() {
-  local cp="$1"
-  local target
-  target="$(ssh_target "${cp}")"
-  local ts
-  ts="$(date +%Y%m%d-%H%M%S)"
-  run ssh -o BatchMode=yes -o ConnectTimeout=8 "${target}" \
-    "sudo k3s etcd-snapshot save --name pre-shutdown-${ts}"
+run_host_command_via_helper() {
+  local node="$1"
+  local purpose="$2"
+  local timeout_seconds="$3"
+  local host_command="$4"
+  local encoded_command
+  encoded_command="$(printf '%s' "${host_command}" | base64 -w0)"
+  local script_content
+  script_content=$(cat <<SCRIPT
+set -euo pipefail
+HOST_COMMAND="\$(printf '%s' '${encoded_command}' | base64 -d)"
+nsenter --target 1 --mount --uts --ipc --net --pid /bin/sh -ceu "\${HOST_COMMAND}"
+SCRIPT
+)
+  run_helper_pod "${node}" "${purpose}" "${timeout_seconds}" "${script_content}"
+}
+
+schedule_host_shutdown_via_helper() {
+  local node="$1"
+  local service_name="$2"
+  local delay_seconds="$3"
+  local host_command
+  host_command="/usr/bin/systemd-run --unit hecate-shutdown-${service_name} --on-active=${delay_seconds}s /bin/sh -lc '/usr/bin/systemctl stop ${service_name} || true; /usr/bin/systemctl poweroff || true'"
+  run_host_command_via_helper "${node}" "shutdown-${node}-${service_name}" 120 "${host_command}"
+}
+
+prewarm_node_helper_image() {
+  local name="hecate-node-helper-prewarm"
+  if [[ "${EXECUTE}" -eq 0 ]]; then
+    log "DRY-RUN: prewarm ${NODE_HELPER_IMAGE} via temporary DaemonSet"
+    return 0
+  fi
+  cat <<DS | kubectl apply -f -
+apiVersion: apps/v1
+kind: DaemonSet
+metadata:
+  name: ${name}
+  namespace: ${NODE_HELPER_NAMESPACE}
+spec:
+  selector:
+    matchLabels:
+      app: ${name}
+  template:
+    metadata:
+      labels:
+        app: ${name}
+    spec:
+      imagePullSecrets:
+        - name: ${REGISTRY_PULL_SECRET}
+      tolerations:
+        - operator: Exists
+      containers:
+        - name: helper
+          image: ${NODE_HELPER_IMAGE}
+          imagePullPolicy: Always
+          command: ["/bin/sh", "-ceu", "sleep 300"]
+DS
+  local i desired ready
+  for i in $(seq 1 90); do
+    desired="$(kubectl -n "${NODE_HELPER_NAMESPACE}" get ds "${name}" -o jsonpath='{.status.desiredNumberScheduled}' 2>/dev/null || echo 0)"
+    ready="$(kubectl -n "${NODE_HELPER_NAMESPACE}" get ds "${name}" -o jsonpath='{.status.numberReady}' 2>/dev/null || echo 0)"
+    [[ -n "${desired}" ]] || desired=0
+    [[ -n "${ready}" ]] || ready=0
+    if [[ "${desired}" != "0" && "${desired}" == "${ready}" ]]; then
+      log "node-helper-prewarm=${ready}/${desired}"
+      kubectl -n "${NODE_HELPER_NAMESPACE}" delete ds "${name}" --ignore-not-found >/dev/null 2>&1 || true
+      return 0
+    fi
+    sleep 2
+  done
+  kubectl -n "${NODE_HELPER_NAMESPACE}" describe ds "${name}" >&2 || true
+  kubectl -n "${NODE_HELPER_NAMESPACE}" get pods -l app="${name}" >&2 || true
+  kubectl -n "${NODE_HELPER_NAMESPACE}" delete ds "${name}" --ignore-not-found >/dev/null 2>&1 || true
+  die "Timed out prewarming node helper image ${NODE_HELPER_IMAGE}"
+}
+
+start_bundle_server() {  # serves the directory holding HARBOR_BUNDLE_FILE over HTTP; sets BUNDLE_SERVER_PID
+  [[ -f "${HARBOR_BUNDLE_FILE}" ]] || die "Harbor bundle not found at ${HARBOR_BUNDLE_FILE}"
+  require_cmd python3
+  local bundle_dir bundle_name
+  bundle_dir="$(dirname "${HARBOR_BUNDLE_FILE}")"
+  bundle_name="$(basename "${HARBOR_BUNDLE_FILE}")"
+  if [[ "${EXECUTE}" -eq 0 ]]; then
+    log "DRY-RUN: serve ${bundle_name} from ${bundle_dir} on port ${BUNDLE_HTTP_PORT}"
+    return 0
+  fi
+  python3 -m http.server "${BUNDLE_HTTP_PORT}" --bind 0.0.0.0 --directory "${bundle_dir}" </dev/null >/tmp/hecate-bundle-server.log 2>&1 &
+  BUNDLE_SERVER_PID=$!  # stop_bundle_server uses this PID to kill the server
+  for _ in $(seq 1 20); do  # poll once per second, up to 20 attempts
+    if curl -fsSI "http://127.0.0.1:${BUNDLE_HTTP_PORT}/${bundle_name}" >/dev/null 2>&1; then  # HEAD only: probe readiness without downloading the whole bundle
+      return 0
+    fi
+    sleep 1
+  done
+  die "Temporary bundle server did not become ready; see /tmp/hecate-bundle-server.log"
+}
+
+stop_bundle_server() {
+  if [[ -n "${BUNDLE_SERVER_PID}" ]]; then
+    kill "${BUNDLE_SERVER_PID}" >/dev/null 2>&1 || true
+    for _ in $(seq 1 10); do
+      kill -0 "${BUNDLE_SERVER_PID}" >/dev/null 2>&1 || break
+      sleep 1
+    done
+    BUNDLE_SERVER_PID=""
+  fi
+}
+trap stop_bundle_server EXIT
+
+control_host_ip() {  # prints the first address reported by `hostname -I`; returns non-zero with a stderr message when there is none
+  hostname -I | awk 'NF && !ok { print $1; ok = 1 } END { if (!ok) { print "[cluster-power][error] no host IP address reported by hostname -I" > "/dev/stderr"; exit 1 } }'
+}
+
+# Import the Harbor bootstrap bundle into containerd on HARBOR_TARGET_NODE.
+# Globals:  HARBOR_BUNDLE_FILE, BOOTSTRAP_DIR, BUNDLE_HTTP_PORT,
+#           HARBOR_TARGET_NODE (all read).
+# Flow:     serve the bundle via start_bundle_server, then hand run_helper_pod
+#           a script that downloads it, decompresses it with zstd and pipes it
+#           into `k3s ctr images import` inside PID 1's namespaces (nsenter).
+#           The script then verifies every image listed in
+#           harbor-bootstrap-images.txt shows up in `ctr images ls`.
+# Returns:  the helper pod's exit status on failure; on success records the
+#           startup_harbor_seeded checkpoint.
+seed_harbor_images() {
+  local images_file images_text control_ip bundle_name script_content seed_rc=0
+  images_file="${BOOTSTRAP_DIR}/harbor-bootstrap-images.txt"
+  [[ -f "${HARBOR_BUNDLE_FILE}" ]] || die "Harbor bundle not found at ${HARBOR_BUNDLE_FILE}"
+  # Drop comment lines and blank lines from the image list.
+  images_text="$(sed '/^[[:space:]]*#/d;/^[[:space:]]*$/d' "${images_file}")"
+  [[ -n "${images_text}" ]] || die "No Harbor images listed in ${images_file}"
+  bundle_name="$(basename "${HARBOR_BUNDLE_FILE}")"
+  # Resolve the download address before starting the server so a failure here
+  # leaves nothing to clean up and never produces a URL with an empty host.
+  control_ip="$(control_host_ip)"
+  [[ -n "${control_ip}" ]] || die "Unable to determine the control host IP address for the bundle URL"
+  start_bundle_server
+  # Unquoted heredoc: control_ip, BUNDLE_HTTP_PORT, bundle_name and images_text
+  # are expanded here; \$-escaped expansions are evaluated by the helper.
+  script_content=$(cat <<SCRIPT
+set -euo pipefail
+curl -fsSL "http://${control_ip}:${BUNDLE_HTTP_PORT}/${bundle_name}" \
+  | zstd -dc \
+  | nsenter --target 1 --mount --uts --ipc --net --pid /usr/local/bin/k3s ctr images import -
+present="\$(nsenter --target 1 --mount --uts --ipc --net --pid /usr/local/bin/k3s ctr images ls | awk '{print \$1}')"
+while IFS= read -r image; do
+  [[ -z "\${image}" ]] && continue
+  if ! printf '%s\n' "\${present}" | grep -Fx -- "\${image}" >/dev/null; then
+    echo "Expected image missing after import: \${image}" >&2
+    exit 1
+  fi
+done <<'IMAGES'
+${images_text}
+IMAGES
+SCRIPT
+)
+  # NOTE(review): 900 is presumably a timeout in seconds; confirm against run_helper_pod.
+  # Capture the status instead of aborting so the server is always stopped.
+  run_helper_pod "${HARBOR_TARGET_NODE}" "harbor-seed" 900 "${script_content}" || seed_rc=$?
+  stop_bundle_server
+  [[ "${seed_rc}" -eq 0 ]] || return "${seed_rc}"
+  mark_checkpoint startup_harbor_seeded
 }
 
 bootstrap_local_minimal() {
-  # Local apply path to break Flux<->Gitea boot deadlock during cold-start recovery.
-  # Longhorn is applied before stateful workloads so astreae-backed PVCs can bind.
-  run kubectl apply -k infrastructure/core
-  run kubectl apply -k infrastructure/sources/helm
-  run kubectl apply -k infrastructure/longhorn/core
-  run kubectl apply -k infrastructure/metallb
-  run kubectl apply -k infrastructure/traefik
-  run kubectl apply -k infrastructure/vault-csi
-  run kubectl apply -k infrastructure/vault-injector
-  run kubectl apply -k services/vault
-  run kubectl apply -k infrastructure/postgres
-  run kubectl apply -k services/gitea
+  # Local apply path to break Flux<->Gitea boot deadlock during cold-start recovery.
+  # Longhorn is applied before stateful workloads so astreae-backed PVCs can bind.
+  # NOTE(review): apply_kustomization is defined elsewhere; presumably it wraps
+  # `kubectl apply -k <path>` like the calls it replaces. Confirm.
+  apply_kustomization infrastructure/core
+  apply_kustomization infrastructure/sources/helm
+  apply_kustomization infrastructure/longhorn/core
+  apply_kustomization infrastructure/metallb
+  apply_kustomization infrastructure/traefik
+  apply_kustomization infrastructure/vault-csi
+  apply_kustomization infrastructure/vault-injector
+  apply_kustomization services/vault
+  apply_kustomization infrastructure/postgres
+  apply_kustomization services/gitea
 }
 
 bootstrap_local_harbor() {
-  # Optional Harbor bootstrap stage for environments where Harbor is authoritative for images.
-  run kubectl apply -k services/harbor
+  # Optional Harbor bootstrap stage for environments where Harbor is authoritative for images.
+  apply_kustomization services/harbor
+}
+
+# Reconcile one named stage of Flux kustomizations.
+# Arguments: $1 - stage name (used for the "reconciled_<stage>" checkpoint)
+#            $2.. - kustomization names in the flux-system namespace
+# With the flux CLI available, each kustomization is reconciled in order and
+# the checkpoint is recorded. Without it, every kustomization is annotated
+# with a fresh reconcile.fluxcd.io/requestedAt timestamp and no checkpoint
+# is written.
+reconcile_stage() {
+  local stage="$1" kustomization requested_at
+  shift
+  if command -v flux >/dev/null 2>&1; then
+    for kustomization in "$@"; do
+      run flux reconcile kustomization "${kustomization}" -n flux-system --with-source --timeout=15m
+    done
+    mark_checkpoint "reconciled_${stage}"
+    return
+  fi
+  requested_at="$(date --iso-8601=seconds)"
+  run kubectl -n flux-system annotate kustomizations.kustomize.toolkit.fluxcd.io --all reconcile.fluxcd.io/requestedAt="${requested_at}" --overwrite
+  return 0
 }
 
 resume_flux_and_reconcile() {
+  # Calls patch_flux_suspend_all with "false", then reconciles in three stages
+  # (core, stateful, registry), running that stage's check_* helpers before
+  # the next stage is requested.
   patch_flux_suspend_all false
-
   if command -v flux >/dev/null 2>&1; then
+    # Refresh the Git source once up front when the flux CLI is installed.
     run flux reconcile source git flux-system -n flux-system --timeout=3m
-    run flux reconcile kustomization core -n flux-system --with-source --timeout=5m
-    run flux reconcile kustomization helm -n flux-system --with-source --timeout=5m
-    run flux reconcile kustomization longhorn -n flux-system --with-source --timeout=15m
-    run flux reconcile kustomization metallb -n flux-system --with-source --timeout=5m
-    run flux reconcile kustomization traefik -n flux-system --with-source --timeout=5m
-    run flux reconcile kustomization vault-csi -n flux-system --with-source --timeout=5m
-    run flux reconcile kustomization vault-injector -n flux-system --with-source --timeout=5m
-    run flux reconcile kustomization vault -n flux-system --with-source --timeout=10m
-    run flux reconcile kustomization postgres -n flux-system --with-source --timeout=10m
-    run flux reconcile kustomization gitea -n flux-system --with-source --timeout=10m
-    run flux reconcile kustomization harbor -n flux-system --with-source --timeout=15m
-  else
-    local now
-    now="$(date --iso-8601=seconds)"
-    run kubectl -n flux-system annotate kustomizations.kustomize.toolkit.fluxcd.io --all reconcile.fluxcd.io/requestedAt="${now}" --overwrite
   fi
+  # reconcile_stage takes the stage name first, then the kustomization names.
+  # Stage "core": base infrastructure, followed by ingress and Longhorn checks.
+  reconcile_stage core core helm longhorn metallb traefik vault-csi vault-injector
+  check_ingress_stack
+  check_longhorn_stack
+  # Stage "stateful": vault, postgres and gitea, each checked afterwards.
+  reconcile_stage stateful vault postgres gitea
+  check_vault_stack
+  check_postgres_stack
+  check_gitea_stack
+  # Stage "registry": Harbor, then its endpoint check and the pull canary.
+  reconcile_stage registry harbor
+  check_harbor_stack
+  check_harbor_endpoint
+  run_harbor_pull_canary
 }
 
-as_array_from_csv "${CONTROL_PLANES}" CONTROL_PLANE_NODES
-if [[ -z "${WORKERS}" ]]; then
-  WORKERS="$(discover_workers_csv 2>/dev/null || true)"
-  if [[ -z "${WORKERS}" ]]; then
-    warn "Unable to auto-discover workers from the API; falling back to static atlas worker inventory."
-    WORKERS="${DEFAULT_WORKERS}"
+# Print "<key>=true" when `kubectl <args...>` succeeds, "<key>=false" otherwise.
+# Arguments: $1 - output key; $2.. - arguments passed to kubectl
+_status_kubectl_flag() {
+  local key="$1"
+  shift
+  if kubectl "$@" >/dev/null 2>&1; then
+    echo "${key}=true"
+  else
+    echo "${key}=false"
+  fi
+}
+
+# Print a flat key=value status snapshot to stdout.
+# Globals: HARBOR_BUNDLE_FILE, NODE_HELPER_IMAGE, HARBOR_TARGET_NODE,
+#          RECOVERY_PENDING, STARTUP_ATTEMPTED_DURING_OUTAGE, LAST_CHECKPOINT,
+#          UPS_HOST_IN_USE, UPS_HOST (all read).
+# Every probe is best-effort: a failing probe yields an empty value, "unknown"
+# or "false" instead of aborting the report.
+status_report() {
+  local battery flux_ready harbor_code workers bundle_present=false
+  battery="$(read_ups_battery || true)"
+  flux_ready="$(kubectl -n flux-system get gitrepository flux-system -o jsonpath='{.status.conditions[?(@.type=="Ready")].status}' 2>/dev/null || true)"
+  # Bound the registry probe so `status` cannot hang when the endpoint is
+  # unreachable; curl has no overall time limit by default.
+  harbor_code="$(curl -ksS --connect-timeout 5 --max-time 10 -o /dev/null -w '%{http_code}' https://registry.bstein.dev/v2/ || true)"
+  workers="$(discover_workers_csv 2>/dev/null || true)"
+  if [[ -f "${HARBOR_BUNDLE_FILE}" ]]; then
+    bundle_present=true
+  fi
+  echo "mode=status"
+  echo "bundle_file=${HARBOR_BUNDLE_FILE}"
+  echo "bundle_present=${bundle_present}"
+  echo "node_helper_image=${NODE_HELPER_IMAGE}"
+  echo "harbor_target_node=${HARBOR_TARGET_NODE}"
+  echo "workers=${workers}"
+  echo "recovery_pending=${RECOVERY_PENDING}"
+  echo "startup_attempted=${STARTUP_ATTEMPTED_DURING_OUTAGE}"
+  echo "last_checkpoint=${LAST_CHECKPOINT}"
+  echo "ups_host=${UPS_HOST_IN_USE:-${UPS_HOST}}"
+  echo "ups_battery=${battery:-unknown}"
+  echo "flux_source_ready=${flux_ready:-unknown}"
+  echo "harbor_http=${harbor_code:-unknown}"
+  # Existence checks only: these report whether the object exists, not
+  # whether it is healthy.
+  _status_kubectl_flag traefik_ingressclass get ingressclass traefik
+  _status_kubectl_flag traefik_deploy -n traefik get deploy traefik
+  _status_kubectl_flag longhorn_manager -n longhorn-system get ds longhorn-manager
+  _status_kubectl_flag vault_statefulset -n vault get sts vault
+  _status_kubectl_flag postgres_statefulset -n postgres get sts postgres
+  _status_kubectl_flag gitea_deploy -n gitea get deploy gitea
+  _status_kubectl_flag harbor_deploy -n harbor get deploy harbor-core
+}
+
+# Orderly cluster shutdown. In order: record recovery state, optionally
+# prewarm the node helper image, optionally take an etcd snapshot, suspend
+# Flux, scale applications down, optionally drain workers, then schedule host
+# shutdown on every worker followed by every control plane.
+# Globals: SKIP_HELPER_PREWARM, SKIP_ETCD_SNAPSHOT, SKIP_DRAIN,
+#          DRAIN_TIMEOUT_SECONDS (read); RECOVERY_PENDING,
+#          STARTUP_ATTEMPTED_DURING_OUTAGE, WORKER_NODES,
+#          CONTROL_PLANE_NODES (written).
+planned_shutdown() {
+  local workers_csv
+  # NOTE(review): discovery errors are swallowed, so workers_csv may be empty
+  # and no worker shutdown gets scheduled. The previous top-level code fell
+  # back to DEFAULT_WORKERS with a warning; confirm dropping that is intended.
+  workers_csv="$(discover_workers_csv 2>/dev/null || true)"
+  as_array_from_csv "${workers_csv}" WORKER_NODES
+  # NOTE(review): control planes are hard-coded here; the previous code read
+  # them from ${CONTROL_PLANES}.
+  as_array_from_csv "titan-0a,titan-0b,titan-0c" CONTROL_PLANE_NODES
+
+  RECOVERY_PENDING=1
+  STARTUP_ATTEMPTED_DURING_OUTAGE=0
+  # Persist the state before anything disruptive happens. The arguments mirror
+  # the two flags set above plus a checkpoint label (presumably; confirm
+  # against save_recovery_state).
+  save_recovery_state 1 0 shutdown_started
+
+  if [[ "${SKIP_HELPER_PREWARM}" -eq 0 ]]; then
+    prewarm_node_helper_image
+    mark_checkpoint shutdown_helper_prewarmed
   fi
-fi
-as_array_from_csv "${WORKERS}" WORKER_NODES
-load_recovery_state
 
-log "mode=${MODE} execute=${EXECUTE}"
-log "control-planes=${CONTROL_PLANES}"
-log "workers=${WORKERS}"
-log "recovery-state-file=${RECOVERY_STATE_FILE}"
-log "recovery_pending=${RECOVERY_PENDING} startup_attempted=${STARTUP_ATTEMPTED_DURING_OUTAGE}"
-report_flux_source_state
-
-if [[ "${MODE}" == "shutdown" ]]; then
-  save_recovery_state 1 0
   if [[ "${SKIP_ETCD_SNAPSHOT}" -eq 0 ]]; then
-    take_etcd_snapshot "${CONTROL_PLANE_NODES[0]}"
+    # Timestamped snapshot taken on the first control plane through the helper.
+    # NOTE(review): 300 is presumably a timeout in seconds; confirm.
+    local ts
+    ts="$(date +%Y%m%d-%H%M%S)"
+    run_host_command_via_helper "${CONTROL_PLANE_NODES[0]}" "etcd-snapshot" 300 "/usr/local/bin/k3s etcd-snapshot save --name pre-shutdown-${ts}"
+    mark_checkpoint shutdown_snapshot_complete
   else
     warn "Skipping etcd snapshot by request."
   fi
 
   patch_flux_suspend_all true
   best_effort_scale_down_apps
+  mark_checkpoint shutdown_apps_scaled_down
 
   if [[ "${SKIP_DRAIN}" -eq 0 ]]; then
     best_effort_drain_workers "${DRAIN_TIMEOUT_SECONDS}" "${WORKER_NODES[@]}"
+    mark_checkpoint shutdown_workers_drained
   else
     warn "Skipping worker drain by request."
   fi
 
-  stop_workers_agents "${WORKER_NODES[@]}"
-  stop_control_planes "${CONTROL_PLANE_NODES[@]}"
+  # Workers are scheduled first, control planes afterwards.
+  # NOTE(review): the trailing arguments (k3s-agent 20 / k3s 45) look like a
+  # service name and a delay; confirm against schedule_host_shutdown_via_helper.
+  local node
+  for node in "${WORKER_NODES[@]}"; do
+    [[ -z "${node}" ]] && continue
+    schedule_host_shutdown_via_helper "${node}" k3s-agent 20
+  done
+  mark_checkpoint shutdown_workers_scheduled
 
-  log "Shutdown flow complete."
-  exit 0
-fi
+  for node in "${CONTROL_PLANE_NODES[@]}"; do
+    [[ -z "${node}" ]] && continue
+    schedule_host_shutdown_via_helper "${node}" k3s 45
+  done
+  mark_checkpoint shutdown_control_planes_scheduled
+  log "Shutdown actions scheduled on hosts."
+}
 
-# Startup mode
-if [[ "${RECOVERY_PENDING}" -eq 1 ]]; then
-  if ! ensure_minimum_battery_for_bootstrap; then
-    if [[ "${STARTUP_ATTEMPTED_DURING_OUTAGE}" -eq 1 ]]; then
-      emergency_shutdown_after_outage
+# Emergency path taken by startup_flow when the battery check fails and a
+# startup was already attempted during this outage.
+# Suspends Flux, scales apps down and drains workers on a best-effort basis
+# using EMERGENCY_DRAIN_TIMEOUT_SECONDS, then delegates to planned_shutdown.
+# Globals: EMERGENCY_DRAIN_TIMEOUT_SECONDS (read); WORKER_NODES (written).
+emergency_shutdown_after_outage() {
+  warn "Entering outage-aware emergency shutdown path due insufficient startup budget."
+  patch_flux_suspend_all true || true
+  best_effort_scale_down_apps || true
+  local workers_csv
+  workers_csv="$(discover_workers_csv 2>/dev/null || true)"
+  as_array_from_csv "${workers_csv}" WORKER_NODES
+  best_effort_drain_workers "${EMERGENCY_DRAIN_TIMEOUT_SECONDS}" "${WORKER_NODES[@]}" || true
+  # Workers were just drained with the emergency timeout. Shadow SKIP_DRAIN so
+  # planned_shutdown does not drain again with the normal
+  # DRAIN_TIMEOUT_SECONDS, which would defeat the emergency bound. Bash locals
+  # are dynamically scoped, so the callee sees this value; the global is
+  # unchanged once this function returns.
+  local SKIP_DRAIN=1
+  planned_shutdown
+}
+
+# Cold-start recovery flow. In order: battery gate (only when a recovery is
+# pending), wait for the Kubernetes API, optional Flux branch override,
+# optional local bootstrap when the Flux Git source is not Ready, then resume
+# Flux and reconcile, optionally prewarm the helper image, and clear the
+# recovery state.
+# Globals: RECOVERY_PENDING, FORCE_FLUX_BRANCH, SKIP_LOCAL_BOOTSTRAP,
+#          SKIP_HARBOR_BOOTSTRAP, SKIP_HARBOR_SEED, SKIP_HELPER_PREWARM (read);
+#          STARTUP_ATTEMPTED_DURING_OUTAGE (read and written).
+# Exits 1 when startup is deferred or the emergency shutdown path was taken.
+startup_flow() {
+  if [[ "${RECOVERY_PENDING}" -eq 1 ]]; then
+    if ! ensure_minimum_battery_for_bootstrap; then
+      # Battery check failed. If a startup was already attempted during this
+      # outage, shut down again; otherwise defer and record the attempt so
+      # the next failure takes the emergency path.
+      if [[ "${STARTUP_ATTEMPTED_DURING_OUTAGE}" -eq 1 ]]; then
+        emergency_shutdown_after_outage
+        exit 1
+      fi
+      warn "Startup deferred due low battery after recent outage; marking for second-outage fallback."
+      save_recovery_state 1 1 deferred_low_battery
       exit 1
     fi
-    warn "Startup deferred due low battery after recent outage; marking for second-outage fallback."
-    save_recovery_state 1 1
-    exit 1
+    # Battery check passed: record that a startup attempt is now in progress.
+    STARTUP_ATTEMPTED_DURING_OUTAGE=1
+    save_recovery_state 1 1 waiting_for_api
   fi
-  save_recovery_state 1 1
-fi
 
-start_control_planes "${CONTROL_PLANE_NODES[@]}"
-start_workers_agents "${WORKER_NODES[@]}"
+  # NOTE(review): the previous code called `wait_for_api 120 2`; this call
+  # passes no arguments and so relies on whatever defaults wait_for_api
+  # provides. Confirm.
+  if ! wait_for_api; then
+    die "Kubernetes API did not become reachable in time."
+  fi
+  mark_checkpoint startup_api_ready
 
-if ! wait_for_api 120 2; then
-  warn "Kubernetes API did not become reachable in time."
-  exit 1
-fi
+  # Optionally point the flux-system GitRepository at a specific branch.
+  if [[ -n "${FORCE_FLUX_BRANCH}" ]]; then
+    run kubectl -n flux-system patch gitrepository flux-system --type=merge -p "{\"spec\":{\"ref\":{\"branch\":\"${FORCE_FLUX_BRANCH}\"}}}"
+    mark_checkpoint startup_flux_branch_forced
+  fi
 
-if [[ -n "${FORCE_FLUX_BRANCH}" ]]; then
-  run kubectl -n flux-system patch gitrepository flux-system --type=merge \
-    -p "{\"spec\":{\"ref\":{\"branch\":\"${FORCE_FLUX_BRANCH}\"}}}"
-fi
+  if [[ "${SKIP_LOCAL_BOOTSTRAP}" -eq 0 ]]; then
+    # If the Flux Git source is not Ready, bootstrap the critical pieces from
+    # the local checkout first and check each stack before continuing.
+    if ! kubectl -n flux-system get gitrepository flux-system -o jsonpath='{.status.conditions[?(@.type=="Ready")].status}' 2>/dev/null | grep -q True; then
+      warn "Flux source not Ready; executing local bootstrap fallback path."
+      bootstrap_local_minimal
+      mark_checkpoint startup_local_bootstrap_complete
+      check_ingress_stack
+      check_longhorn_stack
+      check_vault_stack
+      check_postgres_stack
+      check_gitea_stack
 
-if [[ "${SKIP_LOCAL_BOOTSTRAP}" -eq 0 ]]; then
-  # If source is not ready, bootstrap critical pieces from local checkout first.
-  if ! kubectl -n flux-system get gitrepository flux-system -o jsonpath='{.status.conditions[?(@.type=="Ready")].status}' 2>/dev/null | grep -q True; then
-    warn "Flux source not Ready; executing local bootstrap fallback path."
-    bootstrap_local_minimal
-    if [[ "${SKIP_HARBOR_BOOTSTRAP}" -eq 0 ]]; then
-      bootstrap_local_harbor
-    else
-      warn "Skipping Harbor bootstrap fallback by request."
+      # Harbor fallback: skipped entirely when Harbor already reports ready.
+      # Otherwise optionally seed images from the offline bundle, then apply
+      # the Harbor kustomization and check it.
+      if [[ "${SKIP_HARBOR_BOOTSTRAP}" -eq 0 ]]; then
+        if harbor_is_ready; then
+          log "Harbor already healthy; skipping Harbor seed/bootstrap."
+        else
+          if [[ "${SKIP_HARBOR_SEED}" -eq 0 ]]; then
+            if [[ "${SKIP_HELPER_PREWARM}" -eq 0 ]]; then
+              prewarm_node_helper_image
+            fi
+            seed_harbor_images
+          else
+            warn "Skipping Harbor seed/import by request."
+          fi
+          bootstrap_local_harbor
+          mark_checkpoint startup_local_harbor_applied
+          check_harbor_stack
+          check_harbor_endpoint
+        fi
+      else
+        warn "Skipping Harbor bootstrap fallback by request."
+      fi
     fi
+  else
+    warn "Skipping local bootstrap fallback by request."
   fi
-else
-  warn "Skipping local bootstrap fallback by request."
-fi
 
-resume_flux_and_reconcile
-clear_recovery_state
-log "Startup flow complete."
+  resume_flux_and_reconcile
+  # Prewarm the helper image after reconciliation, independent of whether the
+  # Harbor seed path above already did so.
+  if [[ "${SKIP_HELPER_PREWARM}" -eq 0 ]]; then
+    prewarm_node_helper_image
+    mark_checkpoint startup_helper_prewarmed
+  fi
+  clear_recovery_state
+  log "Startup flow complete."
+}
+
+# "prepare" mode: require the Harbor bundle file to be present, then prewarm
+# the node helper image unless SKIP_HELPER_PREWARM is set.
+# Globals: HARBOR_BUNDLE_FILE, SKIP_HELPER_PREWARM (read).
+prepare_flow() {
+  if [[ ! -f "${HARBOR_BUNDLE_FILE}" ]]; then
+    die "Harbor bundle missing at ${HARBOR_BUNDLE_FILE}. Build and copy it to the canonical control host first."
+  fi
+
+  if [[ "${SKIP_HELPER_PREWARM}" -eq 0 ]]; then
+    prewarm_node_helper_image
+    mark_checkpoint prepare_helper_prewarmed
+  fi
+
+  log "Prepare flow complete."
+}
+
+# "harbor-seed" mode: require the Harbor bundle file, optionally prewarm the
+# node helper image, seed the Harbor images, then run the Harbor endpoint
+# check and the pull canary.
+# Globals: HARBOR_BUNDLE_FILE, SKIP_HELPER_PREWARM (read).
+harbor_seed_flow() {
+  if [[ ! -f "${HARBOR_BUNDLE_FILE}" ]]; then
+    die "Harbor bundle missing at ${HARBOR_BUNDLE_FILE}. Build and copy it to the canonical control host first."
+  fi
+
+  if [[ "${SKIP_HELPER_PREWARM}" -eq 0 ]]; then
+    prewarm_node_helper_image
+    mark_checkpoint harbor_seed_helper_prewarmed
+  fi
+
+  seed_harbor_images
+  check_harbor_endpoint
+  run_harbor_pull_canary
+
+  log "Harbor seed flow complete."
+}
+
+# Entry point: load persisted recovery state, log the effective settings and
+# the Flux source state, then dispatch on MODE.
+load_recovery_state
+log "mode=${MODE} execute=${EXECUTE}"
+log "recovery-state-file=${RECOVERY_STATE_FILE}"
+log "bundle-file=${HARBOR_BUNDLE_FILE}"
+log "node-helper-image=${NODE_HELPER_IMAGE}"
+report_flux_source_state
+
+case "${MODE}" in
+  status)
+    status_report
+    ;;
+  prepare)
+    prepare_flow
+    ;;
+  harbor-seed)
+    harbor_seed_flow
+    ;;
+  shutdown)
+    planned_shutdown
+    ;;
+  startup)
+    startup_flow
+    ;;
+  *)
+    # Fail loudly instead of exiting 0 without having done anything.
+    die "Unsupported mode: ${MODE}"
+    ;;
+esac