diff --git a/README.md b/README.md
index 15dc3778..859ac256 100644
--- a/README.md
+++ b/README.md
@@ -1,3 +1,80 @@
 # titan-iac
-Flux-managed Kubernetes cluster for bstein.dev services.
+Flux-managed Kubernetes cluster config for bstein.dev.
+
+Canonical repo URL:
+- `ssh://git@scm.bstein.dev:2242/bstein/titan-iac.git`
+
+## Why `ananke`
+
+`Ananke` is the Greek personification of inevitability and constraint, which is exactly what this tooling is built for:
+- power events happen
+- recovery windows are finite
+- bootstrap has to be deterministic
+
+The point is not clever automation. The point is boring, repeatable recovery.
+
+## Power Domains
+
+Two UPS domains matter during shutdown/startup drills:
+- `Statera`: `titan-23`, `titan-24`, `titan-jh`
+- `Pyrphoros`: all other nodes
+
+Default UPS checks in Ananke read from `Pyrphoros` (`pyrphoros@localhost`) unless overridden.
+
+## Breakglass
+
+If primary operator access is lost, breakglass access is available on the remote Magic Mirror.
+
+## Ananke Commands
+
+Ananke is the recovery orchestrator; the Flux desired-state source remains `titan-iac.git`.
+
+Use `titan-db` as the canonical control host. `tethys` (`titan-24`) is the backup operator host.
+
+From `titan-db`:
+
+```bash
+~/ananke-cluster-power status
+~/ananke-cluster-power prepare --execute
+~/ananke-cluster-power shutdown --execute --require-ups-battery
+~/ananke-cluster-power startup --execute --force-flux-branch main --require-ups-battery
+```
+
+From `tethys` / `titan-24` (delegating to `titan-db`):
+
+```bash
+~/ananke-tools/cluster_power_console.sh --delegate-host titan-db status
+~/ananke-tools/cluster_power_console.sh --delegate-host titan-db prepare --execute
+~/ananke-tools/cluster_power_console.sh --delegate-host titan-db shutdown --execute --require-ups-battery
+~/ananke-tools/cluster_power_console.sh --delegate-host titan-db startup --execute --force-flux-branch main --require-ups-battery
+```
+
+## Shutdown Modes
+
+`cluster_power_recovery.sh` supports two shutdown behaviors:
+- `--shutdown-mode host-poweroff` (default): graceful cluster shutdown plus scheduled host poweroff.
+- `--shutdown-mode cluster-only`: graceful cluster shutdown without host poweroff (stops `k3s` / `k3s-agent` only).
+
+## Startup Completion Rules
+
+Ananke startup is not “done” just because Flux reports green once.
+
+Startup now completes only after:
+- Flux source drift checks pass (expected URL and branch)
+- all non-optional Flux kustomizations report `Ready=True`
+- the external service checklist passes (defaults include Gitea, Grafana, Harbor)
+- generated ingress reachability checks pass (default accepted statuses: `200,301,302,307,308,401,403,404`)
+- a stability soak window passes with no `CrashLoopBackOff` / image-pull failures and the checklist still healthy
+
+If you intentionally need to correct the Flux source during recovery, use:
+- `--force-flux-url ssh://git@scm.bstein.dev:2242/bstein/titan-iac.git`
+- `--force-flux-branch main`
+
+`--force-flux-url` is breakglass-only and requires `--allow-flux-source-mutation`.
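+
+For example, a breakglass startup that corrects both the URL and the branch might look like this (a sketch composed from the flags above; adapt it to the incident at hand):
+
+```bash
+# Breakglass only: mutating the Flux source URL requires the explicit opt-in flag.
+~/ananke-cluster-power startup --execute \
+  --allow-flux-source-mutation \
+  --force-flux-url ssh://git@scm.bstein.dev:2242/bstein/titan-iac.git \
+  --force-flux-branch main \
+  --require-ups-battery
+```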
+ +The defaults live in: +- `scripts/bootstrap/recovery-config.env` + +Detailed runbook: +- `knowledge/runbooks/cluster-power-recovery.md` diff --git a/clusters/atlas/flux-system/gotk-sync.yaml b/clusters/atlas/flux-system/gotk-sync.yaml index 7593064c..a793e031 100644 --- a/clusters/atlas/flux-system/gotk-sync.yaml +++ b/clusters/atlas/flux-system/gotk-sync.yaml @@ -9,7 +9,7 @@ metadata: spec: interval: 1m0s ref: - branch: feature/atlasbot + branch: main secretRef: name: flux-system-gitea url: ssh://git@scm.bstein.dev:2242/bstein/titan-iac.git diff --git a/dockerfiles/Dockerfile.hecate-node-helper b/dockerfiles/Dockerfile.ananke-node-helper similarity index 100% rename from dockerfiles/Dockerfile.hecate-node-helper rename to dockerfiles/Dockerfile.ananke-node-helper diff --git a/knowledge/runbooks/cluster-power-recovery.md b/knowledge/runbooks/cluster-power-recovery.md index eb4ab395..67a10d3d 100644 --- a/knowledge/runbooks/cluster-power-recovery.md +++ b/knowledge/runbooks/cluster-power-recovery.md @@ -45,33 +45,37 @@ Execute examples Manual remote console examples - Canonical operator hosts: - `titan-db` - - `titan-24` + - `tethys` (`titan-24`) - Both hosts now have: - - `~/hecate-tools/cluster_power_recovery.sh` - - `~/hecate-tools/cluster_power_console.sh` - - `~/hecate-tools/bootstrap/recovery-config.env` - - `~/hecate-tools/bootstrap/harbor-bootstrap-images.txt` - - `~/hecate-tools/kubeconfig` - - `~/hecate-cluster-power` - - `~/bin/hecate-cluster-power` - - `~/hecate-repo/{infrastructure,services,scripts}` + - `~/ananke-tools/cluster_power_recovery.sh` + - `~/ananke-tools/cluster_power_console.sh` + - `~/ananke-tools/bootstrap/recovery-config.env` + - `~/ananke-tools/bootstrap/harbor-bootstrap-images.txt` + - `~/ananke-tools/kubeconfig` + - `~/ananke-cluster-power` + - `~/bin/ananke-cluster-power` + - `~/ananke-repo/{infrastructure,services,scripts}` - Both hosts also keep the Harbor bootstrap bundle at: - - `~/.local/share/hecate/bundles/harbor-bootstrap-v2.14.1-arm64.tar.zst` + - `~/.local/share/ananke/bundles/harbor-bootstrap-v2.14.1-arm64.tar.zst` - Remote usage: - `ssh titan-db` - - `~/hecate-cluster-power status` - - `~/hecate-cluster-power prepare --execute` - - `~/hecate-cluster-power shutdown --execute` - - `~/hecate-cluster-power startup --execute --force-flux-branch main` - - `ssh titan-24` - - `~/hecate-cluster-power status` - - `~/hecate-cluster-power prepare --execute` - - `~/hecate-cluster-power shutdown --execute` - - `~/hecate-cluster-power startup --execute --force-flux-branch main` + - `~/ananke-cluster-power status` + - `~/ananke-cluster-power prepare --execute` + - `~/ananke-cluster-power shutdown --execute` + - `~/ananke-cluster-power startup --execute --force-flux-branch main` + - `ssh tethys` + - `~/ananke-cluster-power status` + - `~/ananke-cluster-power prepare --execute` + - `~/ananke-cluster-power shutdown --execute` + - `~/ananke-cluster-power startup --execute --force-flux-branch main` Useful options +- `--shutdown-mode host-poweroff|cluster-only` - `--expected-flux-branch main` +- `--expected-flux-url ssh://git@scm.bstein.dev:2242/bstein/titan-iac.git` +- `--force-flux-url ssh://git@scm.bstein.dev:2242/bstein/titan-iac.git` - `--force-flux-branch main` +- `--allow-flux-source-mutation` (required with `--force-flux-url`; breakglass only) - `--skip-local-bootstrap` (not recommended for cold-start recovery) - `--skip-harbor-bootstrap` (skip the Harbor recovery stage if you know Harbor should stay deferred) - `--skip-harbor-seed` (skip bundle import if 
Harbor images are already cached on the target node) @@ -81,8 +85,12 @@ Useful options - `--require-ups-battery` - `--drain-timeout 180` - `--emergency-drain-timeout 45` -- `--recovery-state-file ~/.local/share/hecate/cluster_power_recovery.state` -- `--harbor-bundle-file ~/.local/share/hecate/bundles/harbor-bootstrap-v2.14.1-arm64.tar.zst` +- `--flux-ready-timeout 1200` +- `--startup-checklist-timeout 900` +- `--startup-stability-window 180` +- `--startup-stability-timeout 900` +- `--recovery-state-file ~/.local/share/ananke/cluster_power_recovery.state` +- `--harbor-bundle-file ~/.local/share/ananke/bundles/harbor-bootstrap-v2.14.1-arm64.tar.zst` Controlled drill checklist (recommended) - Operator host: use `titan-db` as canonical control host for the drill. @@ -91,37 +99,48 @@ Controlled drill checklist (recommended) - Confirm they will manually power cluster nodes back on after shutdown completes. - Confirm who will announce "all nodes powered on" to resume startup. - Preflight on `titan-db`: - - `mkdir -p ~/hecate-logs` - - `~/hecate-cluster-power status` and verify: + - `mkdir -p ~/ananke-logs` + - `~/ananke-cluster-power status` and verify: - `ups_host=pyrphoros@localhost` - `ups_battery` is numeric - `flux_source_ready=True` - Warm helper image just before shutdown: - - `~/hecate-cluster-power prepare --execute` + - `~/ananke-cluster-power prepare --execute` - Run in a persistent shell and capture logs: - - `tmux new -s hecate-drill` - - `script -q -a ~/hecate-logs/hecate-drill-$(date +%Y%m%d-%H%M%S).log` + - `tmux new -s ananke-drill` + - `script -q -a ~/ananke-logs/ananke-drill-$(date +%Y%m%d-%H%M%S).log` - Execute controlled shutdown with telemetry enforcement: - - `~/hecate-cluster-power shutdown --execute --require-ups-battery` + - `~/ananke-cluster-power shutdown --execute --require-ups-battery` - After on-site power-on confirmation, execute startup: - - `~/hecate-cluster-power startup --execute --force-flux-branch main --require-ups-battery` + - `~/ananke-cluster-power startup --execute --force-flux-branch main --require-ups-battery` - Post-check: - - `~/hecate-cluster-power status` + - `~/ananke-cluster-power status` - Verify critical services (`longhorn`, `vault`, `postgres`, `gitea`, `harbor`, `pegasus`) and no widespread pull/crash failures. Operational notes - The flow suspends Flux Kustomizations/HelmReleases during shutdown to prevent churn. +- Shutdown behavior is explicit: + - `host-poweroff` schedules host poweroff after service stop. + - `cluster-only` stops `k3s`/`k3s-agent` without powering hosts off. - Worker drain is no longer best-effort only. The script now escalates from normal drain, to `--force`, to `--disable-eviction` once the configured timeout is exhausted. -- During startup, if Flux source is not `Ready`, local bootstrap fallback is applied first using the repo snapshot under `~/hecate-repo`. +- Startup fails fast if Flux source URL/branch drift from expected values (unless branch override is explicitly requested with `--force-flux-branch`). +- Flux desired-state source remains `titan-iac.git`. Ananke orchestrates runtime recovery and should not be used as the normal Flux source repo. +- During startup, if Flux source is not `Ready`, local bootstrap fallback is applied first using the repo snapshot under `~/ananke-repo`. - Longhorn is reconciled before Vault/Postgres/Gitea so storage-backed services are not racing the volume layer. - Harbor is reconciled after the first critical stateful services. 
- Harbor bootstrap is now designed around a control-host bundle: - Build the Harbor bundle locally with `scripts/build_harbor_bootstrap_bundle.sh`. - - Stage it on the operator host at `~/.local/share/hecate/bundles/harbor-bootstrap-v2.14.1-arm64.tar.zst`. + - Stage it on the operator host at `~/.local/share/ananke/bundles/harbor-bootstrap-v2.14.1-arm64.tar.zst`. - Use `harbor-seed --execute` or a full `startup --execute` to stream/import that bundle onto `titan-05`. -- The Harbor bundle remains arm64-only because Harbor is pinned to arm64 nodes. The node-helper image is multi-arch because Hecate uses it across both arm64 and amd64 nodes during prepare/shutdown operations. -- Hecate uses a temporary privileged helper pod for host-side operations. The helper image is prewarmed with `prepare --execute` so later shutdown/startup steps do not stall on image pulls. -- The script persists outage state in `~/.local/state/cluster_power_recovery.state` by default. If startup is attempted during an outage window and power becomes unstable again, rerunning startup with insufficient UPS charge will flip into the emergency shutdown path instead of continuing to bootstrap. +- The Harbor bundle remains arm64-only because Harbor is pinned to arm64 nodes. The node-helper image is multi-arch because Ananke uses it across both arm64 and amd64 nodes during prepare/shutdown operations. +- Ananke uses a temporary privileged helper pod for host-side operations. The helper image is prewarmed with `prepare --execute` so later shutdown/startup steps do not stall on image pulls. +- The script persists outage state in `~/.local/share/ananke/cluster_power_recovery.state` by default. If startup is attempted during an outage window and power becomes unstable again, rerunning startup with insufficient UPS charge will flip into the emergency shutdown path instead of continuing to bootstrap. +- Startup completion is strict now: + - all non-optional Flux kustomizations must be `Ready=True` + - external service checklist must pass (defaults include Gitea, Grafana, Harbor) + - generated ingress reachability checks must pass (default accepted codes: `200,301,302,307,308,401,403,404`) + - stability soak must pass with no crashloop/pull-failure churn +- If Flux hits immutable one-off Job drift during reconcile, Ananke now attempts self-heal by pruning failed Flux-managed Jobs and retrying reconcile. - In dry-run mode, the script now skips the live API wait step so preview runs do not stall on an offline cluster. - Dry-run mode no longer mutates outage recovery state. 
 - `harbor-seed --execute` was validated by:
diff --git a/scripts/bootstrap/recovery-config.env b/scripts/bootstrap/recovery-config.env
index c2f789d9..a8bae234 100644
--- a/scripts/bootstrap/recovery-config.env
+++ b/scripts/bootstrap/recovery-config.env
@@ -1,14 +1,36 @@
 CANONICAL_CONTROL_HOST="titan-db"
 DEFAULT_FLUX_BRANCH="main"
-STATE_SUBDIR=".local/share/hecate"
+EXPECTED_FLUX_URL="ssh://git@scm.bstein.dev:2242/bstein/titan-iac.git"
+SHUTDOWN_MODE="host-poweroff"
+STATE_SUBDIR=".local/share/ananke"
 HARBOR_BUNDLE_BASENAME="harbor-bootstrap-v2.14.1-arm64.tar.zst"
-HARBOR_TARGET_NODE="titan-05"
-HARBOR_CANARY_NODE="titan-04"
+HARBOR_TARGET_NODE=""
+HARBOR_CANARY_NODE=""
+HARBOR_HOST_LABEL_KEY="ananke.bstein.dev/harbor-bootstrap"
 HARBOR_CANARY_IMAGE="registry.bstein.dev/bstein/kubectl:1.35.0"
-NODE_HELPER_IMAGE="registry.bstein.dev/bstein/hecate-node-helper:0.1.0"
+NODE_HELPER_IMAGE="registry.bstein.dev/bstein/ananke-node-helper:0.1.0"
 NODE_HELPER_NAMESPACE="maintenance"
 NODE_HELPER_SERVICE_ACCOUNT="default"
 REGISTRY_PULL_SECRET="harbor-regcred"
 BUNDLE_HTTP_PORT="8877"
 UPS_HOST="pyrphoros@localhost"
 UPS_BATTERY_KEY="battery.charge"
+FLUX_READY_TIMEOUT_SECONDS="1200"
+FLUX_READY_POLL_SECONDS="10"
+STARTUP_CHECKLIST_TIMEOUT_SECONDS="900"
+STARTUP_CHECKLIST_POLL_SECONDS="10"
+STARTUP_WORKLOAD_TIMEOUT_SECONDS="900"
+STARTUP_WORKLOAD_POLL_SECONDS="10"
+STARTUP_STABILITY_WINDOW_SECONDS="180"
+STARTUP_STABILITY_TIMEOUT_SECONDS="900"
+STARTUP_STABILITY_POLL_SECONDS="10"
+STARTUP_OPTIONAL_KUSTOMIZATIONS=""
+STARTUP_IGNORE_PODS_REGEX=""
+STARTUP_IGNORE_WORKLOADS_REGEX=""
+STARTUP_WORKLOAD_NAMESPACE_EXCLUDES_REGEX="^(kube-system|kube-public|kube-node-lease|flux-system)$"
+STARTUP_SERVICE_CHECK_TIMEOUT_SECONDS="10"
+STARTUP_INCLUDE_INGRESS_CHECKS="1"
+STARTUP_INGRESS_ALLOWED_STATUSES="200,301,302,307,308,401,403,404"
+STARTUP_IGNORE_INGRESS_HOSTS_REGEX=""
+STARTUP_INGRESS_CHECK_TIMEOUT_SECONDS="10"
+STARTUP_SERVICE_CHECKLIST='gitea|https://scm.bstein.dev/api/healthz|200|"status":"pass"||;grafana|https://metrics.bstein.dev/api/health|200|"database":"ok"||;harbor|https://registry.bstein.dev/v2/|200,401|||'
diff --git a/scripts/build_hecate_node_helper.sh b/scripts/build_ananke_node_helper.sh
similarity index 83%
rename from scripts/build_hecate_node_helper.sh
rename to scripts/build_ananke_node_helper.sh
index 148f51c5..8d03b22d 100755
--- a/scripts/build_hecate_node_helper.sh
+++ b/scripts/build_ananke_node_helper.sh
@@ -1,10 +1,10 @@
 #!/usr/bin/env bash
 set -euo pipefail
 
-IMAGE="registry.bstein.dev/bstein/hecate-node-helper:0.1.0"
+IMAGE="registry.bstein.dev/bstein/ananke-node-helper:0.1.0"
 DOCKER_CONFIG_PATH=""
 PLATFORMS="linux/amd64,linux/arm64"
-BUILDER_NAME="hecate-node-helper-builder"
+BUILDER_NAME="ananke-node-helper-builder"
 
 while [[ $# -gt 0 ]]; do
   case "$1" in
@@ -26,7 +26,7 @@ while [[ $# -gt 0 ]]; do
       ;;
     -h|--help)
       cat <<USAGE
-Usage: scripts/build_hecate_node_helper.sh [--image <image>] [--docker-config <path>] [--platforms <platforms>] [--builder <name>]
+Usage: scripts/build_ananke_node_helper.sh [--image <image>] [--docker-config <path>] [--platforms <platforms>] [--builder <name>]
 USAGE
       exit 0
       ;;
@@ -50,7 +50,7 @@ fi
 docker buildx inspect --bootstrap >/dev/null
 docker buildx build \
   --platform "${PLATFORMS}" \
-  -f dockerfiles/Dockerfile.hecate-node-helper \
+  -f dockerfiles/Dockerfile.ananke-node-helper \
   -t "${IMAGE}" \
   --push \
   .
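A note on the `STARTUP_SERVICE_CHECKLIST` value added above: judging from the parser in `cluster_power_recovery.sh` (`IFS='|' read -r name url expected body_must body_must_not insecure timeout`), each semicolon-separated entry carries up to seven pipe-delimited fields, with an empty `insecure` or `timeout` falling back to `0` and `STARTUP_SERVICE_CHECK_TIMEOUT_SECONDS`. A hypothetical extra entry might look like:

```bash
# Field order (inferred from check_startup_service_checklist_once):
#   name|url|allowed_statuses|body_must_contain|body_must_not_contain|insecure|timeout
# The vault host and health endpoint below are illustrative, not part of this repo.
STARTUP_SERVICE_CHECKLIST='gitea|https://scm.bstein.dev/api/healthz|200|"status":"pass"||;vault|https://vault.bstein.dev/v1/sys/health|200,429,473|||0|10'
```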
diff --git a/scripts/cluster_power_console.sh b/scripts/cluster_power_console.sh
index 6bc8cd7e..6d8ce679 100755
--- a/scripts/cluster_power_console.sh
+++ b/scripts/cluster_power_console.sh
@@ -7,11 +7,11 @@
 Usage:
   scripts/cluster_power_console.sh [--repo-dir <dir>] [--delegate-host <host>] [--allow-local] [recovery-script-options...]
 
 Purpose:
-  Friendly manual entrypoint for running Hecate from a remote console.
+  Friendly manual entrypoint for running Ananke from a remote console.
   The canonical control host is titan-db by default so bundle/state
   handling stays in one place.
 
 Defaults:
-  --repo-dir \$HOME/Development/titan-iac
+  --repo-dir \$HOME/Development/ananke (fallback: \$HOME/Development/titan-iac)
   --delegate-host titan-db
 
 Examples:
@@ -22,10 +22,14 @@
 USAGE
 }
 
-REPO_DIR="${HOME}/Development/titan-iac"
+if [[ -d "${HOME}/Development/ananke" ]]; then
+  REPO_DIR="${HOME}/Development/ananke"
+else
+  REPO_DIR="${HOME}/Development/titan-iac"
+fi
 DELEGATE_HOST="titan-db"
 ALLOW_LOCAL=0
-REMOTE_REPO_DIR="${HECATE_REMOTE_REPO_DIR:-}"
+REMOTE_REPO_DIR="${ANANKE_REMOTE_REPO_DIR:-}"
 
 while [[ $# -gt 0 ]]; do
   case "$1" in
@@ -73,6 +77,6 @@ fi
 quoted_args="$(printf '%q ' "$@")"
 remote_prefix=""
 if [[ -n "${REMOTE_REPO_DIR}" ]]; then
-  remote_prefix="HECATE_REPO_DIR=$(printf '%q' "${REMOTE_REPO_DIR}") "
+  remote_prefix="ANANKE_REPO_DIR=$(printf '%q' "${REMOTE_REPO_DIR}") "
 fi
-exec ssh -o BatchMode=yes -o ConnectTimeout=8 "${DELEGATE_HOST}" "${remote_prefix}~/hecate-tools/cluster_power_recovery.sh ${quoted_args}"
+exec ssh -o BatchMode=yes -o ConnectTimeout=8 "${DELEGATE_HOST}" "${remote_prefix}~/ananke-tools/cluster_power_recovery.sh ${quoted_args}"
diff --git a/scripts/cluster_power_recovery.sh b/scripts/cluster_power_recovery.sh
index 9efafff1..0081a0e7 100755
--- a/scripts/cluster_power_recovery.sh
+++ b/scripts/cluster_power_recovery.sh
@@ -2,7 +2,7 @@
 set -euo pipefail
 
 SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
-REPO_DIR="${HECATE_REPO_DIR:-$(cd "${SCRIPT_DIR}/.." && pwd)}"
+REPO_DIR="${ANANKE_REPO_DIR:-$(cd "${SCRIPT_DIR}/.." && pwd)}"
 BOOTSTRAP_DIR="${SCRIPT_DIR}/bootstrap"
 CONFIG_FILE="${BOOTSTRAP_DIR}/recovery-config.env"
 if [[ -f "${CONFIG_FILE}" ]]; then
@@ -20,7 +20,11 @@ Usage:
 
 Options:
   --execute                   Actually run commands (default is dry-run)
+  --shutdown-mode <mode>      Shutdown behavior: host-poweroff or cluster-only (default: ${SHUTDOWN_MODE:-host-poweroff})
   --expected-flux-branch <branch>  Expected Flux source branch during startup checks (default: ${DEFAULT_FLUX_BRANCH:-main})
+  --expected-flux-url <url>   Expected Flux source URL during startup checks
+  --allow-flux-source-mutation  Required to allow --force-flux-url during startup
+  --force-flux-url <url>      Startup: patch flux-system GitRepository URL to this value
   --force-flux-branch <branch>  Startup: patch flux-system GitRepository branch to this value
   --skip-etcd-snapshot        Shutdown: skip etcd snapshot before shutdown
   --skip-drain                Shutdown: skip worker drain during shutdown
@@ -32,15 +36,29 @@ Options:
   --ups-host <host>           UPS identifier for upsc (default: ups@localhost)
   --ups-battery-key <key>     UPS battery key for upsc (default: battery.charge)
   --recovery-state-file <path>  Recovery state file for outage-aware restart logic
+  --replica-snapshot-file <path>
+                              File used to persist workload replica snapshot across shutdown/startup
   --harbor-bundle-file <path>   Harbor bootstrap bundle on the control host
-  --harbor-target-node <node>   Node that should host Harbor during bootstrap (default: ${HARBOR_TARGET_NODE:-titan-05})
+  --harbor-target-node <node>   Node that should host Harbor during bootstrap (default: auto)
+  --harbor-canary-node <node>   Node used for Harbor pull canary (default: auto)
+  --harbor-host-label-key <key> Node label key used to pin Harbor bootstrap workloads (default: ${HARBOR_HOST_LABEL_KEY:-ananke.bstein.dev/harbor-bootstrap})
   --harbor-canary-image <image> Harbor-backed image used for pull canary (default: ${HARBOR_CANARY_IMAGE:-registry.bstein.dev/bstein/kubectl:1.35.0})
-  --node-helper-image <image>   Privileged helper image used for host operations (default: ${NODE_HELPER_IMAGE:-registry.bstein.dev/bstein/hecate-node-helper:0.1.0})
+  --node-helper-image <image>   Privileged helper image used for host operations (default: ${NODE_HELPER_IMAGE:-registry.bstein.dev/bstein/ananke-node-helper:0.1.0})
   --bundle-http-port <port>     Temporary HTTP port used to serve bootstrap bundles (default: ${BUNDLE_HTTP_PORT:-8877})
   --api-wait-timeout <seconds>  Startup: Kubernetes API wait timeout (default: 600)
   --drain-timeout <seconds>     Worker drain timeout for normal shutdown (default: 180)
   --emergency-drain-timeout <seconds>  Worker drain timeout for emergency fallback (default: 45)
+  --flux-ready-timeout <seconds>
+                              Startup: max time to wait for Flux kustomizations Ready (default: 1200)
+  --startup-checklist-timeout <seconds>
+                              Startup: max time to wait for external service checklist (default: 900)
+  --startup-workload-timeout <seconds>
+                              Startup: max time to wait for workload readiness checks (default: 900)
+  --startup-stability-window <seconds>
+                              Startup: continuous healthy window required before success (default: 180)
+  --startup-stability-timeout <seconds>
+                              Startup: max time allowed to achieve the healthy window (default: 900)
   --require-ups-battery       Hard-fail startup if UPS battery cannot be read
   -h, --help                  Show help
@@ -70,7 +88,11 @@ case "${MODE}" in
 esac
 
 EXECUTE=0
+SHUTDOWN_MODE="${SHUTDOWN_MODE:-host-poweroff}"
 EXPECTED_FLUX_BRANCH="${DEFAULT_FLUX_BRANCH:-main}"
+EXPECTED_FLUX_URL="${EXPECTED_FLUX_URL:-ssh://git@scm.bstein.dev:2242/bstein/titan-iac.git}"
+ALLOW_FLUX_SOURCE_MUTATION=0
+FORCE_FLUX_URL=""
 FORCE_FLUX_BRANCH=""
 SKIP_ETCD_SNAPSHOT=0
 SKIP_DRAIN=0
@@ -85,17 +107,41 @@ REQUIRE_UPS_BATTERY="${REQUIRE_UPS_BATTERY:-0}"
 DRAIN_TIMEOUT_SECONDS=180
EMERGENCY_DRAIN_TIMEOUT_SECONDS=45 API_WAIT_TIMEOUT_SECONDS=600 +FLUX_READY_TIMEOUT_SECONDS="${FLUX_READY_TIMEOUT_SECONDS:-1200}" +FLUX_READY_POLL_SECONDS="${FLUX_READY_POLL_SECONDS:-10}" +STARTUP_CHECKLIST_TIMEOUT_SECONDS="${STARTUP_CHECKLIST_TIMEOUT_SECONDS:-900}" +STARTUP_CHECKLIST_POLL_SECONDS="${STARTUP_CHECKLIST_POLL_SECONDS:-10}" +STARTUP_WORKLOAD_TIMEOUT_SECONDS="${STARTUP_WORKLOAD_TIMEOUT_SECONDS:-900}" +STARTUP_WORKLOAD_POLL_SECONDS="${STARTUP_WORKLOAD_POLL_SECONDS:-10}" +STARTUP_STABILITY_WINDOW_SECONDS="${STARTUP_STABILITY_WINDOW_SECONDS:-180}" +STARTUP_STABILITY_TIMEOUT_SECONDS="${STARTUP_STABILITY_TIMEOUT_SECONDS:-900}" +STARTUP_STABILITY_POLL_SECONDS="${STARTUP_STABILITY_POLL_SECONDS:-10}" +STARTUP_IGNORE_PODS_REGEX="${STARTUP_IGNORE_PODS_REGEX:-}" +STARTUP_IGNORE_WORKLOADS_REGEX="${STARTUP_IGNORE_WORKLOADS_REGEX:-}" +STARTUP_WORKLOAD_NAMESPACE_EXCLUDES_REGEX="${STARTUP_WORKLOAD_NAMESPACE_EXCLUDES_REGEX:-^(kube-system|kube-public|kube-node-lease|flux-system)$}" +STARTUP_OPTIONAL_KUSTOMIZATIONS="${STARTUP_OPTIONAL_KUSTOMIZATIONS:-}" +STARTUP_SERVICE_CHECK_TIMEOUT_SECONDS="${STARTUP_SERVICE_CHECK_TIMEOUT_SECONDS:-10}" +STARTUP_SERVICE_CHECKLIST="${STARTUP_SERVICE_CHECKLIST:-}" +STARTUP_INCLUDE_INGRESS_CHECKS="${STARTUP_INCLUDE_INGRESS_CHECKS:-1}" +STARTUP_INGRESS_ALLOWED_STATUSES="${STARTUP_INGRESS_ALLOWED_STATUSES:-200,301,302,307,308,401,403,404}" +STARTUP_IGNORE_INGRESS_HOSTS_REGEX="${STARTUP_IGNORE_INGRESS_HOSTS_REGEX:-}" +STARTUP_INGRESS_CHECK_TIMEOUT_SECONDS="${STARTUP_INGRESS_CHECK_TIMEOUT_SECONDS:-10}" +SHUTDOWN_NAMESPACE_EXCLUDES_REGEX="${SHUTDOWN_NAMESPACE_EXCLUDES_REGEX:-^(kube-system|kube-public|kube-node-lease|flux-system|traefik|metallb-system|cert-manager|longhorn-system|vault|postgres|maintenance)$}" BUNDLE_HTTP_PORT="${BUNDLE_HTTP_PORT:-8877}" -STATE_ROOT="${HOME}/${STATE_SUBDIR:-.local/share/hecate}" +STATE_ROOT="${HOME}/${STATE_SUBDIR:-.local/share/ananke}" RECOVERY_STATE_FILE="${STATE_ROOT}/cluster_power_recovery.state" +REPLICA_SNAPSHOT_FILE="${STATE_ROOT}/desired_workload_replicas.tsv" HARBOR_BUNDLE_FILE="${STATE_ROOT}/bundles/${HARBOR_BUNDLE_BASENAME:-harbor-bootstrap-v2.14.1-arm64.tar.zst}" -HARBOR_TARGET_NODE="${HARBOR_TARGET_NODE:-titan-05}" -HARBOR_CANARY_NODE="${HARBOR_CANARY_NODE:-titan-04}" +HARBOR_TARGET_NODE="${HARBOR_TARGET_NODE:-}" +HARBOR_CANARY_NODE="${HARBOR_CANARY_NODE:-}" +HARBOR_HOST_LABEL_KEY="${HARBOR_HOST_LABEL_KEY:-ananke.bstein.dev/harbor-bootstrap}" HARBOR_CANARY_IMAGE="${HARBOR_CANARY_IMAGE:-registry.bstein.dev/bstein/kubectl:1.35.0}" -NODE_HELPER_IMAGE="${NODE_HELPER_IMAGE:-registry.bstein.dev/bstein/hecate-node-helper:0.1.0}" +NODE_HELPER_IMAGE="${NODE_HELPER_IMAGE:-registry.bstein.dev/bstein/ananke-node-helper:0.1.0}" NODE_HELPER_NAMESPACE="${NODE_HELPER_NAMESPACE:-maintenance}" NODE_HELPER_SERVICE_ACCOUNT="${NODE_HELPER_SERVICE_ACCOUNT:-default}" +NODE_HELPER_PREWARM_DS="${NODE_HELPER_PREWARM_DS:-ananke-node-helper-prewarm}" REGISTRY_PULL_SECRET="${REGISTRY_PULL_SECRET:-harbor-regcred}" +KEEP_PREWARM_DAEMONSET=0 RECOVERY_PENDING=0 STARTUP_ATTEMPTED_DURING_OUTAGE=0 @@ -109,10 +155,26 @@ while [[ $# -gt 0 ]]; do EXECUTE=1 shift ;; + --shutdown-mode) + SHUTDOWN_MODE="${2:?missing shutdown mode}" + shift 2 + ;; --expected-flux-branch) EXPECTED_FLUX_BRANCH="${2:?missing branch}" shift 2 ;; + --expected-flux-url) + EXPECTED_FLUX_URL="${2:?missing flux url}" + shift 2 + ;; + --allow-flux-source-mutation) + ALLOW_FLUX_SOURCE_MUTATION=1 + shift + ;; + --force-flux-url) + FORCE_FLUX_URL="${2:?missing flux url}" + shift 2 + ;; 
--force-flux-branch) FORCE_FLUX_BRANCH="${2:?missing branch}" shift 2 @@ -161,6 +223,10 @@ while [[ $# -gt 0 ]]; do RECOVERY_STATE_FILE="${2:?missing state file path}" shift 2 ;; + --replica-snapshot-file) + REPLICA_SNAPSHOT_FILE="${2:?missing replica snapshot file path}" + shift 2 + ;; --harbor-bundle-file) HARBOR_BUNDLE_FILE="${2:?missing bundle file path}" shift 2 @@ -169,6 +235,14 @@ while [[ $# -gt 0 ]]; do HARBOR_TARGET_NODE="${2:?missing harbor target node}" shift 2 ;; + --harbor-canary-node) + HARBOR_CANARY_NODE="${2:?missing harbor canary node}" + shift 2 + ;; + --harbor-host-label-key) + HARBOR_HOST_LABEL_KEY="${2:?missing harbor host label key}" + shift 2 + ;; --harbor-canary-image) HARBOR_CANARY_IMAGE="${2:?missing canary image}" shift 2 @@ -185,6 +259,26 @@ while [[ $# -gt 0 ]]; do API_WAIT_TIMEOUT_SECONDS="${2:?missing api wait timeout}" shift 2 ;; + --flux-ready-timeout) + FLUX_READY_TIMEOUT_SECONDS="${2:?missing flux ready timeout}" + shift 2 + ;; + --startup-checklist-timeout) + STARTUP_CHECKLIST_TIMEOUT_SECONDS="${2:?missing startup checklist timeout}" + shift 2 + ;; + --startup-workload-timeout) + STARTUP_WORKLOAD_TIMEOUT_SECONDS="${2:?missing startup workload timeout}" + shift 2 + ;; + --startup-stability-window) + STARTUP_STABILITY_WINDOW_SECONDS="${2:?missing startup stability window}" + shift 2 + ;; + --startup-stability-timeout) + STARTUP_STABILITY_TIMEOUT_SECONDS="${2:?missing startup stability timeout}" + shift 2 + ;; --drain-timeout) DRAIN_TIMEOUT_SECONDS="${2:?missing drain timeout}" shift 2 @@ -205,6 +299,19 @@ while [[ $# -gt 0 ]]; do esac done +case "${SHUTDOWN_MODE}" in + host-poweroff|cluster-only) ;; + *) + echo "Invalid --shutdown-mode '${SHUTDOWN_MODE}'. Expected host-poweroff or cluster-only." >&2 + exit 1 + ;; +esac + +if [[ -n "${FORCE_FLUX_URL}" && "${ALLOW_FLUX_SOURCE_MUTATION}" -ne 1 ]]; then + echo "--force-flux-url requires --allow-flux-source-mutation (breakglass)." >&2 + exit 1 +fi + require_cmd() { local cmd="$1" if ! command -v "${cmd}" >/dev/null 2>&1; then @@ -363,12 +470,337 @@ report_flux_source_state() { [[ -n "${flux_url}" ]] && log "flux-source-url=${flux_url}" if [[ -n "${flux_branch}" ]]; then log "flux-source-branch=${flux_branch}" - if [[ "${MODE}" == "startup" && -z "${FORCE_FLUX_BRANCH}" && "${flux_branch}" != "${EXPECTED_FLUX_BRANCH}" ]]; then - warn "Flux source branch is '${flux_branch}'. Expected '${EXPECTED_FLUX_BRANCH}' for canonical recovery." - fi fi } +csv_has_value() { + local csv="$1" + local value="$2" + local needle=",${value}," + local haystack=",${csv}," + [[ "${haystack}" == *"${needle}"* ]] +} + +assert_flux_source_expected() { + if [[ "${EXECUTE}" -eq 0 ]]; then + log "DRY-RUN: skipping strict Flux source drift guard" + return 0 + fi + local flux_url flux_branch + flux_url="$(kubectl -n flux-system get gitrepository flux-system -o jsonpath='{.spec.url}' 2>/dev/null || true)" + flux_branch="$(kubectl -n flux-system get gitrepository flux-system -o jsonpath='{.spec.ref.branch}' 2>/dev/null || true)" + [[ -n "${flux_url}" ]] || die "Unable to read Flux source URL from flux-system/gitrepository." + [[ -n "${flux_branch}" ]] || die "Unable to read Flux source branch from flux-system/gitrepository." + + if [[ -n "${EXPECTED_FLUX_URL}" && "${flux_url}" != "${EXPECTED_FLUX_URL}" ]]; then + die "Flux source URL drift detected: got '${flux_url}', expected '${EXPECTED_FLUX_URL}'. Refusing startup." 
+  fi
+  if [[ -z "${FORCE_FLUX_BRANCH}" && "${flux_branch}" != "${EXPECTED_FLUX_BRANCH}" ]]; then
+    die "Flux source branch drift detected: got '${flux_branch}', expected '${EXPECTED_FLUX_BRANCH}'. Use --force-flux-branch to correct."
+  fi
+}
+
+kustomization_is_optional() {
+  local name="$1"
+  [[ -n "${STARTUP_OPTIONAL_KUSTOMIZATIONS}" ]] || return 1
+  csv_has_value "${STARTUP_OPTIONAL_KUSTOMIZATIONS}" "${name}"
+}
+
+list_not_ready_kustomizations() {
+  local rows line name ready message
+  rows="$(kubectl -n flux-system get kustomizations.kustomize.toolkit.fluxcd.io \
+    -o 'custom-columns=NAME:.metadata.name,READY:.status.conditions[?(@.type=="Ready")].status,MESSAGE:.status.conditions[?(@.type=="Ready")].message' \
+    --no-headers 2>/dev/null || true)"
+  [[ -n "${rows}" ]] || return 0
+  while IFS= read -r line; do
+    [[ -n "${line}" ]] || continue
+    name="$(awk '{print $1}' <<< "${line}")"
+    ready="$(awk '{print $2}' <<< "${line}")"
+    message="${line#${name} }"
+    message="${message#${ready} }"
+    if kustomization_is_optional "${name}"; then
+      continue
+    fi
+    if [[ "${ready}" != "True" ]]; then
+      printf '%s|%s\n' "${name}" "${message}"
+    fi
+  done <<< "${rows}"
+}
+
+trigger_flux_reconcile_all() {
+  local now
+  now="$(date --iso-8601=seconds)"
+  run kubectl -n flux-system annotate kustomizations.kustomize.toolkit.fluxcd.io --all reconcile.fluxcd.io/requestedAt="${now}" --overwrite
+  if command -v flux >/dev/null 2>&1; then
+    run flux reconcile source git flux-system -n flux-system --timeout=3m
+  fi
+}
+
+heal_failed_flux_jobs() {
+  local rows line ns name failed flux_owner helm_owner healed
+  healed=0
+  rows="$(kubectl get jobs.batch -A \
+    -o custom-columns=NS:.metadata.namespace,NAME:.metadata.name,FAILED:.status.failed,FLUX_OWNER:.metadata.labels.kustomize\\.toolkit\\.fluxcd\\.io/name,HELM_OWNER:.metadata.labels.helm\\.toolkit\\.fluxcd\\.io/name \
+    --no-headers 2>/dev/null || true)"
+  [[ -n "${rows}" ]] || return 1
+
+  while IFS= read -r line; do
+    [[ -n "${line}" ]] || continue
+    ns="$(awk '{print $1}' <<< "${line}")"
+    name="$(awk '{print $2}' <<< "${line}")"
+    failed="$(awk '{print $3}' <<< "${line}")"
+    flux_owner="$(awk '{print $4}' <<< "${line}")"
+    helm_owner="$(awk '{print $5}' <<< "${line}")"
+    [[ "${failed}" != "<none>" ]] || continue
+    [[ "${failed}" =~ ^[0-9]+$ ]] || continue
+    (( failed > 0 )) || continue
+    if [[ "${flux_owner}" == "<none>" && "${helm_owner}" == "<none>" ]]; then
+      continue
+    fi
+    warn "Deleting failed Flux-managed Job ${ns}/${name} to heal immutable-template drift."
+    run kubectl -n "${ns}" delete job "${name}" --ignore-not-found
+    healed=1
+  done <<< "${rows}"
+
+  (( healed == 1 ))
+}
+
+wait_for_flux_kustomizations_ready() {
+  if [[ "${EXECUTE}" -eq 0 ]]; then
+    log "DRY-RUN: skipping wait for all Flux kustomizations Ready"
+    return 0
+  fi
+  local start now not_ready immutable_hits
+  start="$(date +%s)"
+  immutable_hits=0
+  while true; do
+    not_ready="$(list_not_ready_kustomizations || true)"
+    if [[ -z "${not_ready}" ]]; then
+      log "flux-kustomizations=all-ready"
+      return 0
+    fi
+
+    log "flux-kustomizations-not-ready:"
+    while IFS= read -r line; do
+      [[ -n "${line}" ]] || continue
+      log "  ${line}"
+    done <<< "${not_ready}"
+
+    if grep -Eqi 'immutable|field is immutable|cannot patch.*Job|Job.*invalid' <<< "${not_ready}"; then
+      if (( immutable_hits < 3 )); then
+        immutable_hits=$(( immutable_hits + 1 ))
+        warn "Detected immutable Job failure signal in Flux status. Attempting automated Job cleanup (${immutable_hits}/3)."
+ if heal_failed_flux_jobs; then + trigger_flux_reconcile_all + fi + fi + fi + + now="$(date +%s)" + if (( now - start >= FLUX_READY_TIMEOUT_SECONDS )); then + die "Timed out waiting for Flux kustomizations Ready after ${FLUX_READY_TIMEOUT_SECONDS}s." + fi + sleep "${FLUX_READY_POLL_SECONDS}" + done +} + +default_startup_service_checklist() { + cat <<'CHECKS' +gitea|https://scm.bstein.dev/api/healthz|200|"status":"pass"|| +grafana|https://metrics.bstein.dev/api/health|200|"database":"ok"|| +harbor|https://registry.bstein.dev/v2/|200,401||| +CHECKS +} + +list_ingress_hosts() { + kubectl get ingress -A -o jsonpath='{range .items[*]}{range .spec.rules[*]}{.host}{"\n"}{end}{end}' 2>/dev/null \ + | sed '/^[[:space:]]*$/d' \ + | sort -u +} + +generated_ingress_service_checks() { + local host + while IFS= read -r host; do + [[ -n "${host}" ]] || continue + if [[ -n "${STARTUP_IGNORE_INGRESS_HOSTS_REGEX}" ]] && [[ "${host}" =~ ${STARTUP_IGNORE_INGRESS_HOSTS_REGEX} ]]; then + continue + fi + printf 'ingress-%s|https://%s/|%s|||0|%s\n' "${host}" "${host}" "${STARTUP_INGRESS_ALLOWED_STATUSES}" "${STARTUP_INGRESS_CHECK_TIMEOUT_SECONDS}" + done < <(list_ingress_hosts) +} + +startup_service_checklist_rows() { + local base + if [[ -n "${STARTUP_SERVICE_CHECKLIST}" ]]; then + base="$(printf '%s' "${STARTUP_SERVICE_CHECKLIST}" | tr ';' '\n')" + else + base="$(default_startup_service_checklist)" + fi + + printf '%s\n' "${base}" | sed '/^[[:space:]]*$/d' + if [[ "${STARTUP_INCLUDE_INGRESS_CHECKS}" == "1" || "${STARTUP_INCLUDE_INGRESS_CHECKS}" == "true" ]]; then + generated_ingress_service_checks + fi +} + +service_status_allowed() { + local expected_csv="$1" + local got="$2" + local token + IFS=',' read -r -a _statuses <<< "${expected_csv}" + for token in "${_statuses[@]}"; do + if [[ "${token}" == "${got}" ]]; then + return 0 + fi + done + return 1 +} + +check_startup_service_checklist_once() { + local rows row name url expected body_must body_must_not insecure timeout code rc + local body_file failures + failures=0 + rows="$(startup_service_checklist_rows)" + while IFS= read -r row; do + [[ -n "${row}" ]] || continue + IFS='|' read -r name url expected body_must body_must_not insecure timeout <<< "${row}" + [[ -n "${name}" && -n "${url}" && -n "${expected}" ]] || continue + [[ -n "${insecure}" ]] || insecure=0 + [[ -n "${timeout}" ]] || timeout="${STARTUP_SERVICE_CHECK_TIMEOUT_SECONDS}" + body_file="$(mktemp)" + rc=0 + if [[ "${insecure}" == "1" || "${insecure}" == "true" ]]; then + code="$(curl -ksS --max-time "${timeout}" -o "${body_file}" -w '%{http_code}' "${url}" || rc=$?)" + else + code="$(curl -sS --max-time "${timeout}" -o "${body_file}" -w '%{http_code}' "${url}" || rc=$?)" + fi + if (( rc != 0 )); then + warn "startup-check ${name}: request failed (rc=${rc}) url=${url}" + failures=1 + rm -f "${body_file}" + continue + fi + if ! service_status_allowed "${expected}" "${code}"; then + warn "startup-check ${name}: expected status ${expected}, got ${code} url=${url}" + failures=1 + rm -f "${body_file}" + continue + fi + if [[ -n "${body_must}" ]] && ! 
grep -Fq -- "${body_must}" "${body_file}"; then + warn "startup-check ${name}: missing required body fragment '${body_must}'" + failures=1 + rm -f "${body_file}" + continue + fi + if [[ -n "${body_must_not}" ]] && grep -Fq -- "${body_must_not}" "${body_file}"; then + warn "startup-check ${name}: forbidden body fragment '${body_must_not}' present" + failures=1 + rm -f "${body_file}" + continue + fi + rm -f "${body_file}" + done <<< "${rows}" + (( failures == 0 )) +} + +wait_for_startup_service_checklist() { + if [[ "${EXECUTE}" -eq 0 ]]; then + log "DRY-RUN: skipping startup external service checklist wait" + return 0 + fi + local start now checklist_ok workloads_ok + start="$(date +%s)" + while true; do + checklist_ok=0 + workloads_ok=0 + if check_startup_service_checklist_once; then + checklist_ok=1 + fi + if list_unhealthy_workloads | sed '/^[[:space:]]*$/d' | grep -q .; then + workloads_ok=0 + else + workloads_ok=1 + fi + if (( checklist_ok == 1 && workloads_ok == 1 )); then + log "startup-checklist=all-passed" + return 0 + fi + if (( workloads_ok == 0 )); then + warn "startup-checklist: workloads are not fully ready yet." + fi + now="$(date +%s)" + if (( now - start >= STARTUP_CHECKLIST_TIMEOUT_SECONDS )); then + die "Timed out waiting for startup external checklist after ${STARTUP_CHECKLIST_TIMEOUT_SECONDS}s." + fi + sleep "${STARTUP_CHECKLIST_POLL_SECONDS}" + done +} + +collect_unstable_pods() { + local rows + rows="$(kubectl get pods -A --no-headers 2>/dev/null \ + | awk '$4 ~ /(CrashLoopBackOff|ImagePullBackOff|ErrImagePull|CreateContainerConfigError|RunContainerError|InvalidImageName)/ {print $1 "/" $2 "|" $4}' || true)" + if [[ -n "${STARTUP_IGNORE_PODS_REGEX}" ]]; then + rows="$(printf '%s\n' "${rows}" | grep -Ev "${STARTUP_IGNORE_PODS_REGEX}" || true)" + fi + printf '%s' "${rows}" +} + +wait_for_startup_stability_window() { + if [[ "${EXECUTE}" -eq 0 ]]; then + log "DRY-RUN: skipping startup stability window" + return 0 + fi + local hard_deadline stable_since now unstable pods not_ready unhealthy_workloads + stable_since="$(date +%s)" + hard_deadline=$(( stable_since + STARTUP_STABILITY_TIMEOUT_SECONDS )) + while true; do + unstable=0 + not_ready="$(list_not_ready_kustomizations || true)" + if [[ -n "${not_ready}" ]]; then + unstable=1 + warn "stability-window: Flux kustomizations not ready." + fi + pods="$(collect_unstable_pods || true)" + if [[ -n "${pods}" ]]; then + unstable=1 + warn "stability-window: unstable pods detected." + while IFS= read -r line; do + [[ -n "${line}" ]] || continue + warn " ${line}" + done <<< "${pods}" + fi + if ! check_startup_service_checklist_once; then + unstable=1 + warn "stability-window: external service checklist failed." + fi + unhealthy_workloads="$(list_unhealthy_workloads || true)" + if [[ -n "${unhealthy_workloads}" ]]; then + unstable=1 + warn "stability-window: workloads not fully ready." + while IFS= read -r line; do + [[ -n "${line}" ]] || continue + warn " ${line}" + done <<< "${unhealthy_workloads}" + fi + + now="$(date +%s)" + if (( unstable == 0 )); then + if (( now - stable_since >= STARTUP_STABILITY_WINDOW_SECONDS )); then + log "startup-stability-window=passed (${STARTUP_STABILITY_WINDOW_SECONDS}s)" + return 0 + fi + else + stable_since="${now}" + fi + + if (( now >= hard_deadline )); then + die "Timed out waiting for startup stability window (${STARTUP_STABILITY_WINDOW_SECONDS}s healthy) within ${STARTUP_STABILITY_TIMEOUT_SECONDS}s." 
+ fi + sleep "${STARTUP_STABILITY_POLL_SECONDS}" + done +} + wait_for_api() { local attempts=$(( API_WAIT_TIMEOUT_SECONDS / 5 )) if (( attempts < 1 )); then @@ -410,13 +842,22 @@ patch_flux_suspend_all() { done <<< "${hr_list}" } +shutdown_namespace_excluded() { + local ns="$1" + [[ "${ns}" =~ ${SHUTDOWN_NAMESPACE_EXCLUDES_REGEX} ]] +} + +startup_workload_namespace_excluded() { + local ns="$1" + [[ "${ns}" =~ ${STARTUP_WORKLOAD_NAMESPACE_EXCLUDES_REGEX} ]] +} + best_effort_scale_down_apps() { - local excludes='^(kube-system|kube-public|kube-node-lease|flux-system|traefik|metallb-system|cert-manager|longhorn-system|vault|postgres|maintenance)$' - local ns_list + local ns_list ns ns_list="$(kubectl get ns -o jsonpath='{range .items[*]}{.metadata.name}{"\n"}{end}')" while IFS= read -r ns; do [[ -z "${ns}" ]] && continue - if [[ "${ns}" =~ ${excludes} ]]; then + if shutdown_namespace_excluded "${ns}"; then continue fi run_shell "kubectl -n ${ns} scale deployment --all --replicas=0 || true" @@ -424,14 +865,186 @@ best_effort_scale_down_apps() { done <<< "${ns_list}" } +save_workload_replica_snapshot() { + local rows line ns kind name replicas + if [[ "${EXECUTE}" -eq 0 ]]; then + log "DRY-RUN: save workload replica snapshot to ${REPLICA_SNAPSHOT_FILE}" + return 0 + fi + rows="$( + { + kubectl get deployment -A -o jsonpath='{range .items[*]}{.metadata.namespace}{"\tdeployment\t"}{.metadata.name}{"\t"}{.spec.replicas}{"\n"}{end}' 2>/dev/null || true + kubectl get statefulset -A -o jsonpath='{range .items[*]}{.metadata.namespace}{"\tstatefulset\t"}{.metadata.name}{"\t"}{.spec.replicas}{"\n"}{end}' 2>/dev/null || true + } | sed '/^[[:space:]]*$/d' + )" + mkdir -p "$(dirname "${REPLICA_SNAPSHOT_FILE}")" + : > "${REPLICA_SNAPSHOT_FILE}" + while IFS=$'\t' read -r ns kind name replicas; do + [[ -n "${ns}" && -n "${kind}" && -n "${name}" && -n "${replicas}" ]] || continue + shutdown_namespace_excluded "${ns}" && continue + [[ "${replicas}" =~ ^[0-9]+$ ]] || continue + (( replicas > 0 )) || continue + printf '%s\t%s\t%s\t%s\n' "${ns}" "${kind}" "${name}" "${replicas}" >> "${REPLICA_SNAPSHOT_FILE}" + done <<< "${rows}" + log "replica-snapshot-file=${REPLICA_SNAPSHOT_FILE}" + log "replica-snapshot-count=$(wc -l < "${REPLICA_SNAPSHOT_FILE}" | tr -d ' ')" +} + +restore_workload_replica_snapshot() { + local ns kind name desired current + if [[ "${RECOVERY_PENDING}" -ne 1 ]]; then + log "Skipping replica restore because recovery_pending=0." + return 0 + fi + if [[ ! -f "${REPLICA_SNAPSHOT_FILE}" ]]; then + warn "Replica snapshot file not found at ${REPLICA_SNAPSHOT_FILE}; skipping replica restore." 
+ return 0 + fi + while IFS=$'\t' read -r ns kind name desired; do + [[ -n "${ns}" && -n "${kind}" && -n "${name}" && -n "${desired}" ]] || continue + [[ "${desired}" =~ ^[0-9]+$ ]] || continue + (( desired > 0 )) || continue + current="$(kubectl -n "${ns}" get "${kind}" "${name}" -o jsonpath='{.spec.replicas}' 2>/dev/null || true)" + [[ -n "${current}" ]] || continue + [[ "${current}" =~ ^[0-9]+$ ]] || current=0 + if (( current == desired )); then + continue + fi + run kubectl -n "${ns}" scale "${kind}" "${name}" --replicas="${desired}" + done < "${REPLICA_SNAPSHOT_FILE}" + mark_checkpoint startup_replicas_restored +} + +list_unhealthy_workloads() { + local rows line ns name desired ready available + rows="$(kubectl get deployment -A -o custom-columns=NS:.metadata.namespace,NAME:.metadata.name,DESIRED:.spec.replicas,READY:.status.readyReplicas,AVAILABLE:.status.availableReplicas --no-headers 2>/dev/null || true)" + while IFS= read -r line; do + [[ -n "${line}" ]] || continue + ns="$(awk '{print $1}' <<< "${line}")" + name="$(awk '{print $2}' <<< "${line}")" + desired="$(awk '{print $3}' <<< "${line}")" + ready="$(awk '{print $4}' <<< "${line}")" + available="$(awk '{print $5}' <<< "${line}")" + startup_workload_namespace_excluded "${ns}" && continue + [[ -n "${STARTUP_IGNORE_WORKLOADS_REGEX}" && "${ns}/${name}" =~ ${STARTUP_IGNORE_WORKLOADS_REGEX} ]] && continue + [[ "${desired}" =~ ^[0-9]+$ ]] || desired=0 + [[ "${ready}" =~ ^[0-9]+$ ]] || ready=0 + [[ "${available}" =~ ^[0-9]+$ ]] || available=0 + (( desired > 0 )) || continue + if (( ready < desired || available < desired )); then + printf '%s/deployment/%s|ready=%s available=%s desired=%s\n' "${ns}" "${name}" "${ready}" "${available}" "${desired}" + fi + done <<< "${rows}" + + rows="$(kubectl get statefulset -A -o custom-columns=NS:.metadata.namespace,NAME:.metadata.name,DESIRED:.spec.replicas,READY:.status.readyReplicas --no-headers 2>/dev/null || true)" + while IFS= read -r line; do + [[ -n "${line}" ]] || continue + ns="$(awk '{print $1}' <<< "${line}")" + name="$(awk '{print $2}' <<< "${line}")" + desired="$(awk '{print $3}' <<< "${line}")" + ready="$(awk '{print $4}' <<< "${line}")" + startup_workload_namespace_excluded "${ns}" && continue + [[ -n "${STARTUP_IGNORE_WORKLOADS_REGEX}" && "${ns}/${name}" =~ ${STARTUP_IGNORE_WORKLOADS_REGEX} ]] && continue + [[ "${desired}" =~ ^[0-9]+$ ]] || desired=0 + [[ "${ready}" =~ ^[0-9]+$ ]] || ready=0 + (( desired > 0 )) || continue + if (( ready < desired )); then + printf '%s/statefulset/%s|ready=%s desired=%s\n' "${ns}" "${name}" "${ready}" "${desired}" + fi + done <<< "${rows}" +} + +wait_for_startup_workloads_ready() { + if [[ "${EXECUTE}" -eq 0 ]]; then + log "DRY-RUN: skipping startup workload readiness checks" + return 0 + fi + local start now unhealthy + start="$(date +%s)" + while true; do + unhealthy="$(list_unhealthy_workloads || true)" + if [[ -z "${unhealthy}" ]]; then + log "startup-workloads=all-ready" + return 0 + fi + warn "startup-workloads-not-ready:" + while IFS= read -r line; do + [[ -n "${line}" ]] || continue + warn " ${line}" + done <<< "${unhealthy}" + now="$(date +%s)" + if (( now - start >= STARTUP_WORKLOAD_TIMEOUT_SECONDS )); then + die "Timed out waiting for startup workloads Ready after ${STARTUP_WORKLOAD_TIMEOUT_SECONDS}s." 
+    fi
+    sleep "${STARTUP_WORKLOAD_POLL_SECONDS}"
+  done
+}
+
 discover_workers_csv() {
   kubectl get nodes \
-    -o custom-columns=NAME:.metadata.name,CP:.metadata.labels.node-role\\.kubernetes\\.io/control-plane,MASTER:.metadata.labels.node-role\\.kubernetes\\.io/master \
+    -o 'custom-columns=NAME:.metadata.name,CP:.metadata.labels.node-role\.kubernetes\.io/control-plane,MASTER:.metadata.labels.node-role\.kubernetes\.io/master,READY:.status.conditions[?(@.type=="Ready")].status' \
     --no-headers \
-    | awk '$2=="<none>" && $3=="<none>" {print $1}' \
+    | awk '$2=="<none>" && $3=="<none>" && $4=="True" {print $1}' \
     | paste -sd, -
 }
 
+node_is_ready() {
+  local node="$1"
+  [[ -n "${node}" ]] || return 1
+  local ready
+  ready="$(kubectl get node "${node}" -o jsonpath='{range .status.conditions[?(@.type=="Ready")]}{.status}{end}' 2>/dev/null || true)"
+  [[ "${ready}" == "True" ]]
+}
+
+select_ready_arm64_worker() {
+  local rows node
+  rows="$(kubectl get nodes -o 'custom-columns=NAME:.metadata.name,ARCH:.metadata.labels.kubernetes\.io/arch,WORKER:.metadata.labels.node-role\.kubernetes\.io/worker,HARDWARE:.metadata.labels.hardware,READY:.status.conditions[?(@.type=="Ready")].status' --no-headers 2>/dev/null || true)"
+  [[ -n "${rows}" ]] || return 1
+  node="$(printf '%s\n' "${rows}" | awk '$2=="arm64" && $3=="true" && $4=="rpi5" && $5=="True" {print $1; exit}')"
+  if [[ -n "${node}" ]]; then
+    printf '%s' "${node}"
+    return 0
+  fi
+  node="$(printf '%s\n' "${rows}" | awk '$2=="arm64" && $3=="true" && $4=="rpi4" && $5=="True" {print $1; exit}')"
+  if [[ -n "${node}" ]]; then
+    printf '%s' "${node}"
+    return 0
+  fi
+  node="$(printf '%s\n' "${rows}" | awk '$2=="arm64" && $3=="true" && $5=="True" {print $1; exit}')"
+  if [[ -n "${node}" ]]; then
+    printf '%s' "${node}"
+    return 0
+  fi
+  return 1
+}
+
+ensure_harbor_target_node() {
+  if node_is_ready "${HARBOR_TARGET_NODE}"; then
+    return 0
+  fi
+  local fallback
+  fallback="$(select_ready_arm64_worker || true)"
+  [[ -n "${fallback}" ]] || die "No Ready arm64 worker available for Harbor bootstrap target."
+  if [[ -n "${HARBOR_TARGET_NODE}" ]]; then
+    warn "Configured harbor target node '${HARBOR_TARGET_NODE}' is not Ready; using '${fallback}' instead."
+  else
+    log "harbor-target-node auto-selected: ${fallback}"
+  fi
+  HARBOR_TARGET_NODE="${fallback}"
+}
+
+ensure_harbor_host_label() {
+  [[ -n "${HARBOR_TARGET_NODE}" ]] || die "Harbor target node is not set."
+  local labeled node
+  labeled="$(kubectl get nodes -l "${HARBOR_HOST_LABEL_KEY}=true" -o jsonpath='{range .items[*]}{.metadata.name}{"\n"}{end}' 2>/dev/null || true)"
+  while IFS= read -r node; do
+    [[ -z "${node}" ]] && continue
+    [[ "${node}" == "${HARBOR_TARGET_NODE}" ]] && continue
+    run kubectl label node "${node}" "${HARBOR_HOST_LABEL_KEY}-"
+  done <<< "${labeled}"
+  run kubectl label node "${HARBOR_TARGET_NODE}" "${HARBOR_HOST_LABEL_KEY}=true" --overwrite
+}
+
 as_array_from_csv() {
   local csv="$1"
   local out_var="$2"
@@ -557,9 +1170,18 @@ harbor_is_ready() {
 }
 
 run_harbor_pull_canary() {
-  local pod="hecate-harbor-canary"
+  local pod="ananke-harbor-canary"
+  local canary_node="${HARBOR_CANARY_NODE}"
+  if ! node_is_ready "${canary_node}"; then
+    ensure_harbor_target_node
+    canary_node="${HARBOR_TARGET_NODE}"
+    if [[ -n "${HARBOR_CANARY_NODE}" ]]; then
+      warn "Configured harbor canary node '${HARBOR_CANARY_NODE}' is not Ready; using '${canary_node}'."
+ fi + HARBOR_CANARY_NODE="${canary_node}" + fi if [[ "${EXECUTE}" -eq 0 ]]; then - log "DRY-RUN: create Harbor pull canary pod with ${HARBOR_CANARY_IMAGE} on ${HARBOR_CANARY_NODE}" + log "DRY-RUN: create Harbor pull canary pod with ${HARBOR_CANARY_IMAGE} on ${canary_node}" return 0 fi timeout 20 kubectl -n "${NODE_HELPER_NAMESPACE}" delete pod "${pod}" --ignore-not-found --wait=false >/dev/null 2>&1 || true @@ -570,7 +1192,7 @@ metadata: name: ${pod} namespace: ${NODE_HELPER_NAMESPACE} spec: - nodeName: ${HARBOR_CANARY_NODE} + nodeName: ${canary_node} restartPolicy: Never imagePullSecrets: - name: ${REGISTRY_PULL_SECRET} @@ -597,7 +1219,7 @@ run_helper_pod() { local purpose="$2" local timeout_seconds="$3" local script_content="$4" - local pod="hecate-$(sanitize_name "${purpose}")-$(date +%H%M%S)" + local pod="ananke-$(sanitize_name "${purpose}")-$(date +%H%M%S)" local encoded_script encoded_script="$(printf '%s' "${script_content}" | base64 -w0)" @@ -631,9 +1253,9 @@ spec: command: ["/bin/bash", "-ceu"] args: - | - printf '%s' '${encoded_script}' | base64 -d >/tmp/hecate-step.sh - chmod +x /tmp/hecate-step.sh - /tmp/hecate-step.sh + printf '%s' '${encoded_script}' | base64 -d >/tmp/ananke-step.sh + chmod +x /tmp/ananke-step.sh + /tmp/ananke-step.sh POD if ! wait_for_pod_phase "${NODE_HELPER_NAMESPACE}" "${pod}" Succeeded "${timeout_seconds}"; then @@ -663,21 +1285,65 @@ SCRIPT run_helper_pod "${node}" "${purpose}" "${timeout_seconds}" "${script_content}" } +run_host_command_via_prewarm_pod() { + local node="$1" + local host_command="$2" + local pod encoded_command + pod="$(kubectl -n "${NODE_HELPER_NAMESPACE}" get pods -l app="${NODE_HELPER_PREWARM_DS}" --field-selector "spec.nodeName=${node}" -o jsonpath='{.items[0].metadata.name}' 2>/dev/null || true)" + if [[ -z "${pod}" ]]; then + return 1 + fi + encoded_command="$(printf '%s' "${host_command}" | base64 -w0)" + if [[ "${EXECUTE}" -eq 0 ]]; then + log "DRY-RUN: helper exec via ${pod} on ${node}" + return 0 + fi + run kubectl -n "${NODE_HELPER_NAMESPACE}" exec "${pod}" -- /bin/bash -ceu "HOST_COMMAND=\$(printf '%s' '${encoded_command}' | base64 -d); nsenter --target 1 --mount --uts --ipc --net --pid /bin/sh -ceu \"\${HOST_COMMAND}\"" +} + schedule_host_shutdown_via_helper() { local node="$1" local service_name="$2" local delay_seconds="$3" local host_command - host_command="/usr/bin/systemd-run --unit hecate-shutdown-${service_name} --on-active=${delay_seconds}s /bin/sh -lc '/usr/bin/systemctl stop ${service_name} || true; /usr/bin/systemctl poweroff || true'" + host_command="/usr/bin/systemd-run --unit ananke-shutdown-${service_name} --on-active=${delay_seconds}s /bin/sh -lc '/usr/bin/systemctl stop ${service_name} || true; /usr/bin/systemctl poweroff || true'" + if run_host_command_via_prewarm_pod "${node}" "${host_command}"; then + return 0 + fi run_host_command_via_helper "${node}" "shutdown-${node}-${service_name}" 120 "${host_command}" } +schedule_host_service_stop_via_helper() { + local node="$1" + local service_name="$2" + local delay_seconds="$3" + local host_command + host_command="/usr/bin/systemd-run --unit ananke-stop-${service_name} --on-active=${delay_seconds}s /bin/sh -lc '/usr/bin/systemctl stop ${service_name} || true'" + if run_host_command_via_prewarm_pod "${node}" "${host_command}"; then + return 0 + fi + run_host_command_via_helper "${node}" "stop-${node}-${service_name}" 120 "${host_command}" +} + prewarm_node_helper_image() { - local name="hecate-node-helper-prewarm" + local name="${NODE_HELPER_PREWARM_DS}" + 
  local ready_nodes node
+  local node_affinity_block=""
   if [[ "${EXECUTE}" -eq 0 ]]; then
     log "DRY-RUN: prewarm ${NODE_HELPER_IMAGE} via temporary DaemonSet"
     return 0
   fi
+  ready_nodes="$(kubectl get nodes -o 'custom-columns=NAME:.metadata.name,READY:.status.conditions[?(@.type=="Ready")].status' --no-headers 2>/dev/null | awk '$2=="True" {print $1}' || true)"
+  if [[ -n "${ready_nodes}" ]]; then
+    node_affinity_block=$'      affinity:\n        nodeAffinity:\n          requiredDuringSchedulingIgnoredDuringExecution:\n            nodeSelectorTerms:\n            - matchExpressions:\n              - key: kubernetes.io/hostname\n                operator: In\n                values:'
+    while IFS= read -r node; do
+      [[ -z "${node}" ]] && continue
+      node_affinity_block+=$'\n'"                - ${node}"
+    done <<< "${ready_nodes}"
+    log "node-helper-prewarm-targets=$(printf '%s' "${ready_nodes}" | paste -sd, -)"
+  else
+    warn "Unable to detect Ready nodes for prewarm targeting; continuing without node affinity."
+  fi
   cat </dev/null 2>&1 || true
+      if [[ "${KEEP_PREWARM_DAEMONSET}" -eq 0 ]]; then
+        kubectl -n "${NODE_HELPER_NAMESPACE}" delete ds "${name}" --ignore-not-found >/dev/null 2>&1 || true
+      else
+        log "Keeping ${name} DaemonSet running for shutdown helper exec path."
+      fi
       return 0
     fi
     sleep 2
@@ -722,6 +1393,14 @@ DS
   die "Timed out prewarming node helper image ${NODE_HELPER_IMAGE}"
 }
 
+cleanup_prewarm_daemonset() {
+  if [[ "${EXECUTE}" -eq 0 ]]; then
+    log "DRY-RUN: cleanup ${NODE_HELPER_PREWARM_DS} DaemonSet"
+    return 0
+  fi
+  kubectl -n "${NODE_HELPER_NAMESPACE}" delete ds "${NODE_HELPER_PREWARM_DS}" --ignore-not-found >/dev/null 2>&1 || true
+}
+
 start_bundle_server() {
   [[ -f "${HARBOR_BUNDLE_FILE}" ]] || die "Harbor bundle not found at ${HARBOR_BUNDLE_FILE}"
   require_cmd python3
@@ -732,7 +1411,7 @@ start_bundle_server() {
     log "DRY-RUN: serve ${bundle_name} from ${bundle_dir} on port ${BUNDLE_HTTP_PORT}"
     return 0
   fi
-  python3 -m http.server "${BUNDLE_HTTP_PORT}" --bind 0.0.0.0 --directory "${bundle_dir}" >/tmp/hecate-bundle-server.log 2>&1 &
+  python3 -m http.server "${BUNDLE_HTTP_PORT}" --bind 0.0.0.0 --directory "${bundle_dir}" >/tmp/ananke-bundle-server.log 2>&1 &
   BUNDLE_SERVER_PID=$!
   for _ in $(seq 1 20); do
     if curl -fsS "http://127.0.0.1:${BUNDLE_HTTP_PORT}/${bundle_name}" >/dev/null 2>&1; then
@@ -740,7 +1419,7 @@
     fi
     sleep 1
   done
-  die "Temporary bundle server did not become ready; see /tmp/hecate-bundle-server.log"
+  die "Temporary bundle server did not become ready; see /tmp/ananke-bundle-server.log"
 }
 
 stop_bundle_server() {
@@ -762,6 +1441,8 @@ control_host_ip() {
 seed_harbor_images() {
   local images_text control_ip bundle_name script_content seed_rc=0
   [[ -f "${HARBOR_BUNDLE_FILE}" ]] || die "Harbor bundle not found at ${HARBOR_BUNDLE_FILE}"
+  ensure_harbor_target_node
+  ensure_harbor_host_label
   images_text="$(sed '/^[[:space:]]*#/d;/^[[:space:]]*$/d' "${BOOTSTRAP_DIR}/harbor-bootstrap-images.txt")"
   [[ -n "${images_text}" ]] || die "No Harbor images listed in ${BOOTSTRAP_DIR}/harbor-bootstrap-images.txt"
   bundle_name="$(basename "${HARBOR_BUNDLE_FILE}")"
@@ -803,6 +1484,34 @@ bootstrap_local_harbor() {
   apply_kustomization services/harbor
 }
 
+reconcile_kustomization_with_self_heal() {
+  local item="$1"
+  if [[ "${EXECUTE}" -eq 0 ]]; then
+    run flux reconcile kustomization "${item}" -n flux-system --with-source --timeout=15m
+    return 0
+  fi
+  local attempt output rc
+  for attempt in 1 2; do
+    set +e
+    output="$(flux reconcile kustomization "${item}" -n flux-system --with-source --timeout=15m 2>&1)"
+    rc=$?
+    set -e
+    if (( rc == 0 )); then
+      [[ -n "${output}" ]] && printf '%s\n' "${output}"
+      return 0
+    fi
+    [[ -n "${output}" ]] && printf '%s\n' "${output}" >&2
+    if (( attempt == 1 )) && grep -Eqi 'immutable|field is immutable|cannot patch.*Job|Job.*invalid' <<< "${output}"; then
+      warn "Flux reconcile for '${item}' failed due to immutable Job/template signal. Attempting self-heal."
+      heal_failed_flux_jobs || true
+      trigger_flux_reconcile_all || true
+      sleep 5
+      continue
+    fi
+    return "${rc}"
+  done
+}
+
 reconcile_stage() {
   local stage_name="$1"
   shift
@@ -814,7 +1523,7 @@ reconcile_stage() {
   fi
   local item
   for item in "$@"; do
-    run flux reconcile kustomization "${item}" -n flux-system --with-source --timeout=15m
+    reconcile_kustomization_with_self_heal "${item}"
   done
   mark_checkpoint "reconciled_${stage_name}"
 }
@@ -838,23 +1547,59 @@ resume_flux_and_reconcile() {
 }
 
 status_report() {
-  local battery flux_ready harbor_code workers
+  local battery flux_ready flux_url flux_branch flux_url_drift flux_branch_drift harbor_code workers ingress_hosts_count
+  local effective_target effective_canary
+  local labeled_nodes
   battery="$(read_ups_battery || true)"
   flux_ready="$(kubectl -n flux-system get gitrepository flux-system -o jsonpath='{.status.conditions[?(@.type=="Ready")].status}' 2>/dev/null || true)"
+  flux_url="$(kubectl -n flux-system get gitrepository flux-system -o jsonpath='{.spec.url}' 2>/dev/null || true)"
+  flux_branch="$(kubectl -n flux-system get gitrepository flux-system -o jsonpath='{.spec.ref.branch}' 2>/dev/null || true)"
+  flux_url_drift=false
+  flux_branch_drift=false
+  if [[ -n "${EXPECTED_FLUX_URL}" && -n "${flux_url}" && "${flux_url}" != "${EXPECTED_FLUX_URL}" ]]; then
+    flux_url_drift=true
+  fi
+  if [[ -n "${EXPECTED_FLUX_BRANCH}" && -n "${flux_branch}" && "${flux_branch}" != "${EXPECTED_FLUX_BRANCH}" ]]; then
+    flux_branch_drift=true
+  fi
+  ingress_hosts_count="$(list_ingress_hosts | sed '/^[[:space:]]*$/d' | wc -l | tr -d ' ')"
   harbor_code="$(curl -ksS -o /dev/null -w '%{http_code}' https://registry.bstein.dev/v2/ || true)"
   workers="$(discover_workers_csv 2>/dev/null || true)"
+  effective_target="${HARBOR_TARGET_NODE}"
+  if ! node_is_ready "${effective_target}"; then
+    effective_target="$(select_ready_arm64_worker || true)"
+  fi
+  effective_canary="${HARBOR_CANARY_NODE}"
+  if ! 
node_is_ready "${effective_canary}"; then + effective_canary="${effective_target}" + fi echo "mode=status" + echo "shutdown_mode=${SHUTDOWN_MODE}" echo "bundle_file=${HARBOR_BUNDLE_FILE}" echo "bundle_present=$([[ -f "${HARBOR_BUNDLE_FILE}" ]] && echo true || echo false)" + echo "replica_snapshot_file=${REPLICA_SNAPSHOT_FILE}" + echo "replica_snapshot_present=$([[ -f "${REPLICA_SNAPSHOT_FILE}" ]] && echo true || echo false)" echo "node_helper_image=${NODE_HELPER_IMAGE}" - echo "harbor_target_node=${HARBOR_TARGET_NODE}" + echo "harbor_target_node=${effective_target:-unknown}" + echo "harbor_canary_node=${effective_canary:-unknown}" + labeled_nodes="$(kubectl get nodes -l "${HARBOR_HOST_LABEL_KEY}=true" -o jsonpath='{range .items[*]}{.metadata.name}{","}{end}' 2>/dev/null || true)" + labeled_nodes="${labeled_nodes%,}" + echo "harbor_host_label_key=${HARBOR_HOST_LABEL_KEY}" + echo "harbor_host_label_nodes=${labeled_nodes:-none}" echo "workers=${workers}" echo "recovery_pending=${RECOVERY_PENDING}" echo "startup_attempted=${STARTUP_ATTEMPTED_DURING_OUTAGE}" echo "last_checkpoint=${LAST_CHECKPOINT}" echo "ups_host=${UPS_HOST_IN_USE:-${UPS_HOST}}" echo "ups_battery=${battery:-unknown}" + echo "flux_source_expected_url=${EXPECTED_FLUX_URL}" + echo "flux_source_expected_branch=${EXPECTED_FLUX_BRANCH}" + echo "flux_source_actual_url=${flux_url:-unknown}" + echo "flux_source_actual_branch=${flux_branch:-unknown}" + echo "flux_source_url_drift=${flux_url_drift}" + echo "flux_source_branch_drift=${flux_branch_drift}" echo "flux_source_ready=${flux_ready:-unknown}" + echo "ingress_hosts_count=${ingress_hosts_count}" echo "harbor_http=${harbor_code:-unknown}" kubectl get ingressclass traefik >/dev/null 2>&1 && echo "traefik_ingressclass=true" || echo "traefik_ingressclass=false" kubectl -n traefik get deploy traefik >/dev/null 2>&1 && echo "traefik_deploy=true" || echo "traefik_deploy=false" @@ -876,6 +1621,7 @@ planned_shutdown() { save_recovery_state 1 0 shutdown_started if [[ "${SKIP_HELPER_PREWARM}" -eq 0 ]]; then + KEEP_PREWARM_DAEMONSET=1 prewarm_node_helper_image mark_checkpoint shutdown_helper_prewarmed fi @@ -889,6 +1635,9 @@ planned_shutdown() { warn "Skipping etcd snapshot by request." fi + save_workload_replica_snapshot + mark_checkpoint shutdown_replicas_snapshot + patch_flux_suspend_all true best_effort_scale_down_apps mark_checkpoint shutdown_apps_scaled_down @@ -901,18 +1650,39 @@ planned_shutdown() { fi local node + if [[ "${SHUTDOWN_MODE}" == "cluster-only" ]]; then + warn "shutdown-mode=cluster-only: stopping k3s services only; host poweroff is disabled." + else + log "shutdown-mode=host-poweroff: scheduling host poweroff after service stop." + fi + for node in "${WORKER_NODES[@]}"; do [[ -z "${node}" ]] && continue - schedule_host_shutdown_via_helper "${node}" k3s-agent 20 + if [[ "${SHUTDOWN_MODE}" == "cluster-only" ]]; then + schedule_host_service_stop_via_helper "${node}" k3s-agent 20 + else + schedule_host_shutdown_via_helper "${node}" k3s-agent 20 + fi done mark_checkpoint shutdown_workers_scheduled for node in "${CONTROL_PLANE_NODES[@]}"; do [[ -z "${node}" ]] && continue - schedule_host_shutdown_via_helper "${node}" k3s 45 + if [[ "${SHUTDOWN_MODE}" == "cluster-only" ]]; then + schedule_host_service_stop_via_helper "${node}" k3s 45 + else + schedule_host_shutdown_via_helper "${node}" k3s 45 + fi done + if [[ "${SKIP_HELPER_PREWARM}" -eq 0 ]]; then + cleanup_prewarm_daemonset + fi mark_checkpoint shutdown_control_planes_scheduled - log "Shutdown actions scheduled on hosts." 
+ if [[ "${SHUTDOWN_MODE}" == "cluster-only" ]]; then + log "Cluster-only shutdown actions scheduled (hosts remain powered on)." + else + log "Shutdown + host poweroff actions scheduled on hosts." + fi } emergency_shutdown_after_outage() { @@ -946,11 +1716,23 @@ startup_flow() { fi mark_checkpoint startup_api_ready + ensure_harbor_target_node + ensure_harbor_host_label + mark_checkpoint startup_harbor_host_labeled + + if [[ -n "${FORCE_FLUX_URL}" ]]; then + warn "Breakglass: forcing Flux source URL to '${FORCE_FLUX_URL}'." + run kubectl -n flux-system patch gitrepository flux-system --type=merge -p "{\"spec\":{\"url\":\"${FORCE_FLUX_URL}\"}}" + mark_checkpoint startup_flux_url_forced + fi + if [[ -n "${FORCE_FLUX_BRANCH}" ]]; then run kubectl -n flux-system patch gitrepository flux-system --type=merge -p "{\"spec\":{\"ref\":{\"branch\":\"${FORCE_FLUX_BRANCH}\"}}}" mark_checkpoint startup_flux_branch_forced fi + assert_flux_source_expected + if [[ "${SKIP_LOCAL_BOOTSTRAP}" -eq 0 ]]; then if ! kubectl -n flux-system get gitrepository flux-system -o jsonpath='{.status.conditions[?(@.type=="Ready")].status}' 2>/dev/null | grep -q True; then warn "Flux source not Ready; executing local bootstrap fallback path." @@ -988,6 +1770,11 @@ startup_flow() { fi resume_flux_and_reconcile + wait_for_flux_kustomizations_ready + restore_workload_replica_snapshot + wait_for_startup_workloads_ready + wait_for_startup_service_checklist + wait_for_startup_stability_window if [[ "${SKIP_HELPER_PREWARM}" -eq 0 ]]; then prewarm_node_helper_image mark_checkpoint startup_helper_prewarmed @@ -998,6 +1785,9 @@ startup_flow() { prepare_flow() { [[ -f "${HARBOR_BUNDLE_FILE}" ]] || die "Harbor bundle missing at ${HARBOR_BUNDLE_FILE}. Build and copy it to the canonical control host first." + ensure_harbor_target_node + ensure_harbor_host_label + mark_checkpoint prepare_harbor_host_labeled if [[ "${SKIP_HELPER_PREWARM}" -eq 0 ]]; then prewarm_node_helper_image mark_checkpoint prepare_helper_prewarmed @@ -1019,9 +1809,16 @@ harbor_seed_flow() { load_recovery_state log "mode=${MODE} execute=${EXECUTE}" +log "shutdown-mode=${SHUTDOWN_MODE}" log "recovery-state-file=${RECOVERY_STATE_FILE}" log "bundle-file=${HARBOR_BUNDLE_FILE}" log "node-helper-image=${NODE_HELPER_IMAGE}" +log "harbor-target-node-config=${HARBOR_TARGET_NODE:-auto}" +log "harbor-canary-node-config=${HARBOR_CANARY_NODE:-auto}" +log "harbor-host-label-key=${HARBOR_HOST_LABEL_KEY}" +log "expected-flux-url=${EXPECTED_FLUX_URL}" +log "expected-flux-branch=${EXPECTED_FLUX_BRANCH}" +log "startup-optional-kustomizations=${STARTUP_OPTIONAL_KUSTOMIZATIONS:-none}" report_flux_source_state case "${MODE}" in