recovery: unblock harbor cold start and add power console

2026-04-06 00:22:54 -03:00 · 2026-04-06 00:22:54 -03:00 · 99bd68f61b
commit 99bd68f61b
parent a097c36718
11 changed files with 743 additions and 41 deletions
--- a/clusters/atlas/flux-system/applications/gitea/kustomization.yaml
+++ b/clusters/atlas/flux-system/applications/gitea/kustomization.yaml
@ -14,6 +14,7 @@ spec:
    name: flux-system
    namespace: flux-system
  dependsOn:
+    - name: longhorn
    - name: vault
    - name: postgres
  wait: true
--- a/clusters/atlas/flux-system/applications/harbor/kustomization.yaml
+++ b/clusters/atlas/flux-system/applications/harbor/kustomization.yaml
@ -16,3 +16,6 @@ spec:
  wait: false
  dependsOn:
    - name: core
+    - name: longhorn
+    - name: vault
+    - name: postgres
--- a/clusters/atlas/flux-system/applications/keycloak/kustomization.yaml
+++ b/clusters/atlas/flux-system/applications/keycloak/kustomization.yaml
@ -13,6 +13,7 @@ spec:
  path: ./services/keycloak
  targetNamespace: sso
  dependsOn:
+    - name: longhorn
    - name: vault
    - name: postgres
  timeout: 2m
--- a/clusters/atlas/flux-system/applications/vault/kustomization.yaml
+++ b/clusters/atlas/flux-system/applications/vault/kustomization.yaml
@ -15,4 +15,5 @@ spec:
  prune: true
  wait: true
  dependsOn:
+    - name: longhorn
    - name: helm
--- a/clusters/atlas/flux-system/platform/postgres/kustomization.yaml
+++ b/clusters/atlas/flux-system/platform/postgres/kustomization.yaml
@ -14,6 +14,7 @@ spec:
    name: flux-system
  targetNamespace: postgres
  dependsOn:
+    - name: longhorn
    - name: vault
    - name: vault-csi
  healthChecks:
--- a/knowledge/runbooks/cluster-power-recovery.md
+++ b/knowledge/runbooks/cluster-power-recovery.md
@ -0,0 +1,66 @@
+Atlas Cluster Power Recovery (Graceful Shutdown/Startup)
+
+Purpose
+- Provide a safe operator flow for planned power events and cold-boot recovery.
+- Avoid the Flux/Gitea bootstrap deadlock by using a local bootstrap fallback path.
+- Refuse bootstrap when UPS charge is too low, and fall back to fast shutdown if a second outage hits mid-recovery.
+
+Bootstrapping risk to remember
+- Flux source is Git over SSH to `scm.bstein.dev` (Gitea).
+- Gitea itself is a Flux-managed workload and depends on storage + database.
+- Harbor is also critical, but it is not part of the first recovery stage because Harbor currently serves its own runtime images.
+- On cold boot, if Flux cannot fetch source before Gitea is up, reconciliation can stall.
+- Recovery path: bring control plane and workers up, then locally apply minimal platform stack (`core -> helm -> longhorn -> metallb -> traefik -> vault-csi -> vault-injector -> vault -> postgres -> gitea`), then resume/reconcile Flux. Harbor is a later recovery stage after storage, Vault, Postgres, and Gitea are back.
+
+Script
+- `scripts/cluster_power_recovery.sh`
+- `scripts/cluster_power_console.sh`
+- Modes:
+  - `shutdown`
+  - `startup`
+- Default is dry-run. Add `--execute` to actually perform actions.
+
+Dry-run examples
+- Shutdown preview:
+  - `scripts/cluster_power_recovery.sh shutdown --skip-etcd-snapshot --skip-drain`
+- Startup preview:
+  - `scripts/cluster_power_recovery.sh startup`
+
+Execute examples
+- Planned shutdown:
+  - `scripts/cluster_power_recovery.sh shutdown --execute`
+- Planned startup (canonical branch):
+  - `scripts/cluster_power_recovery.sh startup --execute --force-flux-branch main`
+
+Manual remote console examples
+- From `titan-24` with a local checkout:
+  - `~/Development/titan-iac/scripts/cluster_power_console.sh shutdown --execute`
+  - `~/Development/titan-iac/scripts/cluster_power_console.sh startup --execute --force-flux-branch main`
+- From `titan-db`, if the recovery script in the local checkout is not executable or `kubectl` is not installed, the console wrapper delegates over SSH to `titan-24`:
+  - `~/Development/titan-iac/scripts/cluster_power_console.sh --delegate-host titan-24 shutdown --execute`
+  - `~/Development/titan-iac/scripts/cluster_power_console.sh --delegate-host titan-24 startup --execute --force-flux-branch main`
+
+Useful options
+- `--control-planes titan-0a,titan-0b,titan-0c`
+- `--workers <csv>` (otherwise the script tries API discovery first, then falls back to the static atlas worker inventory)
+- `--expected-flux-branch main`
+- `--force-flux-branch main`
+- `--skip-local-bootstrap` (not recommended for cold-start recovery)
+- `--skip-harbor-bootstrap` (skip the Harbor recovery stage if you know Harbor should stay deferred)
+- `--min-startup-battery 35`
+- `--ups-host ups@localhost`
+- `--require-ups-battery`
+- `--drain-timeout 180`
+- `--emergency-drain-timeout 45`
+- `--recovery-state-file ~/.local/state/cluster_power_recovery.state`
+
+Operational notes
+- The flow suspends Flux Kustomizations/HelmReleases during shutdown to prevent churn.
+- Worker drain is no longer best-effort only. The script now escalates from normal drain, to `--force`, to `--disable-eviction` once the configured timeout is exhausted.
+- During startup, if Flux source is not `Ready`, local bootstrap fallback is applied first.
+- Longhorn is reconciled before Vault/Postgres/Gitea so storage-backed services are not racing the volume layer.
+- Harbor is reconciled after the first critical stateful services. Treat Harbor bootstrap as requiring either cached Harbor runtime images on the scheduled node or a separate bootstrap source for those images.
+- The script persists outage state in `~/.local/state/cluster_power_recovery.state` by default. If startup is attempted during an outage window and power becomes unstable again, rerunning startup with insufficient UPS charge will flip into the emergency shutdown path instead of continuing to bootstrap.
+- In dry-run mode, the script now skips the live API wait step so preview runs do not stall on an offline cluster.
+- After bootstrap, Flux resources are resumed and reconciled.
+- Keep this runbook aligned with `clusters/atlas/flux-system/gotk-sync.yaml`.
--- a/scripts/cluster_power_console.sh
+++ b/scripts/cluster_power_console.sh
@ -0,0 +1,68 @@
+#!/usr/bin/env bash
+set -euo pipefail
+
+# Print the console wrapper's synopsis, defaults, and examples to stdout.
+# The heredoc delimiter is quoted, so `$HOME` below is printed literally.
+usage() {
+  cat <<'USAGE'
+Usage:
+  scripts/cluster_power_console.sh [--repo-dir <path>] [--delegate-host <host>] <shutdown|startup> [recovery-script-options...]
+
+Purpose:
+  Friendly manual entrypoint for running cluster power recovery from a remote console.
+  If the repo checkout exists locally, run the recovery script here.
+  Otherwise, delegate to another host that has the repo checkout.
+
+Defaults:
+  --repo-dir       $HOME/Development/titan-iac
+  --delegate-host  titan-24
+
+Examples:
+  scripts/cluster_power_console.sh shutdown --execute
+  scripts/cluster_power_console.sh startup --execute --force-flux-branch main
+  scripts/cluster_power_console.sh --delegate-host titan-24 shutdown --execute
+USAGE
+}
+
+# Defaults; both can be overridden by the wrapper options parsed below.
+REPO_DIR="${HOME}/Development/titan-iac"
+DELEGATE_HOST="titan-24"
+
+# Consume wrapper-only options. The first unrecognised argument (normally the
+# shutdown/startup mode) ends parsing and is forwarded with everything after it.
+while [[ $# -gt 0 ]]; do
+  case "$1" in
+    --repo-dir|--delegate-host)
+      # Without this check a trailing option makes `shift 2` fail and, under
+      # `set -e`, the script exits with no message at all.
+      if [[ $# -lt 2 ]]; then
+        echo "cluster-power-console: option $1 requires a value" >&2
+        usage
+        exit 1
+      fi
+      if [[ "$1" == "--repo-dir" ]]; then
+        REPO_DIR="$2"
+      else
+        # An explicit empty value disables delegation (checked further down).
+        DELEGATE_HOST="$2"
+      fi
+      shift 2
+      ;;
+    -h|--help)
+      usage
+      exit 0
+      ;;
+    *)
+      break
+      ;;
+  esac
+done
+
+if [[ $# -lt 1 ]]; then
+  usage
+  exit 1
+fi
+
+LOCAL_SCRIPT="${REPO_DIR}/scripts/cluster_power_recovery.sh"
+
+# Prefer running locally when both the recovery script and kubectl are usable.
+if [[ -x "${LOCAL_SCRIPT}" ]] && command -v kubectl >/dev/null 2>&1; then
+  # The recovery script applies repo-relative kustomize paths, so run it from
+  # the repo root exactly as the SSH branch below does.
+  cd "${REPO_DIR}"
+  exec ./scripts/cluster_power_recovery.sh "$@"
+fi
+
+if [[ -z "${DELEGATE_HOST}" ]]; then
+  echo "cluster-power-console: cannot run locally (need executable ${LOCAL_SCRIPT} and kubectl on PATH) and no delegate host configured" >&2
+  exit 1
+fi
+
+# %q-quote everything that is interpolated into the remote command line so
+# spaces and shell metacharacters survive the extra parsing pass on the remote.
+quoted_repo_dir="$(printf '%q' "${REPO_DIR}")"
+quoted_args="$(printf '%q ' "$@")"
+
+exec ssh -o BatchMode=yes -o ConnectTimeout=8 "${DELEGATE_HOST}" \
+  "cd ${quoted_repo_dir} && ./scripts/cluster_power_recovery.sh ${quoted_args}"
--- a/scripts/cluster_power_recovery.sh
+++ b/scripts/cluster_power_recovery.sh
@ -0,0 +1,561 @@
+#!/usr/bin/env bash
+set -euo pipefail
+
+# Print the recovery script's synopsis, option list, and examples to stdout.
+# The heredoc delimiter is quoted, so nothing inside it is expanded.
+usage() {
+  cat <<'USAGE'
+Usage:
+  scripts/cluster_power_recovery.sh shutdown [options]
+  scripts/cluster_power_recovery.sh startup  [options]
+
+Options:
+  --execute                     Actually run commands (default is dry-run)
+  --ssh-user <user>             SSH user for node commands (default: current SSH config user)
+  --control-planes <csv>        Control plane hosts (default: titan-0a,titan-0b,titan-0c)
+  --workers <csv>               Worker hosts (default: static atlas inventory, with API discovery when available)
+  --expected-flux-branch <name> Expected Flux source branch during startup checks (default: main)
+  --skip-etcd-snapshot          Skip etcd snapshot before shutdown
+  --skip-drain                  Skip worker drain during shutdown
+  --skip-local-bootstrap        Startup: skip local bootstrap fallback applies
+  --skip-harbor-bootstrap       Startup: skip Harbor recovery bootstrap stage
+  --force-flux-branch <name>    Startup: patch flux-system GitRepository branch to this value
+  --min-startup-battery <pct>   Minimum UPS percent required before bootstrap (default: 35)
+  --ups-host <name>             UPS identifier for upsc (default: ups@localhost)
+  --ups-battery-key <key>       UPS battery key for upsc (default: battery.charge)
+  --recovery-state-file <path>  Recovery state file for second-outage detection
+  --drain-timeout <seconds>     Worker drain timeout for normal shutdown (default: 180)
+  --emergency-drain-timeout <seconds>
+                               Worker drain timeout for emergency fallback (default: 45)
+  --require-ups-battery         Hard-fail startup if UPS battery cannot be read
+  -h, --help                    Show help
+
+Examples:
+  scripts/cluster_power_recovery.sh shutdown --execute
+  scripts/cluster_power_recovery.sh startup --execute --force-flux-branch main
+USAGE
+}
+
+# The first positional argument selects the mode. A missing mode or a help
+# flag prints usage and exits 0.
+MODE="${1:-}"
+if [[ -z "${MODE}" || "${MODE}" == "-h" || "${MODE}" == "--help" ]]; then
+  usage
+  exit 0
+fi
+# Drop the mode so only options remain; `|| true` keeps `set -e` from aborting
+# should the shift fail.
+shift || true
+
+# Only the two documented modes are accepted; anything else is an error.
+if [[ "${MODE}" != "shutdown" && "${MODE}" != "startup" ]]; then
+  echo "Unknown mode: ${MODE}" >&2
+  usage
+  exit 1
+fi
+
+# Option defaults; each may be overridden by the matching command-line flag.
+# EXECUTE=0 means dry-run: the run/run_shell helpers only log what they would do.
+EXECUTE=0
+# Empty SSH_USER means node names are passed to ssh without a user@ prefix.
+SSH_USER=""
+CONTROL_PLANES="titan-0a,titan-0b,titan-0c"
+# Empty WORKERS triggers API discovery, then the DEFAULT_WORKERS fallback.
+WORKERS=""
+DEFAULT_WORKERS="titan-04,titan-05,titan-06,titan-07,titan-08,titan-09,titan-10,titan-11,titan-12,titan-13,titan-14,titan-15,titan-17,titan-18,titan-19,titan-20,titan-21,titan-22,titan-24"
+EXPECTED_FLUX_BRANCH="main"
+SKIP_ETCD_SNAPSHOT=0
+SKIP_DRAIN=0
+SKIP_LOCAL_BOOTSTRAP=0
+SKIP_HARBOR_BOOTSTRAP=0
+# Empty FORCE_FLUX_BRANCH leaves the GitRepository branch untouched on startup.
+FORCE_FLUX_BRANCH=""
+# UPS query parameters handed to `upsc`.
+UPS_HOST="ups@localhost"
+UPS_BATTERY_KEY="battery.charge"
+RECOVERY_STATE_FILE="${HOME}/.local/state/cluster_power_recovery.state"
+# Minimum UPS charge (percent) required before startup proceeds after an outage.
+MIN_STARTUP_BATTERY=35
+DRAIN_TIMEOUT_SECONDS=180
+EMERGENCY_DRAIN_TIMEOUT_SECONDS=45
+REQUIRE_UPS_BATTERY=0
+
+# Runtime state, overwritten by load_recovery_state when a state file exists.
+RECOVERY_PENDING=0
+STARTUP_ATTEMPTED_DURING_OUTAGE=0
+
+# Abort with a clear message when a value-taking option is the last argument.
+# Without this, `shift 2` fails under `set -e` and the script exits silently.
+#   $1 - option name, $2 - number of arguments still unparsed (including $1)
+require_option_value() {
+  if [[ "$2" -lt 2 ]]; then
+    echo "Option $1 requires a value" >&2
+    usage
+    exit 1
+  fi
+}
+
+# Parse options following the mode. Unknown options are rejected.
+while [[ $# -gt 0 ]]; do
+  case "$1" in
+    --execute)
+      EXECUTE=1
+      shift
+      ;;
+    --ssh-user)
+      require_option_value "$1" "$#"
+      SSH_USER="$2"
+      shift 2
+      ;;
+    --control-planes)
+      require_option_value "$1" "$#"
+      CONTROL_PLANES="$2"
+      shift 2
+      ;;
+    --workers)
+      require_option_value "$1" "$#"
+      WORKERS="$2"
+      shift 2
+      ;;
+    --expected-flux-branch)
+      require_option_value "$1" "$#"
+      EXPECTED_FLUX_BRANCH="$2"
+      shift 2
+      ;;
+    --skip-etcd-snapshot)
+      SKIP_ETCD_SNAPSHOT=1
+      shift
+      ;;
+    --skip-drain)
+      SKIP_DRAIN=1
+      shift
+      ;;
+    --skip-local-bootstrap)
+      SKIP_LOCAL_BOOTSTRAP=1
+      shift
+      ;;
+    --skip-harbor-bootstrap)
+      SKIP_HARBOR_BOOTSTRAP=1
+      shift
+      ;;
+    --force-flux-branch)
+      require_option_value "$1" "$#"
+      FORCE_FLUX_BRANCH="$2"
+      shift 2
+      ;;
+    --ups-host)
+      require_option_value "$1" "$#"
+      UPS_HOST="$2"
+      shift 2
+      ;;
+    --ups-battery-key)
+      require_option_value "$1" "$#"
+      UPS_BATTERY_KEY="$2"
+      shift 2
+      ;;
+    --min-startup-battery)
+      require_option_value "$1" "$#"
+      MIN_STARTUP_BATTERY="$2"
+      shift 2
+      ;;
+    --recovery-state-file)
+      require_option_value "$1" "$#"
+      RECOVERY_STATE_FILE="$2"
+      shift 2
+      ;;
+    --drain-timeout)
+      require_option_value "$1" "$#"
+      DRAIN_TIMEOUT_SECONDS="$2"
+      shift 2
+      ;;
+    --emergency-drain-timeout)
+      require_option_value "$1" "$#"
+      EMERGENCY_DRAIN_TIMEOUT_SECONDS="$2"
+      shift 2
+      ;;
+    --require-ups-battery)
+      REQUIRE_UPS_BATTERY=1
+      shift
+      ;;
+    -h|--help)
+      usage
+      exit 0
+      ;;
+    *)
+      echo "Unknown option: $1" >&2
+      usage
+      exit 1
+      ;;
+  esac
+done
+
+# These values are later used in arithmetic comparisons and interpolated into
+# kubectl command strings, so reject anything that is not a plain integer.
+for _numeric_opt in MIN_STARTUP_BATTERY DRAIN_TIMEOUT_SECONDS EMERGENCY_DRAIN_TIMEOUT_SECONDS; do
+  if ! [[ "${!_numeric_opt}" =~ ^[0-9]+$ ]]; then
+    echo "Expected a non-negative integer for ${_numeric_opt}, got: '${!_numeric_opt}'" >&2
+    exit 1
+  fi
+done
+unset _numeric_opt
+
+# Exit with status 1 unless the named executable can be resolved by `command -v`.
+#   $1 - command name to look up
+require_cmd() {
+  local tool="$1"
+  command -v "${tool}" >/dev/null 2>&1 && return 0
+  echo "Missing required command: ${tool}" >&2
+  exit 1
+}
+
+# Both tools are checked up front, before any other work is done.
+require_cmd kubectl
+require_cmd ssh
+
+# Prefixed informational message on stdout.
+log() {
+  printf '%s\n' "[cluster-power] $*"
+}
+
+# Prefixed warning on stderr.
+warn() {
+  printf '%s\n' "[cluster-power][warn] $*" >&2
+}
+
+# Execute the given command (argv form) when EXECUTE=1; otherwise only log it.
+# Returns the command's exit status in execute mode.
+run() {
+  if [[ "${EXECUTE}" -ne 1 ]]; then
+    log "DRY-RUN: $*"
+    return
+  fi
+  log "EXEC: $*"
+  "$@"
+}
+
+# Like run, but the arguments are joined into one string and evaluated by a
+# login bash shell, so shell operators such as `|| true` inside it take effect.
+run_shell() {
+  if [[ "${EXECUTE}" -ne 1 ]]; then
+    log "DRY-RUN: $*"
+    return
+  fi
+  log "EXEC: $*"
+  bash -lc "$*"
+}
+
+# Split a comma-separated string into a global array.
+#   $1 - CSV string (may be empty, which yields an empty array)
+#   $2 - name of the array variable to assign
+# Returns 1 without assigning when $2 is not a valid variable name.
+as_array_from_csv() {
+  local csv="$1"
+  local out_var="$2"
+  # The name is passed to eval below, so only accept a plain identifier.
+  if ! [[ "${out_var}" =~ ^[A-Za-z_][A-Za-z0-9_]*$ ]]; then
+    echo "as_array_from_csv: invalid array name: ${out_var}" >&2
+    return 1
+  fi
+  # Local scratch array; NOTE: the target must not itself be named _csv_fields.
+  local -a _csv_fields=()
+  # The IFS override applies to this `read` only.
+  IFS=',' read -r -a _csv_fields <<< "${csv}"
+  # The `${arr[@]+...}` form keeps an empty array from tripping `set -u` on
+  # bash releases older than 4.4.
+  eval "${out_var}"'=( ${_csv_fields[@]+"${_csv_fields[@]}"} )'
+}
+
+# Print the ssh destination for a node: "user@node" when SSH_USER is set,
+# otherwise just the node name. No trailing newline is emitted.
+ssh_target() {
+  local node="$1"
+  printf "%s" "${SSH_USER:+${SSH_USER}@}${node}"
+}
+
+# Print a comma-separated list of nodes that carry neither the control-plane
+# nor the master role label, as reported by the Kubernetes API.
+discover_workers_csv() {
+  # Include every non-control-plane node by default (workers + accelerators).
+  # kubectl prints one row per node: name, control-plane label, master label.
+  # awk keeps rows where both label columns read "<none>"; paste joins the
+  # surviving names with commas.
+  kubectl get nodes \
+    -o custom-columns=NAME:.metadata.name,CP:.metadata.labels.node-role\\.kubernetes\\.io/control-plane,MASTER:.metadata.labels.node-role\\.kubernetes\\.io/master \
+    --no-headers \
+    | awk '$2=="<none>" && $3=="<none>" {print $1}' \
+  | paste -sd, -
+}
+
+# Load RECOVERY_PENDING and STARTUP_ATTEMPTED_DURING_OUTAGE from the state file.
+# Both default to 0 when the file is missing or an entry is absent or invalid.
+load_recovery_state() {
+  RECOVERY_PENDING=0
+  STARTUP_ATTEMPTED_DURING_OUTAGE=0
+  if [[ ! -f "${RECOVERY_STATE_FILE}" ]]; then
+    return 0
+  fi
+
+  local key value
+  # `|| [[ -n key ]]` also processes a final line that lacks a trailing
+  # newline, e.g. a file truncated by a power cut.
+  while IFS='=' read -r key value || [[ -n "${key}" ]]; do
+    # Only 0/1 are accepted: the flags are later compared with `-eq`, where
+    # arbitrary text would be evaluated as an arithmetic expression.
+    case "${key}=${value}" in
+      recovery_pending=[01])
+        RECOVERY_PENDING="${value}"
+        ;;
+      startup_attempted=[01])
+        STARTUP_ATTEMPTED_DURING_OUTAGE="${value}"
+        ;;
+      recovery_pending=*|startup_attempted=*)
+        warn "Ignoring invalid state entry '${key}=${value}' in ${RECOVERY_STATE_FILE}"
+        ;;
+    esac
+  done < "${RECOVERY_STATE_FILE}"
+}
+
+# Persist the recovery flags to RECOVERY_STATE_FILE.
+#   $1 - recovery_pending value
+#   $2 - startup_attempted value
+# In dry-run mode nothing is written, so a preview cannot change how the next
+# real startup behaves.
+save_recovery_state() {
+  if [[ "${EXECUTE}" -ne 1 ]]; then
+    log "DRY-RUN: save recovery state recovery_pending=${1} startup_attempted=${2}"
+    return 0
+  fi
+  local tmp_file="${RECOVERY_STATE_FILE}.tmp.$$"
+  mkdir -p "$(dirname "${RECOVERY_STATE_FILE}")"
+  # Write to a sibling temp file and rename it into place so an interrupted
+  # write does not leave a half-written state file behind.
+  cat > "${tmp_file}" <<EOF
+recovery_pending=${1}
+startup_attempted=${2}
+EOF
+  mv -f "${tmp_file}" "${RECOVERY_STATE_FILE}"
+}
+
+# Remove the recovery state file if it exists.
+# In dry-run mode the file is left untouched so a preview run cannot erase a
+# pending-recovery marker.
+clear_recovery_state() {
+  if [[ "${EXECUTE}" -ne 1 ]]; then
+    log "DRY-RUN: clear recovery state ${RECOVERY_STATE_FILE}"
+    return 0
+  fi
+  if [[ -f "${RECOVERY_STATE_FILE}" ]]; then
+    rm -f "${RECOVERY_STATE_FILE}"
+  fi
+}
+
+# Print the UPS battery charge as a whole number.
+# Returns 1 (printing nothing) when `upsc` is not installed, yields no output,
+# or yields something that is not purely digits once the fraction is removed.
+read_ups_battery() {
+  command -v upsc >/dev/null 2>&1 || return 1
+
+  local reading
+  reading="$(upsc "${UPS_HOST}" "${UPS_BATTERY_KEY}" 2>/dev/null || true)"
+  [[ -n "${reading}" ]] || return 1
+
+  # Keep only the text before the first dot, dropping any fractional part.
+  reading="${reading%%.*}"
+  [[ "${reading}" =~ ^[0-9]+$ ]] || return 1
+  echo "${reading}"
+}
+
+# Gate startup on UPS charge.
+# Returns 0 when the charge is at or above MIN_STARTUP_BATTERY, or when it
+# cannot be read and --require-ups-battery was not given.
+# Returns 1 when the charge is too low, when it cannot be read and
+# --require-ups-battery is set, or when the threshold is not a whole number.
+ensure_minimum_battery_for_bootstrap() {
+  local battery
+  battery="$(read_ups_battery || true)"
+  if [[ -z "${battery}" ]]; then
+    if [[ "${REQUIRE_UPS_BATTERY}" -eq 1 ]]; then
+      warn "Unable to read UPS battery status and --require-ups-battery is set."
+      return 1
+    fi
+    warn "Unable to read UPS battery status; continuing without hard battery gating."
+    return 0
+  fi
+
+  log "ups-battery=${battery}%"
+  # Refuse rather than compare against something that is not a number.
+  if ! [[ "${MIN_STARTUP_BATTERY}" =~ ^[0-9]+$ ]]; then
+    warn "Invalid --min-startup-battery value '${MIN_STARTUP_BATTERY}'; expected a whole-number percentage."
+    return 1
+  fi
+  # Force base 10 so zero-padded values such as "08" are not parsed as octal.
+  if (( 10#${battery} < 10#${MIN_STARTUP_BATTERY} )); then
+    warn "UPS battery ${battery}% below minimum startup threshold ${MIN_STARTUP_BATTERY}%."
+    return 1
+  fi
+  return 0
+}
+
+# Fast shutdown used when startup is retried with too little UPS charge.
+# Every stage is non-fatal: a failure in suspend, scale-down, or drain must not
+# stop the script from reaching the steps that actually stop the nodes.
+emergency_shutdown_after_outage() {
+  warn "Entering outage-aware emergency shutdown path due insufficient startup budget."
+  patch_flux_suspend_all true || true
+  best_effort_scale_down_apps \
+    || warn "Application scale-down failed; continuing emergency shutdown."
+  # Give the cluster one short chance to drain, then force progress.
+  best_effort_drain_workers "${EMERGENCY_DRAIN_TIMEOUT_SECONDS}" "${WORKER_NODES[@]}" \
+    || warn "Worker drain failed; continuing emergency shutdown."
+  stop_workers_agents "${WORKER_NODES[@]}" \
+    || warn "Stopping worker agents failed; continuing with control planes."
+  stop_control_planes "${CONTROL_PLANE_NODES[@]}" \
+    || warn "Stopping control planes failed."
+}
+
+# Set spec.suspend on every Flux Kustomization in flux-system and on every
+# HelmRelease in all namespaces.
+#   $1 - JSON boolean literal to write (true or false)
+# Listing failures are tolerated and yield an empty list; patches go through run.
+patch_flux_suspend_all() {
+  local suspend="$1"
+  local body
+  body=$(printf '{"spec":{"suspend":%s}}' "${suspend}")
+
+  local kustomizations helmreleases
+  kustomizations="$(kubectl -n flux-system get kustomizations.kustomize.toolkit.fluxcd.io -o jsonpath='{range .items[*]}{.metadata.name}{"\n"}{end}' || true)"
+  helmreleases="$(kubectl get helmreleases.helm.toolkit.fluxcd.io -A -o jsonpath='{range .items[*]}{.metadata.namespace}{"/"}{.metadata.name}{"\n"}{end}' || true)"
+
+  local entry
+  while IFS= read -r entry; do
+    if [[ -n "${entry}" ]]; then
+      run kubectl -n flux-system patch kustomization "${entry}" --type=merge -p "${body}"
+    fi
+  done <<< "${kustomizations}"
+
+  # HelmRelease entries are listed as "namespace/name".
+  local hr_ns hr_name
+  while IFS= read -r entry; do
+    [[ -n "${entry}" ]] || continue
+    hr_ns="${entry%%/*}"
+    hr_name="${entry##*/}"
+    run kubectl -n "${hr_ns}" patch helmrelease "${hr_name}" --type=merge -p "${body}"
+  done <<< "${helmreleases}"
+}
+
+# Log the URL and branch of the flux-system GitRepository when they can be read.
+# In startup mode, warn if the branch differs from EXPECTED_FLUX_BRANCH and no
+# --force-flux-branch override was supplied.
+report_flux_source_state() {
+  local src_url src_branch
+  src_url="$(kubectl -n flux-system get gitrepository flux-system -o jsonpath='{.spec.url}' 2>/dev/null || true)"
+  src_branch="$(kubectl -n flux-system get gitrepository flux-system -o jsonpath='{.spec.ref.branch}' 2>/dev/null || true)"
+
+  [[ -z "${src_url}" ]] || log "flux-source-url=${src_url}"
+
+  # Nothing more to report when the branch could not be read.
+  [[ -n "${src_branch}" ]] || return 0
+  log "flux-source-branch=${src_branch}"
+
+  if [[ "${MODE}" != "startup" || -n "${FORCE_FLUX_BRANCH}" || "${src_branch}" == "${EXPECTED_FLUX_BRANCH}" ]]; then
+    return 0
+  fi
+  warn "Flux source branch is '${src_branch}'. Expected '${EXPECTED_FLUX_BRANCH}' for canonical cold-start recovery. Use --force-flux-branch ${EXPECTED_FLUX_BRANCH} if needed."
+}
+
+# Poll `kubectl version` until the API answers.
+#   $1 - maximum number of attempts (default 90)
+#   $2 - seconds to sleep between attempts (default 2)
+# Returns 0 on the first success, 1 when every attempt failed. In dry-run mode
+# it returns 0 immediately without contacting the cluster.
+wait_for_api() {
+  local max_tries="${1:-90}"
+  local pause="${2:-2}"
+  if [[ "${EXECUTE}" -eq 0 ]]; then
+    log "DRY-RUN: skipping live Kubernetes API wait"
+    return 0
+  fi
+  local attempt=1
+  while (( attempt <= max_tries )); do
+    kubectl version --request-timeout=5s >/dev/null 2>&1 && return 0
+    sleep "${pause}"
+    attempt=$(( attempt + 1 ))
+  done
+  return 1
+}
+
+# Scale every Deployment and StatefulSet to zero in all namespaces except the
+# platform namespaces matched by `excludes`.
+# Best effort: if the namespace list cannot be fetched (for example the API is
+# unreachable) the step is skipped with a warning instead of aborting the caller.
+best_effort_scale_down_apps() {
+  local excludes='^(kube-system|kube-public|kube-node-lease|flux-system|traefik|metallb-system|cert-manager|longhorn-system|vault|postgres|maintenance)$'
+  local ns_list
+  # Testing the assignment in `if` keeps `set -e` from killing the script here.
+  if ! ns_list="$(kubectl get ns -o jsonpath='{range .items[*]}{.metadata.name}{"\n"}{end}')"; then
+    warn "Unable to list namespaces; skipping application scale-down."
+    return 0
+  fi
+  local ns
+  while IFS= read -r ns; do
+    [[ -z "${ns}" ]] && continue
+    if [[ "${ns}" =~ ${excludes} ]]; then
+      continue
+    fi
+    # `|| true` inside the command string makes each scale call non-fatal.
+    run_shell "kubectl -n ${ns} scale deployment --all --replicas=0 || true"
+    run_shell "kubectl -n ${ns} scale statefulset --all --replicas=0 || true"
+  done <<< "${ns_list}"
+}
+
+# Cordon and drain each worker, escalating when a drain attempt fails:
+# normal drain, then --force, then --force --disable-eviction.
+#   $1   - per-attempt drain timeout in seconds
+#   $2.. - node names (empty entries are skipped)
+# Failures on one node never stop processing of the remaining nodes.
+best_effort_drain_workers() {
+  local timeout_seconds="$1"
+  shift || true
+  local workers=("$@")
+  local node
+  for node in "${workers[@]}"; do
+    [[ -z "${node}" ]] && continue
+    # A failed cordon (e.g. a node from the static inventory that the API does
+    # not know) must not abort the whole shutdown under `set -e`.
+    run kubectl cordon "${node}" || warn "Unable to cordon ${node}; attempting drain anyway."
+    if run_shell "kubectl drain ${node} --ignore-daemonsets --delete-emptydir-data --grace-period=30 --timeout=${timeout_seconds}s"; then
+      continue
+    fi
+    warn "Gentle drain timed out for ${node}; retrying with --force."
+    if run_shell "kubectl drain ${node} --ignore-daemonsets --delete-emptydir-data --grace-period=30 --timeout=${timeout_seconds}s --force"; then
+      continue
+    fi
+    warn "Force drain timed out for ${node}; final attempt with --disable-eviction."
+    run_shell "kubectl drain ${node} --ignore-daemonsets --delete-emptydir-data --grace-period=30 --timeout=${timeout_seconds}s --force --disable-eviction || true"
+  done
+}
+
+# Run `sudo systemctl <action> <unit>` over SSH on each listed node.
+#   $1   - systemctl action (stop or start)
+#   $2   - systemd unit name
+#   $3.. - node names (empty entries are skipped)
+# The remote `|| true` ignores systemctl errors; the local `|| warn` covers SSH
+# transport failures so one unreachable host does not abort the loop.
+_systemctl_on_nodes() {
+  local action="$1"
+  local unit="$2"
+  shift 2
+  local node target
+  for node in "$@"; do
+    [[ -z "${node}" ]] && continue
+    target="$(ssh_target "${node}")"
+    run ssh -o BatchMode=yes -o ConnectTimeout=8 "${target}" "sudo systemctl ${action} ${unit} || true" \
+      || warn "SSH to ${node} failed; could not ${action} ${unit}. Continuing with remaining nodes."
+  done
+}
+
+# Stop the k3s-agent service on every given worker node.
+stop_workers_agents() {
+  _systemctl_on_nodes stop k3s-agent "$@"
+}
+
+# Start the k3s-agent service on every given worker node.
+start_workers_agents() {
+  _systemctl_on_nodes start k3s-agent "$@"
+}
+
+# Stop the k3s service on every given control-plane node.
+stop_control_planes() {
+  _systemctl_on_nodes stop k3s "$@"
+}
+
+# Start the k3s service on every given control-plane node.
+start_control_planes() {
+  _systemctl_on_nodes start k3s "$@"
+}
+
+# Save a timestamped etcd snapshot named pre-shutdown-<YYYYmmdd-HHMMSS> by
+# running `k3s etcd-snapshot save` over SSH on the given control-plane node.
+#   $1 - control-plane node name
+take_etcd_snapshot() {
+  local cp="$1"
+  local stamp target
+  stamp="$(date +%Y%m%d-%H%M%S)"
+  target="$(ssh_target "${cp}")"
+  local snapshot_name="pre-shutdown-${stamp}"
+  run ssh -o BatchMode=yes -o ConnectTimeout=8 "${target}" \
+    "sudo k3s etcd-snapshot save --name ${snapshot_name}"
+}
+
+# Print the repository root, taken as the parent of the directory that holds
+# this script. NOTE(review): assumes the script is not invoked through a
+# symlink located outside the checkout.
+_bootstrap_repo_root() {
+  (cd "$(dirname "${BASH_SOURCE[0]}")/.." && pwd)
+}
+
+# Local apply path to break Flux<->Gitea boot deadlock during cold-start recovery.
+# Longhorn is applied before stateful workloads so astreae-backed PVCs can bind.
+# Overlay paths are anchored at the repo root so the result does not depend on
+# the directory the script was started from.
+bootstrap_local_minimal() {
+  local root overlay
+  root="$(_bootstrap_repo_root)"
+  for overlay in \
+    infrastructure/core \
+    infrastructure/sources/helm \
+    infrastructure/longhorn/core \
+    infrastructure/metallb \
+    infrastructure/traefik \
+    infrastructure/vault-csi \
+    infrastructure/vault-injector \
+    services/vault \
+    infrastructure/postgres \
+    services/gitea
+  do
+    run kubectl apply -k "${root}/${overlay}"
+  done
+}
+
+# Optional Harbor bootstrap stage for environments where Harbor is authoritative for images.
+bootstrap_local_harbor() {
+  local root
+  root="$(_bootstrap_repo_root)"
+  run kubectl apply -k "${root}/services/harbor"
+}
+
+# Un-suspend all Flux objects, then trigger reconciliation.
+# With the flux CLI present: reconcile the git source, then each platform
+# Kustomization in order with its own timeout. Without it: stamp every
+# Kustomization with a reconcile.fluxcd.io/requestedAt annotation.
+resume_flux_and_reconcile() {
+  patch_flux_suspend_all false
+
+  if ! command -v flux >/dev/null 2>&1; then
+    local stamp
+    stamp="$(date --iso-8601=seconds)"
+    run kubectl -n flux-system annotate kustomizations.kustomize.toolkit.fluxcd.io --all reconcile.fluxcd.io/requestedAt="${stamp}" --overwrite
+    return
+  fi
+
+  run flux reconcile source git flux-system -n flux-system --timeout=3m
+
+  # Entries are "<kustomization>:<timeout>", processed in this order.
+  local spec
+  for spec in \
+    core:5m helm:5m longhorn:15m metallb:5m traefik:5m \
+    vault-csi:5m vault-injector:5m vault:10m postgres:10m \
+    gitea:10m harbor:15m
+  do
+    run flux reconcile kustomization "${spec%%:*}" -n flux-system --with-source --timeout="${spec##*:}"
+  done
+}
+
+# Resolve node lists, load any persisted outage state, and log the effective
+# configuration before either mode runs.
+as_array_from_csv "${CONTROL_PLANES}" CONTROL_PLANE_NODES
+# Without --workers: ask the API first, then fall back to the static inventory.
+if [[ -z "${WORKERS}" ]]; then
+  WORKERS="$(discover_workers_csv 2>/dev/null || true)"
+  if [[ -z "${WORKERS}" ]]; then
+    warn "Unable to auto-discover workers from the API; falling back to static atlas worker inventory."
+    WORKERS="${DEFAULT_WORKERS}"
+  fi
+fi
+as_array_from_csv "${WORKERS}" WORKER_NODES
+load_recovery_state
+
+log "mode=${MODE} execute=${EXECUTE}"
+log "control-planes=${CONTROL_PLANES}"
+log "workers=${WORKERS}"
+log "recovery-state-file=${RECOVERY_STATE_FILE}"
+log "recovery_pending=${RECOVERY_PENDING} startup_attempted=${STARTUP_ATTEMPTED_DURING_OUTAGE}"
+report_flux_source_state
+
+# Shutdown mode: record the pending recovery, snapshot etcd, suspend Flux,
+# scale down applications, drain workers, then stop agents and control planes.
+if [[ "${MODE}" == "shutdown" ]]; then
+  # recovery_pending=1, startup_attempted=0: the next startup is battery-gated.
+  save_recovery_state 1 0
+  if [[ "${SKIP_ETCD_SNAPSHOT}" -eq 0 ]]; then
+    # The snapshot is taken on the first control plane in the list.
+    take_etcd_snapshot "${CONTROL_PLANE_NODES[0]}"
+  else
+    warn "Skipping etcd snapshot by request."
+  fi
+
+  patch_flux_suspend_all true
+  best_effort_scale_down_apps
+
+  if [[ "${SKIP_DRAIN}" -eq 0 ]]; then
+    best_effort_drain_workers "${DRAIN_TIMEOUT_SECONDS}" "${WORKER_NODES[@]}"
+  else
+    warn "Skipping worker drain by request."
+  fi
+
+  # Workers are stopped before the control planes.
+  stop_workers_agents "${WORKER_NODES[@]}"
+  stop_control_planes "${CONTROL_PLANE_NODES[@]}"
+
+  log "Shutdown flow complete."
+  exit 0
+fi
+
+# Startup mode
+# When a recovery is pending, gate on UPS charge first. A failed gate on the
+# first attempt only records the attempt and exits; a failed gate after an
+# earlier recorded attempt runs the emergency shutdown path instead.
+if [[ "${RECOVERY_PENDING}" -eq 1 ]]; then
+  if ! ensure_minimum_battery_for_bootstrap; then
+    if [[ "${STARTUP_ATTEMPTED_DURING_OUTAGE}" -eq 1 ]]; then
+      emergency_shutdown_after_outage
+      exit 1
+    fi
+    warn "Startup deferred due low battery after recent outage; marking for second-outage fallback."
+    save_recovery_state 1 1
+    exit 1
+  fi
+  # Gate passed: record that a startup attempt is under way.
+  save_recovery_state 1 1
+fi
+
+# Control planes are started before the worker agents.
+start_control_planes "${CONTROL_PLANE_NODES[@]}"
+start_workers_agents "${WORKER_NODES[@]}"
+
+# Up to 120 polls, 2 seconds apart (skipped in dry-run mode).
+if ! wait_for_api 120 2; then
+  warn "Kubernetes API did not become reachable in time."
+  exit 1
+fi
+
+# Optionally point the flux-system GitRepository at the requested branch.
+if [[ -n "${FORCE_FLUX_BRANCH}" ]]; then
+  run kubectl -n flux-system patch gitrepository flux-system --type=merge \
+    -p "{\"spec\":{\"ref\":{\"branch\":\"${FORCE_FLUX_BRANCH}\"}}}"
+fi
+
+if [[ "${SKIP_LOCAL_BOOTSTRAP}" -eq 0 ]]; then
+  # If source is not ready, bootstrap critical pieces from local checkout first.
+  # The check reads the GitRepository's Ready condition and looks for "True".
+  if ! kubectl -n flux-system get gitrepository flux-system -o jsonpath='{.status.conditions[?(@.type=="Ready")].status}' 2>/dev/null | grep -q True; then
+    warn "Flux source not Ready; executing local bootstrap fallback path."
+    bootstrap_local_minimal
+    if [[ "${SKIP_HARBOR_BOOTSTRAP}" -eq 0 ]]; then
+      bootstrap_local_harbor
+    else
+      warn "Skipping Harbor bootstrap fallback by request."
+    fi
+  fi
+else
+  warn "Skipping local bootstrap fallback by request."
+fi
+
+# Hand control back to Flux, then drop the outage marker.
+resume_flux_and_reconcile
+clear_recovery_state
+log "Startup flow complete."
--- a/services/harbor/helmrelease.yaml
+++ b/services/harbor/helmrelease.yaml
@ -76,8 +76,8 @@ spec:
      type: internal
      internal:
        image:
-          repository: registry.bstein.dev/infra/harbor-redis
-          tag: v2.14.1-arm64 # {"$imagepolicy": "harbor:harbor-redis:tag"}
+          repository: goharbor/redis-photon
+          tag: v2.14.1 # {"$imagepolicy": "harbor:harbor-redis:tag"}
        nodeSelector:
          kubernetes.io/hostname: titan-05
        affinity:
@ -112,8 +112,8 @@ spec:
    existingSecretSecretKey: harbor-core
    core:
      image:
-        repository: registry.bstein.dev/infra/harbor-core
-        tag: v2.14.1-arm64 # {"$imagepolicy": "harbor:harbor-core:tag"}
+        repository: goharbor/harbor-core
+        tag: v2.14.1 # {"$imagepolicy": "harbor:harbor-core:tag"}
      nodeSelector:
        kubernetes.io/hostname: titan-05
      serviceAccountName: harbor-vault-sync
@ -173,8 +173,8 @@ spec:
                    values: ["rpi4"]
    jobservice:
      image:
-        repository: registry.bstein.dev/infra/harbor-jobservice
-        tag: v2.14.1-arm64 # {"$imagepolicy": "harbor:harbor-jobservice:tag"}
+        repository: goharbor/harbor-jobservice
+        tag: v2.14.1 # {"$imagepolicy": "harbor:harbor-jobservice:tag"}
      nodeSelector:
        kubernetes.io/hostname: titan-05
      serviceAccountName: harbor-vault-sync
@ -215,8 +215,8 @@ spec:
                    values: ["rpi4"]
    portal:
      image:
-        repository: registry.bstein.dev/infra/harbor-portal
-        tag: v2.14.1-arm64 # {"$imagepolicy": "harbor:harbor-portal:tag"}
+        repository: goharbor/harbor-portal
+        tag: v2.14.1 # {"$imagepolicy": "harbor:harbor-portal:tag"}
      nodeSelector:
        kubernetes.io/hostname: titan-05
      affinity:
@ -243,8 +243,8 @@ spec:
    registry:
      registry:
        image:
-          repository: registry.bstein.dev/infra/harbor-registry
-          tag: v2.14.1-arm64 # {"$imagepolicy": "harbor:harbor-registry:tag"}
+          repository: goharbor/registry-photon
+          tag: v2.14.1 # {"$imagepolicy": "harbor:harbor-registry:tag"}
        extraEnvVars:
          - name: REGISTRY_NOTIFICATIONS_ENDPOINTS_0_NAME
            value: harbor-core
@ -258,8 +258,8 @@ spec:
            value: 1s
      controller:
        image:
-          repository: registry.bstein.dev/infra/harbor-registryctl
-          tag: v2.14.1-arm64 # {"$imagepolicy": "harbor:harbor-registryctl:tag"}
+          repository: goharbor/harbor-registryctl
+          tag: v2.14.1 # {"$imagepolicy": "harbor:harbor-registryctl:tag"}
      serviceAccountName: harbor-vault-sync
      automountServiceAccountToken: true
      existingSecret: harbor-registry
@ -320,8 +320,8 @@ spec:
                    values: ["rpi4"]
    nginx:
      image:
-        repository: registry.bstein.dev/infra/harbor-nginx
-        tag: v2.14.1-arm64 # {"$imagepolicy": "harbor:harbor-nginx:tag"}
+        repository: goharbor/nginx-photon
+        tag: v2.14.1 # {"$imagepolicy": "harbor:harbor-nginx:tag"}
      nodeSelector:
        kubernetes.io/hostname: titan-05
      affinity:
@ -347,8 +347,8 @@ spec:
                    values: ["rpi4"]
    prepare:
      image:
-        repository: registry.bstein.dev/infra/harbor-prepare
-        tag: v2.14.1-arm64 # {"$imagepolicy": "harbor:harbor-prepare:tag"}
+        repository: goharbor/prepare
+        tag: v2.14.1 # {"$imagepolicy": "harbor:harbor-prepare:tag"}
    updateStrategy:
      type: Recreate
  postRenderers:
--- a/services/harbor/image.yaml
+++ b/services/harbor/image.yaml
@ -5,7 +5,7 @@ metadata:
  name: harbor-core
  namespace: harbor
 spec:
-  image: registry.bstein.dev/infra/harbor-core
+  image: goharbor/harbor-core
  interval: 5m0s
 ---
 apiVersion: image.toolkit.fluxcd.io/v1beta2
@ -17,11 +17,11 @@ spec:
  imageRepositoryRef:
    name: harbor-core
  filterTags:
-    pattern: '^v(?P<version>\d+\.\d+\.\d+-arm64(\.\d+)?)$'
+    pattern: '^v(?P<version>\d+\.\d+\.\d+)$'
    extract: '$version'
  policy:
    semver:
-      range: ">=2.14.0-0 <2.15.0-0"
+      range: ">=2.14.0 <2.15.0"
 ---
 apiVersion: image.toolkit.fluxcd.io/v1beta2
 kind: ImageRepository
@ -29,7 +29,7 @@ metadata:
  name: harbor-jobservice
  namespace: harbor
 spec:
-  image: registry.bstein.dev/infra/harbor-jobservice
+  image: goharbor/harbor-jobservice
  interval: 5m0s
 ---
 apiVersion: image.toolkit.fluxcd.io/v1beta2
@ -41,11 +41,11 @@ spec:
  imageRepositoryRef:
    name: harbor-jobservice
  filterTags:
-    pattern: '^v(?P<version>\d+\.\d+\.\d+-arm64(\.\d+)?)$'
+    pattern: '^v(?P<version>\d+\.\d+\.\d+)$'
    extract: '$version'
  policy:
    semver:
-      range: ">=2.14.0-0 <2.15.0-0"
+      range: ">=2.14.0 <2.15.0"
 ---
 apiVersion: image.toolkit.fluxcd.io/v1beta2
 kind: ImageRepository
@ -53,7 +53,7 @@ metadata:
  name: harbor-portal
  namespace: harbor
 spec:
-  image: registry.bstein.dev/infra/harbor-portal
+  image: goharbor/harbor-portal
  interval: 5m0s
 ---
 apiVersion: image.toolkit.fluxcd.io/v1beta2
@ -65,11 +65,11 @@ spec:
  imageRepositoryRef:
    name: harbor-portal
  filterTags:
-    pattern: '^v(?P<version>\d+\.\d+\.\d+-arm64(\.\d+)?)$'
+    pattern: '^v(?P<version>\d+\.\d+\.\d+)$'
    extract: '$version'
  policy:
    semver:
-      range: ">=2.14.0-0 <2.15.0-0"
+      range: ">=2.14.0 <2.15.0"
 ---
 apiVersion: image.toolkit.fluxcd.io/v1beta2
 kind: ImageRepository
@ -77,7 +77,7 @@ metadata:
  name: harbor-registry
  namespace: harbor
 spec:
-  image: registry.bstein.dev/infra/harbor-registry
+  image: goharbor/registry-photon
  interval: 5m0s
 ---
 apiVersion: image.toolkit.fluxcd.io/v1beta2
@ -89,11 +89,11 @@ spec:
  imageRepositoryRef:
    name: harbor-registry
  filterTags:
-    pattern: '^v(?P<version>\d+\.\d+\.\d+-arm64(\.\d+)?)$'
+    pattern: '^v(?P<version>\d+\.\d+\.\d+)$'
    extract: '$version'
  policy:
    semver:
-      range: ">=2.14.0-0 <2.15.0-0"
+      range: ">=2.14.0 <2.15.0"
 ---
 apiVersion: image.toolkit.fluxcd.io/v1beta2
 kind: ImageRepository
@ -101,7 +101,7 @@ metadata:
  name: harbor-registryctl
  namespace: harbor
 spec:
-  image: registry.bstein.dev/infra/harbor-registryctl
+  image: goharbor/harbor-registryctl
  interval: 5m0s
 ---
 apiVersion: image.toolkit.fluxcd.io/v1beta2
@ -113,11 +113,11 @@ spec:
  imageRepositoryRef:
    name: harbor-registryctl
  filterTags:
-    pattern: '^v(?P<version>\d+\.\d+\.\d+-arm64(\.\d+)?)$'
+    pattern: '^v(?P<version>\d+\.\d+\.\d+)$'
    extract: '$version'
  policy:
    semver:
-      range: ">=2.14.0-0 <2.15.0-0"
+      range: ">=2.14.0 <2.15.0"
 ---
 apiVersion: image.toolkit.fluxcd.io/v1beta2
 kind: ImageRepository
@ -125,7 +125,7 @@ metadata:
  name: harbor-redis
  namespace: harbor
 spec:
-  image: registry.bstein.dev/infra/harbor-redis
+  image: goharbor/redis-photon
  interval: 5m0s
 ---
 apiVersion: image.toolkit.fluxcd.io/v1beta2
@ -137,11 +137,11 @@ spec:
  imageRepositoryRef:
    name: harbor-redis
  filterTags:
-    pattern: '^v(?P<version>\d+\.\d+\.\d+-arm64(\.\d+)?)$'
+    pattern: '^v(?P<version>\d+\.\d+\.\d+)$'
    extract: '$version'
  policy:
    semver:
-      range: ">=2.14.0-0 <2.15.0-0"
+      range: ">=2.14.0 <2.15.0"
 ---
 apiVersion: image.toolkit.fluxcd.io/v1beta2
 kind: ImageRepository
@ -149,7 +149,7 @@ metadata:
  name: harbor-nginx
  namespace: harbor
 spec:
-  image: registry.bstein.dev/infra/harbor-nginx
+  image: goharbor/nginx-photon
  interval: 5m0s
 ---
 apiVersion: image.toolkit.fluxcd.io/v1beta2
@ -161,11 +161,11 @@ spec:
  imageRepositoryRef:
    name: harbor-nginx
  filterTags:
-    pattern: '^v(?P<version>\d+\.\d+\.\d+-arm64(\.\d+)?)$'
+    pattern: '^v(?P<version>\d+\.\d+\.\d+)$'
    extract: '$version'
  policy:
    semver:
-      range: ">=2.14.0-0 <2.15.0-0"
+      range: ">=2.14.0 <2.15.0"
 ---
 apiVersion: image.toolkit.fluxcd.io/v1beta2
 kind: ImageRepository
@ -173,7 +173,7 @@ metadata:
  name: harbor-prepare
  namespace: harbor
 spec:
-  image: registry.bstein.dev/infra/harbor-prepare
+  image: goharbor/prepare
  interval: 5m0s
 ---
 apiVersion: image.toolkit.fluxcd.io/v1beta2
@ -185,8 +185,8 @@ spec:
  imageRepositoryRef:
    name: harbor-prepare
  filterTags:
-    pattern: '^v(?P<version>\d+\.\d+\.\d+-arm64(\.\d+)?)$'
+    pattern: '^v(?P<version>\d+\.\d+\.\d+)$'
    extract: '$version'
  policy:
    semver:
-      range: ">=2.14.0-0 <2.15.0-0"
+      range: ">=2.14.0 <2.15.0"
--- a/services/pegasus/deployment.yaml
+++ b/services/pegasus/deployment.yaml
@ -73,7 +73,7 @@ spec:
      containers:
        - name: pegasus
          image: registry.bstein.dev/streaming/pegasus-vault:1.2.32 # {"$imagepolicy": "jellyfin:pegasus:tag"}
-          imagePullPolicy: Always
+          imagePullPolicy: IfNotPresent
          env:
            - name: PEGASUS_MEDIA_ROOT
              valueFrom: { configMapKeyRef: { name: pegasus-config, key: PEGASUS_MEDIA_ROOT } }