#!/usr/bin/env bash
#
# cluster_power_recovery.sh — orchestrates planned shutdown and recovery of the
# cluster. Modes: prepare | status | harbor-seed | shutdown | startup.
# All actions default to dry-run; pass --execute to actually run commands.
set -euo pipefail

SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
REPO_DIR="${HECATE_REPO_DIR:-$(cd "${SCRIPT_DIR}/.." && pwd)}"
BOOTSTRAP_DIR="${SCRIPT_DIR}/bootstrap"
CONFIG_FILE="${BOOTSTRAP_DIR}/recovery-config.env"

# Optional site-specific overrides (UPS host, node names, image tags, ...).
if [[ -f "${CONFIG_FILE}" ]]; then
  # shellcheck disable=SC1090
  source "${CONFIG_FILE}"
fi

# Fall back to a kubeconfig shipped next to the script when none is exported.
if [[ -z "${KUBECONFIG:-}" && -f "${SCRIPT_DIR}/kubeconfig" ]]; then
  export KUBECONFIG="${SCRIPT_DIR}/kubeconfig"
fi

usage() {
  # NOTE(review): the heredoc opener and the leading "Usage:" line were mangled
  # in the copy under review ("cat < [options]"); reconstructed from the
  # visible USAGE terminator and the Examples section — confirm against
  # version-control history.
  cat <<USAGE
Usage: scripts/cluster_power_recovery.sh <mode> [options]

Options:
  --execute                   Actually run commands (default is dry-run)
  --expected-flux-branch      Expected Flux source branch during startup checks (default: ${DEFAULT_FLUX_BRANCH:-main})
  --force-flux-branch         Startup: patch flux-system GitRepository branch to this value
  --skip-etcd-snapshot        Shutdown: skip etcd snapshot before shutdown
  --skip-drain                Shutdown: skip worker drain during shutdown
  --skip-local-bootstrap      Startup: skip local bootstrap fallback applies
  --skip-harbor-bootstrap     Startup: skip Harbor recovery bootstrap stage
  --skip-harbor-seed          Startup: skip Harbor image seed/import stage
  --skip-helper-prewarm       Prepare/Shutdown/Startup: skip node-helper prewarm
  --min-startup-battery       Minimum UPS percent required before bootstrap (default: 35)
  --ups-host                  UPS identifier for upsc (default: ups@localhost)
  --ups-battery-key           UPS battery key for upsc (default: battery.charge)
  --recovery-state-file       Recovery state file for outage-aware restart logic
  --harbor-bundle-file        Harbor bootstrap bundle on the control host
  --harbor-target-node        Node that should host Harbor during bootstrap (default: ${HARBOR_TARGET_NODE:-titan-05})
  --harbor-canary-image       Harbor-backed image used for pull canary (default: ${HARBOR_CANARY_IMAGE:-registry.bstein.dev/bstein/kubectl:1.35.0})
  --node-helper-image         Privileged helper image used for host operations (default: ${NODE_HELPER_IMAGE:-registry.bstein.dev/bstein/hecate-node-helper:0.1.0})
  --bundle-http-port          Temporary HTTP port used to serve bootstrap bundles (default: ${BUNDLE_HTTP_PORT:-8877})
  --api-wait-timeout          Startup: Kubernetes API wait timeout (default: 600)
  --drain-timeout             Worker drain timeout for normal shutdown (default: 180)
  --emergency-drain-timeout   Worker drain timeout for emergency fallback (default: 45)
  --require-ups-battery       Hard-fail startup if UPS battery cannot be read
  -h, --help                  Show help

Examples:
  scripts/cluster_power_recovery.sh prepare --execute
  scripts/cluster_power_recovery.sh harbor-seed --execute
  scripts/cluster_power_recovery.sh status
  scripts/cluster_power_recovery.sh shutdown --execute
  scripts/cluster_power_recovery.sh startup --execute --force-flux-branch main
USAGE
}

MODE="${1:-}"
if [[ -z "${MODE}" || "${MODE}" == "-h" || "${MODE}" == "--help" ]]; then
  usage
  exit 0
fi
shift || true

case "${MODE}" in
  prepare|status|harbor-seed|shutdown|startup) ;;
  *)
    echo "Unknown mode: ${MODE}" >&2
    usage
    exit 1
    ;;
esac

# Option defaults. Environment variables (possibly set by recovery-config.env)
# override the hard-coded fallbacks; CLI flags (parsed below) override both.
EXECUTE=0
EXPECTED_FLUX_BRANCH="${DEFAULT_FLUX_BRANCH:-main}"
FORCE_FLUX_BRANCH=""
SKIP_ETCD_SNAPSHOT=0
SKIP_DRAIN=0
SKIP_LOCAL_BOOTSTRAP=0
SKIP_HARBOR_BOOTSTRAP=0
SKIP_HARBOR_SEED=0
SKIP_HELPER_PREWARM=0
UPS_HOST="${UPS_HOST:-ups@localhost}"
UPS_BATTERY_KEY="${UPS_BATTERY_KEY:-battery.charge}"
MIN_STARTUP_BATTERY="${MIN_STARTUP_BATTERY:-35}"
REQUIRE_UPS_BATTERY="${REQUIRE_UPS_BATTERY:-0}"
DRAIN_TIMEOUT_SECONDS=180
EMERGENCY_DRAIN_TIMEOUT_SECONDS=45
API_WAIT_TIMEOUT_SECONDS=600
BUNDLE_HTTP_PORT="${BUNDLE_HTTP_PORT:-8877}"
STATE_ROOT="${HOME}/${STATE_SUBDIR:-.local/share/hecate}"
RECOVERY_STATE_FILE="${STATE_ROOT}/cluster_power_recovery.state"
HARBOR_BUNDLE_FILE="${STATE_ROOT}/bundles/${HARBOR_BUNDLE_BASENAME:-harbor-bootstrap-v2.14.1-arm64.tar.zst}"
HARBOR_TARGET_NODE="${HARBOR_TARGET_NODE:-titan-05}"
HARBOR_CANARY_NODE="${HARBOR_CANARY_NODE:-titan-04}"
HARBOR_CANARY_IMAGE="${HARBOR_CANARY_IMAGE:-registry.bstein.dev/bstein/kubectl:1.35.0}"
NODE_HELPER_IMAGE="${NODE_HELPER_IMAGE:-registry.bstein.dev/bstein/hecate-node-helper:0.1.0}"
NODE_HELPER_NAMESPACE="${NODE_HELPER_NAMESPACE:-maintenance}"
NODE_HELPER_SERVICE_ACCOUNT="${NODE_HELPER_SERVICE_ACCOUNT:-default}"
REGISTRY_PULL_SECRET="${REGISTRY_PULL_SECRET:-harbor-regcred}"

# Outage-aware restart bookkeeping (persisted in RECOVERY_STATE_FILE).
RECOVERY_PENDING=0
STARTUP_ATTEMPTED_DURING_OUTAGE=0
LAST_CHECKPOINT="none"
BUNDLE_SERVER_PID=""
UPS_HOST_IN_USE=""

# CLI flags override environment/config defaults set above.
while [[ $# -gt 0 ]]; do
  case "$1" in
    --execute) EXECUTE=1; shift ;;
    --expected-flux-branch) EXPECTED_FLUX_BRANCH="${2:?missing branch}"; shift 2 ;;
    --force-flux-branch) FORCE_FLUX_BRANCH="${2:?missing branch}"; shift 2 ;;
    --skip-etcd-snapshot) SKIP_ETCD_SNAPSHOT=1; shift ;;
    --skip-drain) SKIP_DRAIN=1; shift ;;
    --skip-local-bootstrap) SKIP_LOCAL_BOOTSTRAP=1; shift ;;
    --skip-harbor-bootstrap) SKIP_HARBOR_BOOTSTRAP=1; shift ;;
    --skip-harbor-seed) SKIP_HARBOR_SEED=1; shift ;;
    --skip-helper-prewarm) SKIP_HELPER_PREWARM=1; shift ;;
    --ups-host) UPS_HOST="${2:?missing ups host}"; shift 2 ;;
    --ups-battery-key) UPS_BATTERY_KEY="${2:?missing ups key}"; shift 2 ;;
    --min-startup-battery) MIN_STARTUP_BATTERY="${2:?missing battery threshold}"; shift 2 ;;
    --require-ups-battery) REQUIRE_UPS_BATTERY=1; shift ;;
    --recovery-state-file) RECOVERY_STATE_FILE="${2:?missing state file path}"; shift 2 ;;
    --harbor-bundle-file) HARBOR_BUNDLE_FILE="${2:?missing bundle file path}"; shift 2 ;;
    --harbor-target-node) HARBOR_TARGET_NODE="${2:?missing harbor target node}"; shift 2 ;;
    --harbor-canary-image) HARBOR_CANARY_IMAGE="${2:?missing canary image}"; shift 2 ;;
    --node-helper-image) NODE_HELPER_IMAGE="${2:?missing node helper image}"; shift 2 ;;
    --bundle-http-port) BUNDLE_HTTP_PORT="${2:?missing bundle http port}"; shift 2 ;;
    --api-wait-timeout) API_WAIT_TIMEOUT_SECONDS="${2:?missing api wait timeout}"; shift 2 ;;
    --drain-timeout) DRAIN_TIMEOUT_SECONDS="${2:?missing drain timeout}"; shift 2 ;;
    --emergency-drain-timeout) EMERGENCY_DRAIN_TIMEOUT_SECONDS="${2:?missing emergency drain timeout}"; shift 2 ;;
    -h|--help)
      usage
      exit 0
      ;;
    *)
      echo "Unknown option: $1" >&2
      usage
      exit 1
      ;;
  esac
done

require_cmd() {
  # Abort early when a hard dependency is missing from PATH.
  local cmd="$1"
  if ! command -v "${cmd}" >/dev/null 2>&1; then
    echo "Missing required command: ${cmd}" >&2
    exit 1
  fi
}
require_cmd kubectl
require_cmd bash
require_cmd base64
require_cmd curl

log() { echo "[cluster-power] $*"; }
warn() { echo "[cluster-power][warn] $*" >&2; }
die() { echo "[cluster-power][error] $*" >&2; exit 1; }

# run/run_shell honour the global dry-run switch: without --execute they only
# log the command that would have been run.
run() {
  if [[ "${EXECUTE}" -eq 1 ]]; then
    log "EXEC: $*"
    "$@"
  else
    log "DRY-RUN: $*"
  fi
}

run_shell() {
  # Like run(), but evaluates the joined arguments through a login shell so
  # pipelines/redirections inside the string work.
  if [[ "${EXECUTE}" -eq 1 ]]; then
    log "EXEC: $*"
    bash -lc "$*"
  else
    log "DRY-RUN: $*"
  fi
}

apply_kustomization() {
  # Render a kustomization from the repo checkout and apply it to the cluster.
  local path="$1"
  local full_path="${REPO_DIR}/${path}"
  if [[ "${EXECUTE}" -eq 1 ]]; then
    log "EXEC: kubectl kustomize ${full_path} --load-restrictor=LoadRestrictionsNone | kubectl apply -f -"
    kubectl kustomize "${full_path}" --load-restrictor=LoadRestrictionsNone | kubectl apply -f -
  else
    log "DRY-RUN: kubectl kustomize ${full_path} --load-restrictor=LoadRestrictionsNone | kubectl apply -f -"
  fi
}

sanitize_name() {
  # Lowercase and squeeze every run of chars outside [a-z0-9-] to '-'
  # (produces Kubernetes-name-safe strings).
  printf '%s' "$1" | tr '[:upper:]' '[:lower:]' | tr -cs 'a-z0-9-' '-'
}

state_dir() {
  dirname "${RECOVERY_STATE_FILE}"
}

load_recovery_state() {
  # Reset bookkeeping, then overlay key=value pairs from the state file.
  RECOVERY_PENDING=0
  STARTUP_ATTEMPTED_DURING_OUTAGE=0
  LAST_CHECKPOINT="none"
  [[ -f "${RECOVERY_STATE_FILE}" ]] || return 0
  while IFS='=' read -r key value; do
    case "${key}" in
      recovery_pending) RECOVERY_PENDING="${value}" ;;
      startup_attempted) STARTUP_ATTEMPTED_DURING_OUTAGE="${value}" ;;
      last_checkpoint) LAST_CHECKPOINT="${value}" ;;
    esac
  done < "${RECOVERY_STATE_FILE}"
}

save_recovery_state() {
  # Persist outage bookkeeping; dry-run mode never touches the state file.
  [[ "${EXECUTE}" -eq 1 ]] || return 0
  mkdir -p "$(state_dir)"
  # NOTE(review): the heredoc body was lost in the copy under review;
  # reconstructed from the keys parsed by load_recovery_state — confirm
  # against version-control history.
  cat > "${RECOVERY_STATE_FILE}" <<STATE
recovery_pending=${RECOVERY_PENDING}
startup_attempted=${STARTUP_ATTEMPTED_DURING_OUTAGE}
last_checkpoint=${LAST_CHECKPOINT}
STATE
}

clear_recovery_state() {
  # NOTE(review): this function's header was destroyed in the copy under
  # review; only '…/dev/null || true LAST_CHECKPOINT="none" }' survived.
  # The name and the rm command are reconstructed as the natural counterpart
  # of load/save_recovery_state — TODO: confirm the original name and body
  # against its callers before relying on this.
  rm -f "${RECOVERY_STATE_FILE}" 2>/dev/null || true
  LAST_CHECKPOINT="none"
}

sanitize_battery_percent() {
  # Normalise a upsc reading (e.g. "battery.charge: 87.5") to a whole
  # percent; fails (rc 1) when no integer can be extracted.
  local raw="$1"
  raw="${raw##*:}"              # drop any "key:" prefix
  raw="${raw//[[:space:]]/}"    # strip all whitespace
  raw="${raw%%.*}"              # truncate decimals
  [[ "${raw}" =~ ^[0-9]+$ ]] || return 1
  printf '%s' "${raw}"
}

candidate_ups_hosts() {
  # Emit unique candidate UPS identifiers, one per line: the configured
  # UPS_HOST first, then every UPS reported by `upsc -l`, both as
  # name@localhost and as the bare name.
  local candidate name
  local -A seen=()
  if [[ -n "${UPS_HOST}" ]]; then
    seen["${UPS_HOST}"]=1
    echo "${UPS_HOST}"
  fi
  while IFS= read -r name; do
    [[ -n "${name}" ]] || continue
    for candidate in "${name}@localhost" "${name}"; do
      [[ -n "${seen[${candidate}]+x}" ]] && continue
      seen["${candidate}"]=1
      echo "${candidate}"
    done
  done < <(upsc -l 2>/dev/null || true)
}

read_ups_battery() {
  # Print the first readable battery percentage among candidate UPS hosts
  # and record the responding host in UPS_HOST_IN_USE; rc 1 when none work.
  if ! command -v upsc >/dev/null 2>&1; then
    return 1
  fi
  local host raw parsed
  while IFS= read -r host; do
    raw="$(upsc "${host}" "${UPS_BATTERY_KEY}" 2>/dev/null || true)"
    [[ -n "${raw}" ]] || continue
    parsed="$(sanitize_battery_percent "${raw}" || true)"
    [[ -n "${parsed}" ]] || continue
    UPS_HOST_IN_USE="${host}"
    printf '%s' "${parsed}"
    return 0
  done < <(candidate_ups_hosts)
  return 1
}

ensure_minimum_battery_for_bootstrap() {
  # Gate bootstrap on UPS charge. An unreadable battery is fatal only when
  # --require-ups-battery was given; otherwise we warn and continue.
  local battery
  battery="$(read_ups_battery || true)"
  if [[ -z "${battery}" ]]; then
    if [[ "${REQUIRE_UPS_BATTERY}" -eq 1 ]]; then
      warn "Unable to read UPS battery status and --require-ups-battery is set."
      return 1
    fi
    warn "Unable to read UPS battery status; continuing without hard battery gating."
    return 0
  fi
  log "ups-battery=${battery}% host=${UPS_HOST_IN_USE:-${UPS_HOST}}"
  if (( battery < MIN_STARTUP_BATTERY )); then
    warn "UPS battery ${battery}% below minimum startup threshold ${MIN_STARTUP_BATTERY}%."
    return 1
  fi
  return 0
}

report_flux_source_state() {
  # Log the flux-system GitRepository url/branch; in startup mode (without
  # --force-flux-branch) warn when the branch differs from the expected one.
  local flux_url flux_branch
  flux_url="$(kubectl -n flux-system get gitrepository flux-system -o jsonpath='{.spec.url}' 2>/dev/null || true)"
  flux_branch="$(kubectl -n flux-system get gitrepository flux-system -o jsonpath='{.spec.ref.branch}' 2>/dev/null || true)"
  [[ -n "${flux_url}" ]] && log "flux-source-url=${flux_url}"
  if [[ -n "${flux_branch}" ]]; then
    log "flux-source-branch=${flux_branch}"
    if [[ "${MODE}" == "startup" && -z "${FORCE_FLUX_BRANCH}" && "${flux_branch}" != "${EXPECTED_FLUX_BRANCH}" ]]; then
      warn "Flux source branch is '${flux_branch}'. Expected '${EXPECTED_FLUX_BRANCH}' for canonical recovery."
    fi
  fi
}

wait_for_api() {
  # Poll the Kubernetes API every 5s until reachable or the overall timeout
  # (API_WAIT_TIMEOUT_SECONDS) elapses. Skipped entirely in dry-run.
  local attempts=$(( API_WAIT_TIMEOUT_SECONDS / 5 ))
  if (( attempts < 1 )); then
    attempts=1
  fi
  if [[ "${EXECUTE}" -eq 0 ]]; then
    log "DRY-RUN: skipping live Kubernetes API wait"
    return 0
  fi
  local i
  for (( i = 1; i <= attempts; i++ )); do
    if kubectl version --request-timeout=5s >/dev/null 2>&1; then
      return 0
    fi
    sleep 5
  done
  return 1
}

patch_flux_suspend_all() {
  # Suspend ($1=true) or resume ($1=false) every Flux Kustomization and
  # HelmRelease in the cluster via merge patches.
  local value="$1"
  local patch
  patch=$(printf '{"spec":{"suspend":%s}}' "${value}")
  local ks_list hr_list
  ks_list="$(kubectl -n flux-system get kustomizations.kustomize.toolkit.fluxcd.io -o jsonpath='{range .items[*]}{.metadata.name}{"\n"}{end}' || true)"
  hr_list="$(kubectl get helmreleases.helm.toolkit.fluxcd.io -A -o jsonpath='{range .items[*]}{.metadata.namespace}{"/"}{.metadata.name}{"\n"}{end}' || true)"
  while IFS= read -r k; do
    [[ -z "${k}" ]] && continue
    run kubectl -n flux-system patch kustomization "${k}" --type=merge -p "${patch}"
  done <<< "${ks_list}"
  while IFS= read -r hr; do
    [[ -z "${hr}" ]] && continue
    local ns="${hr%%/*}"
    local name="${hr##*/}"
    run kubectl -n "${ns}" patch helmrelease "${name}" --type=merge -p "${patch}"
  done <<< "${hr_list}"
}

best_effort_scale_down_apps() {
  # Scale app deployments/statefulsets to zero in every namespace except the
  # infrastructure namespaces listed in the exclusion regex.
  local excludes='^(kube-system|kube-public|kube-node-lease|flux-system|traefik|metallb-system|cert-manager|longhorn-system|vault|postgres|maintenance)$'
  local ns_list
  ns_list="$(kubectl get ns -o jsonpath='{range .items[*]}{.metadata.name}{"\n"}{end}')"
  while IFS= read -r ns; do
    [[ -z "${ns}" ]] && continue
    if [[ "${ns}" =~ ${excludes} ]]; then
      continue
    fi
    run_shell "kubectl -n ${ns} scale deployment --all --replicas=0 || true"
    run_shell "kubectl -n ${ns} scale statefulset --all --replicas=0 || true"
  done <<< "${ns_list}"
}

discover_workers_csv() {
  # Workers = nodes carrying neither control-plane nor master role labels;
  # printed as a single comma-separated line.
  kubectl get nodes \
    -o custom-columns=NAME:.metadata.name,CP:.metadata.labels.node-role\\.kubernetes\\.io/control-plane,MASTER:.metadata.labels.node-role\\.kubernetes\\.io/master \
    --no-headers \
    | awk '$2=="" && $3=="" {print $1}' \
    | paste -sd, -
}
as_array_from_csv() {
  # Split comma-separated $1 into the array variable named by $2.
  # Review fix: use a bash nameref (declare/local -n, bash 4.3+) instead of
  # eval — the target name is never re-parsed as shell code — and keep the
  # scratch array local instead of leaking a global _tmp. Interface and
  # results are unchanged.
  local csv="$1"
  local -n _out_ref="$2"
  local _fields=()
  local old_ifs="${IFS}"
  IFS=',' read -r -a _fields <<< "${csv}"
  IFS="${old_ifs}"
  _out_ref=( "${_fields[@]}" )
}

best_effort_drain_workers() {
  # Cordon then drain each worker named in $2.., escalating per node:
  # gentle drain -> --force -> --force --disable-eviction. Never aborts the
  # shutdown sequence on a drain failure.
  local timeout_seconds="$1"
  shift || true
  local workers=("$@")
  local node
  for node in "${workers[@]}"; do
    [[ -z "${node}" ]] && continue
    run kubectl cordon "${node}"
    if run_shell "kubectl drain ${node} --ignore-daemonsets --delete-emptydir-data --grace-period=30 --timeout=${timeout_seconds}s"; then
      continue
    fi
    warn "Gentle drain timed out for ${node}; retrying with --force."
    if run_shell "kubectl drain ${node} --ignore-daemonsets --delete-emptydir-data --grace-period=30 --timeout=${timeout_seconds}s --force"; then
      continue
    fi
    warn "Force drain timed out for ${node}; final attempt with --disable-eviction."
    run_shell "kubectl drain ${node} --ignore-daemonsets --delete-emptydir-data --grace-period=30 --timeout=${timeout_seconds}s --force --disable-eviction || true"
  done
}

wait_for_rollout() {
  # Block until $1/$2/$3 (namespace/kind/name) finishes rolling out, or the
  # kubectl-style timeout $4 (e.g. "10m") expires. Logged-only in dry-run.
  local namespace="$1"
  local kind="$2"
  local name="$3"
  local timeout="$4"
  if [[ "${EXECUTE}" -eq 0 ]]; then
    log "DRY-RUN: kubectl -n ${namespace} rollout status ${kind}/${name} --timeout=${timeout}"
    return 0
  fi
  kubectl -n "${namespace}" rollout status "${kind}/${name}" --timeout="${timeout}"
}

check_ingress_stack() {
  # Traefik ingress class must exist and the controller must be rolled out.
  kubectl get ingressclass traefik >/dev/null
  wait_for_rollout traefik deployment traefik 5m
}

check_longhorn_stack() {
  wait_for_rollout longhorn-system daemonset longhorn-manager 10m
  wait_for_rollout longhorn-system deployment longhorn-ui 10m
}

check_vault_stack() {
  # Rollout plus a live `vault status` probe inside the pod (execute only).
  wait_for_rollout vault statefulset vault 10m
  if [[ "${EXECUTE}" -eq 1 ]]; then
    kubectl -n vault exec vault-0 -- sh -ceu 'VAULT_ADDR=http://127.0.0.1:8200 vault status >/dev/null'
  fi
}

check_postgres_stack() {
  # Rollout plus a live pg_isready probe inside the pod (execute only).
  wait_for_rollout postgres statefulset postgres 10m
  if [[ "${EXECUTE}" -eq 1 ]]; then
    kubectl -n postgres exec postgres-0 -c postgres -- sh -ceu 'pg_isready -h 127.0.0.1 -p 5432 >/dev/null'
  fi
}
# --- Harbor readiness checks and node-helper pod plumbing -------------------
#
# NOTE(review): from run_harbor_pull_canary onward this region is corrupted in
# the copy under review: the `cat <<POD ... POD | kubectl apply` pod-manifest
# heredocs of run_harbor_pull_canary and run_helper_pod were stripped (leaving
# fragments like `cat <&2 || true` and `cat </tmp/hecate-step.sh`), and
# run_host_command_via_helper is truncated mid command-substitution
# (`script_content=$(cat <`) and continues beyond this excerpt. The corrupted
# text is preserved verbatim below; recover the pod manifests from
# version-control history before editing. What IS visible and grounded:
#   - check_gitea_stack / check_harbor_stack: rollout gates for Gitea and the
#     Harbor redis/core/jobservice/portal/registry workloads.
#   - check_harbor_endpoint: dies unless https://registry.bstein.dev/v2/
#     answers HTTP 200 or 401 (401 = registry up, auth required).
#   - wait_for_pod_phase: polls a pod's .status.phase every 2s until it equals
#     the expected phase (rc 0), hits Failed (rc 1), or times out (rc 1).
#   - harbor_is_ready: Harbor deployments exist AND the /v2/ endpoint answers
#     200/401.
#   - run_harbor_pull_canary: deletes/recreates a canary pod that pulls
#     HARBOR_CANARY_IMAGE (presumably pinned to HARBOR_CANARY_NODE — the
#     manifest is lost, confirm), dumping describe/logs to stderr on failure
#     and cleaning the pod up afterwards.
#   - run_helper_pod: runs a base64-encoded script in a helper pod named
#     hecate-<purpose>-<HHMMSS> on a given node, waits for phase Succeeded
#     within a timeout, then logs and deletes the pod.
check_gitea_stack() { wait_for_rollout gitea deployment gitea 10m } check_harbor_stack() { wait_for_rollout harbor statefulset harbor-redis 10m wait_for_rollout harbor deployment harbor-core 10m wait_for_rollout harbor deployment harbor-jobservice 10m wait_for_rollout harbor deployment harbor-portal 10m wait_for_rollout harbor deployment harbor-registry 10m } check_harbor_endpoint() { if [[ "${EXECUTE}" -eq 0 ]]; then log "DRY-RUN: curl -ksS -o /dev/null -w '%{http_code}' https://registry.bstein.dev/v2/" return 0 fi local code code="$(curl -ksS -o /dev/null -w '%{http_code}' https://registry.bstein.dev/v2/ || true)" case "${code}" in 200|401) log "harbor-endpoint=http-${code}" ;; *) die "Harbor endpoint check failed with HTTP ${code:-unknown}" ;; esac } wait_for_pod_phase() { local namespace="$1" local pod="$2" local expected_phase="$3" local timeout_seconds="$4" local start now phase start="$(date +%s)" while true; do phase="$(kubectl -n "${namespace}" get pod "${pod}" -o jsonpath='{.status.phase}' 2>/dev/null || true)" if [[ "${phase}" == "${expected_phase}" ]]; then return 0 fi if [[ "${phase}" == "Failed" ]]; then return 1 fi now="$(date +%s)" if (( now - start >= timeout_seconds )); then return 1 fi sleep 2 done } harbor_is_ready() { kubectl -n harbor get deploy harbor-core harbor-jobservice harbor-portal harbor-registry >/dev/null 2>&1 || return 1 local code code="$(curl -ksS -o /dev/null -w '%{http_code}' https://registry.bstein.dev/v2/ || true)" [[ "${code}" == "200" || "${code}" == "401" ]] } run_harbor_pull_canary() { local pod="hecate-harbor-canary" if [[ "${EXECUTE}" -eq 0 ]]; then log "DRY-RUN: create Harbor pull canary pod with ${HARBOR_CANARY_IMAGE} on ${HARBOR_CANARY_NODE}" return 0 fi timeout 20 kubectl -n "${NODE_HELPER_NAMESPACE}" delete pod "${pod}" --ignore-not-found --wait=false >/dev/null 2>&1 || true cat <&2 || true timeout 20 kubectl -n "${NODE_HELPER_NAMESPACE}" logs "${pod}" >&2 || true timeout 20 kubectl -n "${NODE_HELPER_NAMESPACE}" 
delete pod "${pod}" --ignore-not-found --wait=false >/dev/null 2>&1 || true return 1 fi timeout 20 kubectl -n "${NODE_HELPER_NAMESPACE}" logs "${pod}" || true timeout 20 kubectl -n "${NODE_HELPER_NAMESPACE}" delete pod "${pod}" --ignore-not-found --wait=false >/dev/null 2>&1 || true } run_helper_pod() { local node="$1" local purpose="$2" local timeout_seconds="$3" local script_content="$4" local pod="hecate-$(sanitize_name "${purpose}")-$(date +%H%M%S)" local encoded_script encoded_script="$(printf '%s' "${script_content}" | base64 -w0)" if [[ "${EXECUTE}" -eq 0 ]]; then log "DRY-RUN: helper pod ${pod} on ${node} for ${purpose}" return 0 fi cat </tmp/hecate-step.sh chmod +x /tmp/hecate-step.sh /tmp/hecate-step.sh POD if ! wait_for_pod_phase "${NODE_HELPER_NAMESPACE}" "${pod}" Succeeded "${timeout_seconds}"; then kubectl -n "${NODE_HELPER_NAMESPACE}" describe pod "${pod}" >&2 || true timeout 20 kubectl -n "${NODE_HELPER_NAMESPACE}" logs "${pod}" >&2 || true timeout 20 kubectl -n "${NODE_HELPER_NAMESPACE}" delete pod "${pod}" --ignore-not-found --wait=false >/dev/null 2>&1 || true return 1 fi timeout 20 kubectl -n "${NODE_HELPER_NAMESPACE}" logs "${pod}" || true timeout 20 kubectl -n "${NODE_HELPER_NAMESPACE}" delete pod "${pod}" --ignore-not-found --wait=false >/dev/null 2>&1 || true } run_host_command_via_helper() { local node="$1" local purpose="$2" local timeout_seconds="$3" local host_command="$4" local encoded_command encoded_command="$(printf '%s' "${host_command}" | base64 -w0)" local script_content script_content=$(cat <