#!/usr/bin/env bash
#
# cluster_power_recovery.sh — orchestrate a controlled cluster shutdown before a
# power outage and a staged cold-start recovery afterwards.
set -euo pipefail

# Print CLI help covering both modes and all options.
usage() {
  cat <<'USAGE'
Usage:
  scripts/cluster_power_recovery.sh shutdown [options]
  scripts/cluster_power_recovery.sh startup [options]

Options:
  --execute                  Actually run commands (default is dry-run)
  --ssh-user                 SSH user for node commands (default: current SSH config user)
  --control-planes           Control plane hosts (default: titan-0a,titan-0b,titan-0c)
  --workers                  Worker hosts (default: static atlas inventory, with API discovery when available)
  --expected-flux-branch     Expected Flux source branch during startup checks (default: main)
  --skip-etcd-snapshot       Skip etcd snapshot before shutdown
  --skip-drain               Skip worker drain during shutdown
  --skip-local-bootstrap     Startup: skip local bootstrap fallback applies
  --skip-harbor-bootstrap    Startup: skip Harbor recovery bootstrap stage
  --force-flux-branch        Startup: patch flux-system GitRepository branch to this value
  --min-startup-battery      Minimum UPS percent required before bootstrap (default: 35)
  --ups-host                 UPS identifier for upsc (default: ups@localhost)
  --ups-battery-key          UPS battery key for upsc (default: battery.charge)
  --recovery-state-file      Recovery state file for second-outage detection
  --drain-timeout            Worker drain timeout for normal shutdown (default: 180)
  --emergency-drain-timeout  Worker drain timeout for emergency fallback (default: 45)
  --require-ups-battery      Hard-fail startup if UPS battery cannot be read
  -h, --help                 Show help

Examples:
  scripts/cluster_power_recovery.sh shutdown --execute
  scripts/cluster_power_recovery.sh startup --execute --force-flux-branch main
USAGE
}

# First positional argument selects the mode; everything after it is options.
MODE="${1:-}"
if [[ -z "${MODE}" || "${MODE}" == "-h" || "${MODE}" == "--help" ]]; then
  usage
  exit 0
fi
shift || true

if [[ "${MODE}" != "shutdown" && "${MODE}" != "startup" ]]; then
  echo "Unknown mode: ${MODE}" >&2
  usage
  exit 1
fi

# Defaults overridable via CLI flags (continued below).
EXECUTE=0
SSH_USER=""
CONTROL_PLANES="titan-0a,titan-0b,titan-0c"
WORKERS=""
# Static fallback inventory, used only when API-based worker discovery fails.
DEFAULT_WORKERS="titan-04,titan-05,titan-06,titan-07,titan-08,titan-09,titan-10,titan-11,titan-12,titan-13,titan-14,titan-15,titan-17,titan-18,titan-19,titan-20,titan-21,titan-22,titan-24"
EXPECTED_FLUX_BRANCH="main"
SKIP_ETCD_SNAPSHOT=0
SKIP_DRAIN=0
SKIP_LOCAL_BOOTSTRAP=0
SKIP_HARBOR_BOOTSTRAP=0
FORCE_FLUX_BRANCH=""
UPS_HOST="ups@localhost"
UPS_BATTERY_KEY="battery.charge"
RECOVERY_STATE_FILE="${HOME}/.local/state/cluster_power_recovery.state"
MIN_STARTUP_BATTERY=35
DRAIN_TIMEOUT_SECONDS=180
EMERGENCY_DRAIN_TIMEOUT_SECONDS=45
REQUIRE_UPS_BATTERY=0
# In-memory copy of the persisted recovery state (see load/save below).
RECOVERY_PENDING=0
STARTUP_ATTEMPTED_DURING_OUTAGE=0

while [[ $# -gt 0 ]]; do
  case "$1" in
    --execute)
      EXECUTE=1
      shift
      ;;
    --ssh-user)
      SSH_USER="${2:-}"
      shift 2
      ;;
    --control-planes)
      CONTROL_PLANES="${2:-}"
      shift 2
      ;;
    --workers)
      WORKERS="${2:-}"
      shift 2
      ;;
    --expected-flux-branch)
      EXPECTED_FLUX_BRANCH="${2:-}"
      shift 2
      ;;
    --skip-etcd-snapshot)
      SKIP_ETCD_SNAPSHOT=1
      shift
      ;;
    --skip-drain)
      SKIP_DRAIN=1
      shift
      ;;
    --skip-local-bootstrap)
      SKIP_LOCAL_BOOTSTRAP=1
      shift
      ;;
    --skip-harbor-bootstrap)
      SKIP_HARBOR_BOOTSTRAP=1
      shift
      ;;
    --force-flux-branch)
      FORCE_FLUX_BRANCH="${2:-}"
      shift 2
      ;;
    --ups-host)
      UPS_HOST="${2:-}"
      shift 2
      ;;
    --ups-battery-key)
      UPS_BATTERY_KEY="${2:-}"
      shift 2
      ;;
    --min-startup-battery)
      MIN_STARTUP_BATTERY="${2:-}"
      shift 2
      ;;
    --recovery-state-file)
      RECOVERY_STATE_FILE="${2:-}"
      shift 2
      ;;
    --drain-timeout)
      DRAIN_TIMEOUT_SECONDS="${2:-}"
      shift 2
      ;;
    --emergency-drain-timeout)
      EMERGENCY_DRAIN_TIMEOUT_SECONDS="${2:-}"
      shift 2
      ;;
    --require-ups-battery)
      REQUIRE_UPS_BATTERY=1
      shift
      ;;
    -h|--help)
      usage
      exit 0
      ;;
    *)
      echo "Unknown option: $1" >&2
      usage
      exit 1
      ;;
  esac
done

# Exit with an error if a required executable is not on PATH.
require_cmd() {
  local cmd="$1"
  if ! command -v "${cmd}" >/dev/null 2>&1; then
    echo "Missing required command: ${cmd}" >&2
    exit 1
  fi
}

require_cmd kubectl
require_cmd ssh

log() { echo "[cluster-power] $*"; }
warn() { echo "[cluster-power][warn] $*" >&2; }

# Execute argv directly when --execute is set; otherwise only log it.
run() {
  if [[ "${EXECUTE}" -eq 1 ]]; then
    log "EXEC: $*"
    "$@"
  else
    log "DRY-RUN: $*"
  fi
}

# Like run(), but evaluates its argument through a login shell so that
# pipelines and trailing "|| true" in the command string are honored.
run_shell() {
  if [[ "${EXECUTE}" -eq 1 ]]; then
    log "EXEC: $*"
    bash -lc "$*"
  else
    log "DRY-RUN: $*"
  fi
}

# Split a comma-separated string into the array named by $2.
as_array_from_csv() {
  local csv="$1"
  local out_var="$2"
  local old_ifs="${IFS}"
  IFS=',' read -r -a _tmp <<< "${csv}"
  IFS="${old_ifs}"
  # eval is confined to the caller-supplied variable NAME; values are expanded safely.
  eval "${out_var}"'=( "${_tmp[@]}" )'
}

# Prefix a node name with the SSH user when one was given.
ssh_target() {
  local node="$1"
  if [[ -n "${SSH_USER}" ]]; then
    printf "%s@%s" "${SSH_USER}" "${node}"
  else
    printf "%s" "${node}"
  fi
}

# Emit a CSV of all nodes that carry neither control-plane nor master role labels.
discover_workers_csv() {
  # Include every non-control-plane node by default (workers + accelerators).
  # NOTE: kubectl custom-columns renders absent labels as "<none>" (not an empty
  # string), so match both forms to keep discovery working across versions.
  kubectl get nodes \
    -o custom-columns=NAME:.metadata.name,CP:.metadata.labels.node-role\\.kubernetes\\.io/control-plane,MASTER:.metadata.labels.node-role\\.kubernetes\\.io/master \
    --no-headers \
    | awk '($2=="" || $2=="<none>") && ($3=="" || $3=="<none>") {print $1}' \
    | paste -sd, -
}

# Read key=value pairs from RECOVERY_STATE_FILE into the globals; absent file
# means no outage recovery is pending.
load_recovery_state() {
  if [[ ! -f "${RECOVERY_STATE_FILE}" ]]; then
    RECOVERY_PENDING=0
    STARTUP_ATTEMPTED_DURING_OUTAGE=0
    return 0
  fi
  while IFS='=' read -r key value; do
    case "${key}" in
      recovery_pending) RECOVERY_PENDING="${value}" ;;
      startup_attempted) STARTUP_ATTEMPTED_DURING_OUTAGE="${value}" ;;
    esac
  done < "${RECOVERY_STATE_FILE}"
}

# Persist recovery state: $1 = recovery_pending flag, $2 = startup_attempted flag.
# Keys mirror what load_recovery_state() parses.
save_recovery_state() {
  mkdir -p "$(dirname "${RECOVERY_STATE_FILE}")"
  cat > "${RECOVERY_STATE_FILE}" <<EOF
recovery_pending=$1
startup_attempted=$2
EOF
}

# Remove the persisted state after a fully successful startup.
clear_recovery_state() {
  rm -f -- "${RECOVERY_STATE_FILE}"
}

# Print the UPS battery charge as a bare integer percentage, or return 1 when
# upsc is unavailable or yields no usable reading.
read_ups_battery() {
  if ! command -v upsc >/dev/null 2>&1; then
    return 1
  fi
  local raw
  raw="$(upsc "${UPS_HOST}" "${UPS_BATTERY_KEY}" 2>/dev/null || true)"
  if [[ -z "${raw}" ]]; then
    return 1
  fi
  # battery.charge can include units/decimals in some setups; normalize.
  raw="${raw%%.*}"
  if ! [[ "${raw}" =~ ^[0-9]+$ ]]; then
    return 1
  fi
  echo "${raw}"
}

# Gate bootstrap on UPS battery charge. Unreadable battery is fatal only when
# --require-ups-battery was given; otherwise it is a soft warning.
ensure_minimum_battery_for_bootstrap() {
  local battery
  battery="$(read_ups_battery || true)"
  if [[ -z "${battery}" ]]; then
    if [[ "${REQUIRE_UPS_BATTERY}" -eq 1 ]]; then
      warn "Unable to read UPS battery status and --require-ups-battery is set."
      return 1
    fi
    warn "Unable to read UPS battery status; continuing without hard battery gating."
    return 0
  fi
  log "ups-battery=${battery}%"
  if (( battery < MIN_STARTUP_BATTERY )); then
    warn "UPS battery ${battery}% below minimum startup threshold ${MIN_STARTUP_BATTERY}%."
    return 1
  fi
  return 0
}

# Second-outage fallback: power everything back down quickly when a startup was
# already attempted but the battery budget is insufficient.
emergency_shutdown_after_outage() {
  warn "Entering outage-aware emergency shutdown path due insufficient startup budget."
  patch_flux_suspend_all true || true
  best_effort_scale_down_apps
  # Give the cluster one short chance to drain, then force progress.
  best_effort_drain_workers "${EMERGENCY_DRAIN_TIMEOUT_SECONDS}" "${WORKER_NODES[@]}"
  stop_workers_agents "${WORKER_NODES[@]}"
  stop_control_planes "${CONTROL_PLANE_NODES[@]}"
}

# Set spec.suspend on every Flux Kustomization and HelmRelease.
# $1 is "true" (suspend) or "false" (resume).
patch_flux_suspend_all() {
  local value="$1"
  local patch
  patch=$(printf '{"spec":{"suspend":%s}}' "${value}")
  local ks_list hr_list
  ks_list="$(kubectl -n flux-system get kustomizations.kustomize.toolkit.fluxcd.io -o jsonpath='{range .items[*]}{.metadata.name}{"\n"}{end}' || true)"
  hr_list="$(kubectl get helmreleases.helm.toolkit.fluxcd.io -A -o jsonpath='{range .items[*]}{.metadata.namespace}{"/"}{.metadata.name}{"\n"}{end}' || true)"
  while IFS= read -r k; do
    [[ -z "${k}" ]] && continue
    run kubectl -n flux-system patch kustomization "${k}" --type=merge -p "${patch}"
  done <<< "${ks_list}"
  while IFS= read -r hr; do
    [[ -z "${hr}" ]] && continue
    local ns="${hr%%/*}"
    local name="${hr##*/}"
    run kubectl -n "${ns}" patch helmrelease "${name}" --type=merge -p "${patch}"
  done <<< "${hr_list}"
}

# Log the Flux GitRepository URL/branch; during startup, warn if the branch
# deviates from the expected one (unless the user is forcing a branch anyway).
report_flux_source_state() {
  local flux_url flux_branch
  flux_url="$(kubectl -n flux-system get gitrepository flux-system -o jsonpath='{.spec.url}' 2>/dev/null || true)"
  flux_branch="$(kubectl -n flux-system get gitrepository flux-system -o jsonpath='{.spec.ref.branch}' 2>/dev/null || true)"
  if [[ -n "${flux_url}" ]]; then
    log "flux-source-url=${flux_url}"
  fi
  if [[ -n "${flux_branch}" ]]; then
    log "flux-source-branch=${flux_branch}"
    if [[ "${MODE}" == "startup" && -z "${FORCE_FLUX_BRANCH}" && "${flux_branch}" != "${EXPECTED_FLUX_BRANCH}" ]]; then
      warn "Flux source branch is '${flux_branch}'. Expected '${EXPECTED_FLUX_BRANCH}' for canonical cold-start recovery. Use --force-flux-branch ${EXPECTED_FLUX_BRANCH} if needed."
    fi
  fi
}

# Poll the Kubernetes API until reachable. $1 = attempts (default 90),
# $2 = seconds between attempts (default 2). Skipped in dry-run mode.
wait_for_api() {
  local attempts="${1:-90}"
  local sleep_s="${2:-2}"
  if [[ "${EXECUTE}" -eq 0 ]]; then
    log "DRY-RUN: skipping live Kubernetes API wait"
    return 0
  fi
  local i
  for i in $(seq 1 "${attempts}"); do
    if kubectl version --request-timeout=5s >/dev/null 2>&1; then
      return 0
    fi
    sleep "${sleep_s}"
  done
  return 1
}

# Scale app workloads to zero in all namespaces except infrastructure ones.
best_effort_scale_down_apps() {
  local excludes='^(kube-system|kube-public|kube-node-lease|flux-system|traefik|metallb-system|cert-manager|longhorn-system|vault|postgres|maintenance)$'
  local ns_list
  ns_list="$(kubectl get ns -o jsonpath='{range .items[*]}{.metadata.name}{"\n"}{end}')"
  while IFS= read -r ns; do
    [[ -z "${ns}" ]] && continue
    if [[ "${ns}" =~ ${excludes} ]]; then
      continue
    fi
    run_shell "kubectl -n ${ns} scale deployment --all --replicas=0 || true"
    run_shell "kubectl -n ${ns} scale statefulset --all --replicas=0 || true"
  done <<< "${ns_list}"
}

# Cordon and drain each worker with escalating force:
# gentle -> --force -> --force --disable-eviction (best effort).
# $1 = per-attempt timeout in seconds; remaining args = worker node names.
best_effort_drain_workers() {
  local timeout_seconds="$1"
  shift || true
  local workers=("$@")
  local node
  for node in "${workers[@]}"; do
    [[ -z "${node}" ]] && continue
    run kubectl cordon "${node}"
    if run_shell "kubectl drain ${node} --ignore-daemonsets --delete-emptydir-data --grace-period=30 --timeout=${timeout_seconds}s"; then
      continue
    fi
    warn "Gentle drain timed out for ${node}; retrying with --force."
    if run_shell "kubectl drain ${node} --ignore-daemonsets --delete-emptydir-data --grace-period=30 --timeout=${timeout_seconds}s --force"; then
      continue
    fi
    warn "Force drain timed out for ${node}; final attempt with --disable-eviction."
    run_shell "kubectl drain ${node} --ignore-daemonsets --delete-emptydir-data --grace-period=30 --timeout=${timeout_seconds}s --force --disable-eviction || true"
  done
}

# Stop the k3s agent on each worker over SSH (best effort per node).
stop_workers_agents() {
  local workers=("$@")
  local node target
  for node in "${workers[@]}"; do
    [[ -z "${node}" ]] && continue
    target="$(ssh_target "${node}")"
    run ssh -o BatchMode=yes -o ConnectTimeout=8 "${target}" "sudo systemctl stop k3s-agent || true"
  done
}

# Start the k3s agent on each worker over SSH (best effort per node).
start_workers_agents() {
  local workers=("$@")
  local node target
  for node in "${workers[@]}"; do
    [[ -z "${node}" ]] && continue
    target="$(ssh_target "${node}")"
    run ssh -o BatchMode=yes -o ConnectTimeout=8 "${target}" "sudo systemctl start k3s-agent || true"
  done
}

# Stop the k3s server on each control plane over SSH (best effort per node).
stop_control_planes() {
  local cps=("$@")
  local node target
  for node in "${cps[@]}"; do
    [[ -z "${node}" ]] && continue
    target="$(ssh_target "${node}")"
    run ssh -o BatchMode=yes -o ConnectTimeout=8 "${target}" "sudo systemctl stop k3s || true"
  done
}

# Start the k3s server on each control plane over SSH (best effort per node).
start_control_planes() {
  local cps=("$@")
  local node target
  for node in "${cps[@]}"; do
    [[ -z "${node}" ]] && continue
    target="$(ssh_target "${node}")"
    run ssh -o BatchMode=yes -o ConnectTimeout=8 "${target}" "sudo systemctl start k3s || true"
  done
}

# Take a timestamped etcd snapshot on control plane $1 before shutdown.
take_etcd_snapshot() {
  local cp="$1"
  local target
  target="$(ssh_target "${cp}")"
  local ts
  ts="$(date +%Y%m%d-%H%M%S)"
  run ssh -o BatchMode=yes -o ConnectTimeout=8 "${target}" \
    "sudo k3s etcd-snapshot save --name pre-shutdown-${ts}"
}

bootstrap_local_minimal() {
  # Local apply path to break Flux<->Gitea boot deadlock during cold-start recovery.
  # Longhorn is applied before stateful workloads so astreae-backed PVCs can bind.
  run kubectl apply -k infrastructure/core
  run kubectl apply -k infrastructure/sources/helm
  run kubectl apply -k infrastructure/longhorn/core
  run kubectl apply -k infrastructure/metallb
  run kubectl apply -k infrastructure/traefik
  run kubectl apply -k infrastructure/vault-csi
  run kubectl apply -k infrastructure/vault-injector
  run kubectl apply -k services/vault
  run kubectl apply -k infrastructure/postgres
  run kubectl apply -k services/gitea
}

bootstrap_local_harbor() {
  # Optional Harbor bootstrap stage for environments where Harbor is authoritative for images.
  run kubectl apply -k services/harbor
}

# Unsuspend all Flux objects and reconcile them in dependency order; falls back
# to annotation-triggered reconciliation when the flux CLI is unavailable.
resume_flux_and_reconcile() {
  patch_flux_suspend_all false
  if command -v flux >/dev/null 2>&1; then
    run flux reconcile source git flux-system -n flux-system --timeout=3m
    run flux reconcile kustomization core -n flux-system --with-source --timeout=5m
    run flux reconcile kustomization helm -n flux-system --with-source --timeout=5m
    run flux reconcile kustomization longhorn -n flux-system --with-source --timeout=15m
    run flux reconcile kustomization metallb -n flux-system --with-source --timeout=5m
    run flux reconcile kustomization traefik -n flux-system --with-source --timeout=5m
    run flux reconcile kustomization vault-csi -n flux-system --with-source --timeout=5m
    run flux reconcile kustomization vault-injector -n flux-system --with-source --timeout=5m
    run flux reconcile kustomization vault -n flux-system --with-source --timeout=10m
    run flux reconcile kustomization postgres -n flux-system --with-source --timeout=10m
    run flux reconcile kustomization gitea -n flux-system --with-source --timeout=10m
    run flux reconcile kustomization harbor -n flux-system --with-source --timeout=15m
  else
    local now
    now="$(date --iso-8601=seconds)"
    run kubectl -n flux-system annotate kustomizations.kustomize.toolkit.fluxcd.io --all reconcile.fluxcd.io/requestedAt="${now}" --overwrite
  fi
}

# --- Inventory resolution and common preamble ---------------------------------
as_array_from_csv "${CONTROL_PLANES}" CONTROL_PLANE_NODES
if [[ -z "${WORKERS}" ]]; then
  WORKERS="$(discover_workers_csv 2>/dev/null || true)"
  if [[ -z "${WORKERS}" ]]; then
    warn "Unable to auto-discover workers from the API; falling back to static atlas worker inventory."
    WORKERS="${DEFAULT_WORKERS}"
  fi
fi
as_array_from_csv "${WORKERS}" WORKER_NODES
load_recovery_state

log "mode=${MODE} execute=${EXECUTE}"
log "control-planes=${CONTROL_PLANES}"
log "workers=${WORKERS}"
log "recovery-state-file=${RECOVERY_STATE_FILE}"
log "recovery_pending=${RECOVERY_PENDING} startup_attempted=${STARTUP_ATTEMPTED_DURING_OUTAGE}"
report_flux_source_state

# --- Shutdown flow ------------------------------------------------------------
if [[ "${MODE}" == "shutdown" ]]; then
  # Mark an outage as pending BEFORE doing anything, so a later startup knows it
  # is recovering from this shutdown even if we crash partway through.
  save_recovery_state 1 0
  if [[ "${SKIP_ETCD_SNAPSHOT}" -eq 0 ]]; then
    take_etcd_snapshot "${CONTROL_PLANE_NODES[0]}"
  else
    warn "Skipping etcd snapshot by request."
  fi
  patch_flux_suspend_all true
  best_effort_scale_down_apps
  if [[ "${SKIP_DRAIN}" -eq 0 ]]; then
    best_effort_drain_workers "${DRAIN_TIMEOUT_SECONDS}" "${WORKER_NODES[@]}"
  else
    warn "Skipping worker drain by request."
  fi
  stop_workers_agents "${WORKER_NODES[@]}"
  stop_control_planes "${CONTROL_PLANE_NODES[@]}"
  log "Shutdown flow complete."
  exit 0
fi

# --- Startup flow -------------------------------------------------------------
if [[ "${RECOVERY_PENDING}" -eq 1 ]]; then
  if ! ensure_minimum_battery_for_bootstrap; then
    # A prior startup attempt during this outage + insufficient battery now
    # means the power situation regressed: shut back down defensively.
    if [[ "${STARTUP_ATTEMPTED_DURING_OUTAGE}" -eq 1 ]]; then
      emergency_shutdown_after_outage
      exit 1
    fi
    warn "Startup deferred due low battery after recent outage; marking for second-outage fallback."
    save_recovery_state 1 1
    exit 1
  fi
  save_recovery_state 1 1
fi

start_control_planes "${CONTROL_PLANE_NODES[@]}"
start_workers_agents "${WORKER_NODES[@]}"
if ! wait_for_api 120 2; then
  warn "Kubernetes API did not become reachable in time."
  exit 1
fi

if [[ -n "${FORCE_FLUX_BRANCH}" ]]; then
  run kubectl -n flux-system patch gitrepository flux-system --type=merge \
    -p "{\"spec\":{\"ref\":{\"branch\":\"${FORCE_FLUX_BRANCH}\"}}}"
fi

if [[ "${SKIP_LOCAL_BOOTSTRAP}" -eq 0 ]]; then
  # If source is not ready, bootstrap critical pieces from local checkout first.
  if ! kubectl -n flux-system get gitrepository flux-system -o jsonpath='{.status.conditions[?(@.type=="Ready")].status}' 2>/dev/null | grep -q True; then
    warn "Flux source not Ready; executing local bootstrap fallback path."
    bootstrap_local_minimal
    if [[ "${SKIP_HARBOR_BOOTSTRAP}" -eq 0 ]]; then
      bootstrap_local_harbor
    else
      warn "Skipping Harbor bootstrap fallback by request."
    fi
  fi
else
  warn "Skipping local bootstrap fallback by request."
fi

resume_flux_and_reconcile
clear_recovery_state
log "Startup flow complete."