From 31f5709929e971a242302d1f0ca7d6445f8c600e Mon Sep 17 00:00:00 2001 From: Brad Stein Date: Mon, 6 Apr 2026 04:21:04 -0300 Subject: [PATCH] hecate: add cluster power recovery tooling --- dockerfiles/Dockerfile.hecate-node-helper | 12 + knowledge/runbooks/cluster-power-recovery.md | 107 ++ scripts/bootstrap/harbor-bootstrap-images.txt | 9 + scripts/bootstrap/recovery-config.env | 12 + scripts/build_harbor_bootstrap_bundle.sh | 58 ++ scripts/build_hecate_node_helper.sh | 56 + scripts/cluster_power_console.sh | 78 ++ scripts/cluster_power_recovery.sh | 981 ++++++++++++++++++ 8 files changed, 1313 insertions(+) create mode 100644 dockerfiles/Dockerfile.hecate-node-helper create mode 100644 knowledge/runbooks/cluster-power-recovery.md create mode 100644 scripts/bootstrap/harbor-bootstrap-images.txt create mode 100644 scripts/bootstrap/recovery-config.env create mode 100755 scripts/build_harbor_bootstrap_bundle.sh create mode 100755 scripts/build_hecate_node_helper.sh create mode 100755 scripts/cluster_power_console.sh create mode 100755 scripts/cluster_power_recovery.sh diff --git a/dockerfiles/Dockerfile.hecate-node-helper b/dockerfiles/Dockerfile.hecate-node-helper new file mode 100644 index 00000000..16e2d517 --- /dev/null +++ b/dockerfiles/Dockerfile.hecate-node-helper @@ -0,0 +1,12 @@ +FROM debian:bookworm-slim + +RUN apt-get update \ + && apt-get install -y --no-install-recommends \ + bash \ + ca-certificates \ + curl \ + util-linux \ + zstd \ + && rm -rf /var/lib/apt/lists/* + +CMD ["/bin/sh"] diff --git a/knowledge/runbooks/cluster-power-recovery.md b/knowledge/runbooks/cluster-power-recovery.md new file mode 100644 index 00000000..ddcf3888 --- /dev/null +++ b/knowledge/runbooks/cluster-power-recovery.md @@ -0,0 +1,107 @@ +Atlas Cluster Power Recovery (Graceful Shutdown/Startup) + +Purpose +- Provide a safe operator flow for planned power events and cold-boot recovery. +- Avoid the Flux/Gitea bootstrap deadlock by using a local bootstrap fallback path. +- Break the Harbor self-hosting deadlock by seeding Harbor runtime images from a control-host bundle. +- Refuse bootstrap when UPS charge is too low, and fall back to fast shutdown if a second outage hits mid-recovery. + +Bootstrapping risk to remember +- Flux source is Git over SSH to `scm.bstein.dev` (Gitea). +- Gitea itself is a Flux-managed workload and depends on storage + database. +- Harbor is also critical, but it is not part of the first recovery stage because Harbor serves its own runtime images. +- On cold boot, if Flux cannot fetch source before Gitea is up, reconciliation can stall. +- Recovery path: bring control plane and workers up, then locally apply minimal platform stack (`core -> helm -> longhorn -> metallb -> traefik -> vault-csi -> vault-injector -> vault -> postgres -> gitea`), then seed Harbor images onto the Harbor node from a control-host bundle, then resume/reconcile Flux. Harbor is a later recovery stage after storage, Vault, Postgres, and Gitea are back. + +Script +- `scripts/cluster_power_recovery.sh` +- `scripts/cluster_power_console.sh` +- Modes: + - `prepare` + - `shutdown` + - `harbor-seed` + - `startup` + - `status` +- Default is dry-run. Add `--execute` to actually perform actions. 
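+- Dry-run logs every action it would take with a `DRY-RUN:` prefix instead of executing it, so previews are safe against a live cluster. Illustrative output (the node name here is just an example):
+  - `[cluster-power] DRY-RUN: kubectl cordon titan-07`
+  - `[cluster-power] DRY-RUN: kubectl drain titan-07 --ignore-daemonsets --delete-emptydir-data --grace-period=30 --timeout=180s`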
+ +Dry-run examples +- Shutdown preview: + - `scripts/cluster_power_recovery.sh shutdown --skip-etcd-snapshot --skip-drain` +- Startup preview: + - `scripts/cluster_power_recovery.sh startup` +- Harbor seed preview: + - `scripts/cluster_power_recovery.sh harbor-seed` + +Execute examples +- Prepare helper image on every node: + - `scripts/cluster_power_recovery.sh prepare --execute` +- Seed Harbor runtime images onto `titan-05` from the control-host bundle: + - `scripts/cluster_power_recovery.sh harbor-seed --execute` +- Planned shutdown: + - `scripts/cluster_power_recovery.sh shutdown --execute` +- Planned startup (canonical branch): + - `scripts/cluster_power_recovery.sh startup --execute --force-flux-branch main` + +Manual remote console examples +- Canonical operator hosts: + - `titan-db` + - `titan-24` +- Both hosts now have: + - `~/hecate-tools/cluster_power_recovery.sh` + - `~/hecate-tools/cluster_power_console.sh` + - `~/hecate-tools/bootstrap/recovery-config.env` + - `~/hecate-tools/bootstrap/harbor-bootstrap-images.txt` + - `~/hecate-tools/kubeconfig` + - `~/hecate-cluster-power` + - `~/bin/hecate-cluster-power` + - `~/hecate-repo/{infrastructure,services,scripts}` +- Both hosts also keep the Harbor bootstrap bundle at: + - `~/.local/share/hecate/bundles/harbor-bootstrap-v2.14.1-arm64.tar.zst` +- Remote usage: + - `ssh titan-db` + - `~/hecate-cluster-power status` + - `~/hecate-cluster-power prepare --execute` + - `~/hecate-cluster-power shutdown --execute` + - `~/hecate-cluster-power startup --execute --force-flux-branch main` + - `ssh titan-24` + - `~/hecate-cluster-power status` + - `~/hecate-cluster-power prepare --execute` + - `~/hecate-cluster-power shutdown --execute` + - `~/hecate-cluster-power startup --execute --force-flux-branch main` + +Useful options +- `--expected-flux-branch main` +- `--force-flux-branch main` +- `--skip-local-bootstrap` (not recommended for cold-start recovery) +- `--skip-harbor-bootstrap` (skip the Harbor recovery stage if you know Harbor should stay deferred) +- `--skip-harbor-seed` (skip bundle import if Harbor images are already cached on the target node) +- `--skip-helper-prewarm` +- `--min-startup-battery 35` +- `--ups-host ups@localhost` +- `--require-ups-battery` +- `--drain-timeout 180` +- `--emergency-drain-timeout 45` +- `--recovery-state-file ~/.local/share/hecate/cluster_power_recovery.state` +- `--harbor-bundle-file ~/.local/share/hecate/bundles/harbor-bootstrap-v2.14.1-arm64.tar.zst` + +Operational notes +- The flow suspends Flux Kustomizations/HelmReleases during shutdown to prevent churn. +- Worker drain is no longer best-effort only. The script now escalates from normal drain, to `--force`, to `--disable-eviction` once the configured timeout is exhausted. +- During startup, if Flux source is not `Ready`, local bootstrap fallback is applied first using the repo snapshot under `~/hecate-repo`. +- Longhorn is reconciled before Vault/Postgres/Gitea so storage-backed services are not racing the volume layer. +- Harbor is reconciled after the first critical stateful services. +- Harbor bootstrap is now designed around a control-host bundle: + - Build the Harbor bundle locally with `scripts/build_harbor_bootstrap_bundle.sh`. + - Stage it on the operator host at `~/.local/share/hecate/bundles/harbor-bootstrap-v2.14.1-arm64.tar.zst`. + - Use `harbor-seed --execute` or a full `startup --execute` to stream/import that bundle onto `titan-05`. +- The Harbor bundle remains arm64-only because Harbor is pinned to arm64 nodes. 
+- The node-helper image is multi-arch because Hecate uses it across both arm64 and amd64 nodes during prepare/shutdown operations.
+- Hecate uses a temporary privileged helper pod for host-side operations. The helper image is prewarmed with `prepare --execute` so later shutdown/startup steps do not stall on image pulls.
+- The script persists outage state in `~/.local/share/hecate/cluster_power_recovery.state` by default. If startup is attempted during an outage window and power becomes unstable again, rerunning startup with insufficient UPS charge will flip into the emergency shutdown path instead of continuing to bootstrap.
+- In dry-run mode, the script now skips the live API wait step so preview runs do not stall on an offline cluster.
+- `harbor-seed --execute` was validated by:
+  - prewarming the helper image across all nodes
+  - streaming the Harbor bootstrap bundle to `titan-05`
+  - importing Harbor runtime images into host `containerd`
+  - successfully running a Harbor-backed canary pod (`harbor-canary-ok`)
+- After bootstrap, Flux resources are resumed and reconciled.
+- Keep this runbook aligned with `clusters/atlas/flux-system/gotk-sync.yaml`.
diff --git a/scripts/bootstrap/harbor-bootstrap-images.txt b/scripts/bootstrap/harbor-bootstrap-images.txt
new file mode 100644
index 00000000..fae7a6f6
--- /dev/null
+++ b/scripts/bootstrap/harbor-bootstrap-images.txt
@@ -0,0 +1,9 @@
+# Harbor cold-start bootstrap images.
+registry.bstein.dev/infra/harbor-core:v2.14.1-arm64
+registry.bstein.dev/infra/harbor-jobservice:v2.14.1-arm64
+registry.bstein.dev/infra/harbor-portal:v2.14.1-arm64
+registry.bstein.dev/infra/harbor-registry:v2.14.1-arm64
+registry.bstein.dev/infra/harbor-registryctl:v2.14.1-arm64
+registry.bstein.dev/infra/harbor-redis:v2.14.1-arm64
+registry.bstein.dev/infra/harbor-nginx:v2.14.1-arm64
+registry.bstein.dev/infra/harbor-prepare:v2.14.1-arm64
diff --git a/scripts/bootstrap/recovery-config.env b/scripts/bootstrap/recovery-config.env
new file mode 100644
index 00000000..2db3deaa
--- /dev/null
+++ b/scripts/bootstrap/recovery-config.env
@@ -0,0 +1,12 @@
+CANONICAL_CONTROL_HOST="titan-db"
+DEFAULT_FLUX_BRANCH="main"
+STATE_SUBDIR=".local/share/hecate"
+HARBOR_BUNDLE_BASENAME="harbor-bootstrap-v2.14.1-arm64.tar.zst"
+HARBOR_TARGET_NODE="titan-05"
+HARBOR_CANARY_NODE="titan-04"
+HARBOR_CANARY_IMAGE="registry.bstein.dev/bstein/kubectl:1.35.0"
+NODE_HELPER_IMAGE="registry.bstein.dev/bstein/hecate-node-helper:0.1.0"
+NODE_HELPER_NAMESPACE="maintenance"
+NODE_HELPER_SERVICE_ACCOUNT="default"
+REGISTRY_PULL_SECRET="harbor-regcred"
+BUNDLE_HTTP_PORT="8877"
diff --git a/scripts/build_harbor_bootstrap_bundle.sh b/scripts/build_harbor_bootstrap_bundle.sh
new file mode 100755
index 00000000..ff64e55b
--- /dev/null
+++ b/scripts/build_harbor_bootstrap_bundle.sh
@@ -0,0 +1,58 @@
+#!/usr/bin/env bash
+set -euo pipefail
+
+IMAGES_FILE="scripts/bootstrap/harbor-bootstrap-images.txt"
+BUNDLE_FILE="artifacts/harbor-bootstrap-v2.14.1-arm64.tar.zst"
+DOCKER_CONFIG_PATH=""
+PLATFORM="linux/arm64"
+
+while [[ $# -gt 0 ]]; do
+  case "$1" in
+    --images-file)
+      IMAGES_FILE="${2:?missing images file}"
+      shift 2
+      ;;
+    --bundle-file)
+      BUNDLE_FILE="${2:?missing bundle file}"
+      shift 2
+      ;;
+    --docker-config)
+      DOCKER_CONFIG_PATH="${2:?missing docker config path}"
+      shift 2
+      ;;
+    --platform)
+      PLATFORM="${2:?missing platform}"
+      shift 2
+      ;;
+    -h|--help)
+      cat <<USAGE
+Usage: scripts/build_harbor_bootstrap_bundle.sh [--images-file <path>] [--bundle-file <path>] [--docker-config <path>] [--platform <platform>]
+USAGE
+      exit 0
+      ;;
+    *)
+      echo "Unknown option: $1" >&2
+      exit 1
+      ;;
+  esac
+done
+
+if [[ -n "${DOCKER_CONFIG_PATH}" ]]; then
+  export DOCKER_CONFIG="${DOCKER_CONFIG_PATH}"
+fi
+
+mapfile -t IMAGES < <(grep -v '^[[:space:]]*#' "${IMAGES_FILE}" | sed '/^[[:space:]]*$/d')
+if [[ ${#IMAGES[@]} -eq 0 ]]; then
+  echo "No images found in ${IMAGES_FILE}" >&2
+  exit 1
+fi
+
+mkdir -p "$(dirname "${BUNDLE_FILE}")"
+for image in "${IMAGES[@]}"; do
+  echo "Pulling ${image}" >&2
+  docker pull --platform "${PLATFORM}" "${image}" >/dev/null
+done
+
+docker save "${IMAGES[@]}" | zstd -T0 -19 -o "${BUNDLE_FILE}"
+echo "Wrote ${BUNDLE_FILE}" >&2
diff --git a/scripts/build_hecate_node_helper.sh b/scripts/build_hecate_node_helper.sh
new file mode 100755
index 00000000..148f51c5
--- /dev/null
+++ b/scripts/build_hecate_node_helper.sh
@@ -0,0 +1,56 @@
+#!/usr/bin/env bash
+set -euo pipefail
+
+IMAGE="registry.bstein.dev/bstein/hecate-node-helper:0.1.0"
+DOCKER_CONFIG_PATH=""
+PLATFORMS="linux/amd64,linux/arm64"
+BUILDER_NAME="hecate-node-helper-builder"
+
+while [[ $# -gt 0 ]]; do
+  case "$1" in
+    --image)
+      IMAGE="${2:?missing image}"
+      shift 2
+      ;;
+    --docker-config)
+      DOCKER_CONFIG_PATH="${2:?missing docker config path}"
+      shift 2
+      ;;
+    --platforms)
+      PLATFORMS="${2:?missing platforms}"
+      shift 2
+      ;;
+    --builder)
+      BUILDER_NAME="${2:?missing builder}"
+      shift 2
+      ;;
+    -h|--help)
+      cat <<USAGE
+Usage: scripts/build_hecate_node_helper.sh [--image <image>] [--docker-config <path>] [--platforms <platforms>] [--builder <name>]
+USAGE
+      exit 0
+      ;;
+    *)
+      echo "Unknown option: $1" >&2
+      exit 1
+      ;;
+  esac
+done
+
+if [[ -n "${DOCKER_CONFIG_PATH}" ]]; then
+  export DOCKER_CONFIG="${DOCKER_CONFIG_PATH}"
+fi
+
+if ! docker buildx inspect "${BUILDER_NAME}" >/dev/null 2>&1; then
+  docker buildx create --name "${BUILDER_NAME}" --driver docker-container --use >/dev/null
+else
+  docker buildx use "${BUILDER_NAME}" >/dev/null
+fi
+
+docker buildx inspect --bootstrap >/dev/null
+docker buildx build \
+  --platform "${PLATFORMS}" \
+  -f dockerfiles/Dockerfile.hecate-node-helper \
+  -t "${IMAGE}" \
+  --push \
+  .
diff --git a/scripts/cluster_power_console.sh b/scripts/cluster_power_console.sh
new file mode 100755
index 00000000..6bc8cd7e
--- /dev/null
+++ b/scripts/cluster_power_console.sh
@@ -0,0 +1,78 @@
+#!/usr/bin/env bash
+set -euo pipefail
+
+usage() {
+  cat <<USAGE
+Usage: scripts/cluster_power_console.sh [--repo-dir <path>] [--delegate-host <host>] [--allow-local] [recovery-script-options...]
+
+Purpose:
+  Friendly manual entrypoint for running Hecate from a remote console.
+  The canonical control host is titan-db by default so bundle/state handling stays in one place.
+
+Defaults:
+  --repo-dir \$HOME/Development/titan-iac
+  --delegate-host titan-db
+
+Examples:
+  scripts/cluster_power_console.sh status
+  scripts/cluster_power_console.sh prepare --execute
+  scripts/cluster_power_console.sh shutdown --execute
+  scripts/cluster_power_console.sh startup --execute --force-flux-branch main
+USAGE
+}
+
+REPO_DIR="${HOME}/Development/titan-iac"
+DELEGATE_HOST="titan-db"
+ALLOW_LOCAL=0
+REMOTE_REPO_DIR="${HECATE_REMOTE_REPO_DIR:-}"
+
+while [[ $# -gt 0 ]]; do
+  case "$1" in
+    --repo-dir)
+      REPO_DIR="${2:-}"
+      shift 2
+      ;;
+    --delegate-host)
+      DELEGATE_HOST="${2:-}"
+      shift 2
+      ;;
+    --allow-local)
+      ALLOW_LOCAL=1
+      shift
+      ;;
+    -h|--help)
+      usage
+      exit 0
+      ;;
+    *)
+      break
+      ;;
+  esac
+done
+
+if [[ $# -lt 1 ]]; then
+  usage
+  exit 1
+fi
+
+LOCAL_SCRIPT="${REPO_DIR}/scripts/cluster_power_recovery.sh"
+CURRENT_HOST="$(hostname -s 2>/dev/null || hostname)"
+
+if [[ -x "${LOCAL_SCRIPT}" ]] && command -v kubectl >/dev/null 2>&1; then
+  if [[ "${ALLOW_LOCAL}" -eq 1 || "${CURRENT_HOST}" == "${DELEGATE_HOST}" ]]; then
+    exec "${LOCAL_SCRIPT}" "$@"
+  fi
+fi
+
+if [[ -z "${DELEGATE_HOST}" ]]; then
+  echo "cluster-power-console: no delegate host configured" >&2
+  exit 1
+fi
+
+quoted_args="$(printf '%q ' "$@")"
+remote_prefix=""
+if [[ -n "${REMOTE_REPO_DIR}" ]]; then
+  remote_prefix="HECATE_REPO_DIR=$(printf '%q' "${REMOTE_REPO_DIR}") "
+fi
+exec ssh -o BatchMode=yes -o ConnectTimeout=8 "${DELEGATE_HOST}" "${remote_prefix}~/hecate-tools/cluster_power_recovery.sh ${quoted_args}"
diff --git a/scripts/cluster_power_recovery.sh b/scripts/cluster_power_recovery.sh
new file mode 100755
index 00000000..5764accd
--- /dev/null
+++ b/scripts/cluster_power_recovery.sh
@@ -0,0 +1,981 @@
+#!/usr/bin/env bash
+set -euo pipefail
+
+SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+REPO_DIR="${HECATE_REPO_DIR:-$(cd "${SCRIPT_DIR}/.." && pwd)}"
+BOOTSTRAP_DIR="${SCRIPT_DIR}/bootstrap"
+CONFIG_FILE="${BOOTSTRAP_DIR}/recovery-config.env"
+if [[ -f "${CONFIG_FILE}" ]]; then
+  # shellcheck disable=SC1090
+  source "${CONFIG_FILE}"
+fi
+if [[ -z "${KUBECONFIG:-}" && -f "${SCRIPT_DIR}/kubeconfig" ]]; then
+  export KUBECONFIG="${SCRIPT_DIR}/kubeconfig"
+fi
+
+usage() {
+  cat <<USAGE
+Usage: scripts/cluster_power_recovery.sh <mode> [options]
+
+Options:
+  --execute                       Actually run commands (default is dry-run)
+  --expected-flux-branch <branch> Expected Flux source branch during startup checks (default: ${DEFAULT_FLUX_BRANCH:-main})
+  --force-flux-branch <branch>    Startup: patch flux-system GitRepository branch to this value
+  --skip-etcd-snapshot            Shutdown: skip etcd snapshot before shutdown
+  --skip-drain                    Shutdown: skip worker drain during shutdown
+  --skip-local-bootstrap          Startup: skip local bootstrap fallback applies
+  --skip-harbor-bootstrap         Startup: skip Harbor recovery bootstrap stage
+  --skip-harbor-seed              Startup: skip Harbor image seed/import stage
+  --skip-helper-prewarm           Prepare/Shutdown/Startup: skip node-helper prewarm
+  --min-startup-battery <percent> Minimum UPS percent required before bootstrap (default: 35)
+  --ups-host <host>               UPS identifier for upsc (default: ups@localhost)
+  --ups-battery-key <key>         UPS battery key for upsc (default: battery.charge)
+  --recovery-state-file <path>    Recovery state file for outage-aware restart logic
+  --harbor-bundle-file <path>     Harbor bootstrap bundle on the control host
+  --harbor-target-node <node>     Node that should host Harbor during bootstrap (default: ${HARBOR_TARGET_NODE:-titan-05})
+  --harbor-canary-image <image>   Harbor-backed image used for pull canary (default: ${HARBOR_CANARY_IMAGE:-registry.bstein.dev/bstein/kubectl:1.35.0})
+  --node-helper-image <image>     Privileged helper image used for host operations (default: ${NODE_HELPER_IMAGE:-registry.bstein.dev/bstein/hecate-node-helper:0.1.0})
+  --bundle-http-port <port>       Temporary HTTP port used to serve bootstrap bundles (default: ${BUNDLE_HTTP_PORT:-8877})
+  --api-wait-timeout <seconds>    Startup: Kubernetes API wait timeout (default: 600)
+  --drain-timeout <seconds>       Worker drain timeout for normal shutdown (default: 180)
+  --emergency-drain-timeout <seconds>
+                                  Worker drain timeout for emergency fallback (default: 45)
+  --require-ups-battery           Hard-fail startup if UPS battery cannot be read
+  -h, --help                      Show help
+
+Examples:
+  scripts/cluster_power_recovery.sh prepare --execute
+  scripts/cluster_power_recovery.sh harbor-seed --execute
+  scripts/cluster_power_recovery.sh status
+  scripts/cluster_power_recovery.sh shutdown --execute
+  scripts/cluster_power_recovery.sh startup --execute --force-flux-branch main
+USAGE
+}
+
+MODE="${1:-}"
+if [[ -z "${MODE}" || "${MODE}" == "-h" || "${MODE}" == "--help" ]]; then
+  usage
+  exit 0
+fi
+shift || true
+
+case "${MODE}" in
+  prepare|status|harbor-seed|shutdown|startup) ;;
+  *)
+    echo "Unknown mode: ${MODE}" >&2
+    usage
+    exit 1
+    ;;
+esac
+
+EXECUTE=0
+EXPECTED_FLUX_BRANCH="${DEFAULT_FLUX_BRANCH:-main}"
+FORCE_FLUX_BRANCH=""
+SKIP_ETCD_SNAPSHOT=0
+SKIP_DRAIN=0
+SKIP_LOCAL_BOOTSTRAP=0
+SKIP_HARBOR_BOOTSTRAP=0
+SKIP_HARBOR_SEED=0
+SKIP_HELPER_PREWARM=0
+UPS_HOST="ups@localhost"
+UPS_BATTERY_KEY="battery.charge"
+MIN_STARTUP_BATTERY=35
+REQUIRE_UPS_BATTERY=0
+DRAIN_TIMEOUT_SECONDS=180
+EMERGENCY_DRAIN_TIMEOUT_SECONDS=45
+API_WAIT_TIMEOUT_SECONDS=600
+BUNDLE_HTTP_PORT="${BUNDLE_HTTP_PORT:-8877}"
+STATE_ROOT="${HOME}/${STATE_SUBDIR:-.local/share/hecate}"
+RECOVERY_STATE_FILE="${STATE_ROOT}/cluster_power_recovery.state"
+HARBOR_BUNDLE_FILE="${STATE_ROOT}/bundles/${HARBOR_BUNDLE_BASENAME:-harbor-bootstrap-v2.14.1-arm64.tar.zst}"
+HARBOR_TARGET_NODE="${HARBOR_TARGET_NODE:-titan-05}" +HARBOR_CANARY_NODE="${HARBOR_CANARY_NODE:-titan-04}" +HARBOR_CANARY_IMAGE="${HARBOR_CANARY_IMAGE:-registry.bstein.dev/bstein/kubectl:1.35.0}" +NODE_HELPER_IMAGE="${NODE_HELPER_IMAGE:-registry.bstein.dev/bstein/hecate-node-helper:0.1.0}" +NODE_HELPER_NAMESPACE="${NODE_HELPER_NAMESPACE:-maintenance}" +NODE_HELPER_SERVICE_ACCOUNT="${NODE_HELPER_SERVICE_ACCOUNT:-default}" +REGISTRY_PULL_SECRET="${REGISTRY_PULL_SECRET:-harbor-regcred}" + +RECOVERY_PENDING=0 +STARTUP_ATTEMPTED_DURING_OUTAGE=0 +LAST_CHECKPOINT="none" +BUNDLE_SERVER_PID="" + +while [[ $# -gt 0 ]]; do + case "$1" in + --execute) + EXECUTE=1 + shift + ;; + --expected-flux-branch) + EXPECTED_FLUX_BRANCH="${2:?missing branch}" + shift 2 + ;; + --force-flux-branch) + FORCE_FLUX_BRANCH="${2:?missing branch}" + shift 2 + ;; + --skip-etcd-snapshot) + SKIP_ETCD_SNAPSHOT=1 + shift + ;; + --skip-drain) + SKIP_DRAIN=1 + shift + ;; + --skip-local-bootstrap) + SKIP_LOCAL_BOOTSTRAP=1 + shift + ;; + --skip-harbor-bootstrap) + SKIP_HARBOR_BOOTSTRAP=1 + shift + ;; + --skip-harbor-seed) + SKIP_HARBOR_SEED=1 + shift + ;; + --skip-helper-prewarm) + SKIP_HELPER_PREWARM=1 + shift + ;; + --ups-host) + UPS_HOST="${2:?missing ups host}" + shift 2 + ;; + --ups-battery-key) + UPS_BATTERY_KEY="${2:?missing ups key}" + shift 2 + ;; + --min-startup-battery) + MIN_STARTUP_BATTERY="${2:?missing battery threshold}" + shift 2 + ;; + --require-ups-battery) + REQUIRE_UPS_BATTERY=1 + shift + ;; + --recovery-state-file) + RECOVERY_STATE_FILE="${2:?missing state file path}" + shift 2 + ;; + --harbor-bundle-file) + HARBOR_BUNDLE_FILE="${2:?missing bundle file path}" + shift 2 + ;; + --harbor-target-node) + HARBOR_TARGET_NODE="${2:?missing harbor target node}" + shift 2 + ;; + --harbor-canary-image) + HARBOR_CANARY_IMAGE="${2:?missing canary image}" + shift 2 + ;; + --node-helper-image) + NODE_HELPER_IMAGE="${2:?missing node helper image}" + shift 2 + ;; + --bundle-http-port) + BUNDLE_HTTP_PORT="${2:?missing bundle http port}" + shift 2 + ;; + --api-wait-timeout) + API_WAIT_TIMEOUT_SECONDS="${2:?missing api wait timeout}" + shift 2 + ;; + --drain-timeout) + DRAIN_TIMEOUT_SECONDS="${2:?missing drain timeout}" + shift 2 + ;; + --emergency-drain-timeout) + EMERGENCY_DRAIN_TIMEOUT_SECONDS="${2:?missing emergency drain timeout}" + shift 2 + ;; + -h|--help) + usage + exit 0 + ;; + *) + echo "Unknown option: $1" >&2 + usage + exit 1 + ;; + esac +done + +require_cmd() { + local cmd="$1" + if ! 
command -v "${cmd}" >/dev/null 2>&1; then
+    echo "Missing required command: ${cmd}" >&2
+    exit 1
+  fi
+}
+
+require_cmd kubectl
+require_cmd bash
+require_cmd base64
+require_cmd curl
+
+log() { echo "[cluster-power] $*"; }
+warn() { echo "[cluster-power][warn] $*" >&2; }
+die() { echo "[cluster-power][error] $*" >&2; exit 1; }
+
+run() {
+  if [[ "${EXECUTE}" -eq 1 ]]; then
+    log "EXEC: $*"
+    "$@"
+  else
+    log "DRY-RUN: $*"
+  fi
+}
+
+run_shell() {
+  if [[ "${EXECUTE}" -eq 1 ]]; then
+    log "EXEC: $*"
+    bash -lc "$*"
+  else
+    log "DRY-RUN: $*"
+  fi
+}
+
+apply_kustomization() {
+  local path="$1"
+  local full_path="${REPO_DIR}/${path}"
+  if [[ "${EXECUTE}" -eq 1 ]]; then
+    log "EXEC: kubectl kustomize ${full_path} --load-restrictor=LoadRestrictionsNone | kubectl apply -f -"
+    kubectl kustomize "${full_path}" --load-restrictor=LoadRestrictionsNone | kubectl apply -f -
+  else
+    log "DRY-RUN: kubectl kustomize ${full_path} --load-restrictor=LoadRestrictionsNone | kubectl apply -f -"
+  fi
+}
+
+sanitize_name() {
+  printf '%s' "$1" | tr '[:upper:]' '[:lower:]' | tr -cs 'a-z0-9-' '-'
+}
+
+state_dir() {
+  dirname "${RECOVERY_STATE_FILE}"
+}
+
+load_recovery_state() {
+  RECOVERY_PENDING=0
+  STARTUP_ATTEMPTED_DURING_OUTAGE=0
+  LAST_CHECKPOINT="none"
+  [[ -f "${RECOVERY_STATE_FILE}" ]] || return 0
+  while IFS='=' read -r key value; do
+    case "${key}" in
+      recovery_pending) RECOVERY_PENDING="${value}" ;;
+      startup_attempted) STARTUP_ATTEMPTED_DURING_OUTAGE="${value}" ;;
+      last_checkpoint) LAST_CHECKPOINT="${value}" ;;
+    esac
+  done < "${RECOVERY_STATE_FILE}"
+}
+
+save_recovery_state() {
+  mkdir -p "$(state_dir)"
+  cat > "${RECOVERY_STATE_FILE}" <<STATE
+recovery_pending=${RECOVERY_PENDING}
+startup_attempted=${STARTUP_ATTEMPTED_DURING_OUTAGE}
+last_checkpoint=${LAST_CHECKPOINT}
+STATE
+}
+
+clear_recovery_state() {
+  rm -f "${RECOVERY_STATE_FILE}" 2>/dev/null || true
+  LAST_CHECKPOINT="none"
+}
+
+read_ups_battery() {
+  if ! command -v upsc >/dev/null 2>&1; then
+    return 1
+  fi
+  local raw
+  raw="$(upsc "${UPS_HOST}" "${UPS_BATTERY_KEY}" 2>/dev/null || true)"
+  [[ -n "${raw}" ]] || return 1
+  raw="${raw%%.*}"
+  [[ "${raw}" =~ ^[0-9]+$ ]] || return 1
+  printf '%s' "${raw}"
+}
+
+ensure_minimum_battery_for_bootstrap() {
+  local battery
+  battery="$(read_ups_battery || true)"
+  if [[ -z "${battery}" ]]; then
+    if [[ "${REQUIRE_UPS_BATTERY}" -eq 1 ]]; then
+      warn "Unable to read UPS battery status and --require-ups-battery is set."
+      return 1
+    fi
+    warn "Unable to read UPS battery status; continuing without hard battery gating."
+    return 0
+  fi
+  log "ups-battery=${battery}%"
+  if (( battery < MIN_STARTUP_BATTERY )); then
+    warn "UPS battery ${battery}% below minimum startup threshold ${MIN_STARTUP_BATTERY}%."
+    return 1
+  fi
+  return 0
+}
+
+report_flux_source_state() {
+  local flux_url flux_branch
+  flux_url="$(kubectl -n flux-system get gitrepository flux-system -o jsonpath='{.spec.url}' 2>/dev/null || true)"
+  flux_branch="$(kubectl -n flux-system get gitrepository flux-system -o jsonpath='{.spec.ref.branch}' 2>/dev/null || true)"
+  [[ -n "${flux_url}" ]] && log "flux-source-url=${flux_url}"
+  if [[ -n "${flux_branch}" ]]; then
+    log "flux-source-branch=${flux_branch}"
+    if [[ "${MODE}" == "startup" && -z "${FORCE_FLUX_BRANCH}" && "${flux_branch}" != "${EXPECTED_FLUX_BRANCH}" ]]; then
+      warn "Flux source branch is '${flux_branch}'. Expected '${EXPECTED_FLUX_BRANCH}' for canonical recovery."
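+      # Warning only; use --force-flux-branch to repoint the GitRepository during startup.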
+ fi + fi +} + +wait_for_api() { + local attempts=$(( API_WAIT_TIMEOUT_SECONDS / 5 )) + if (( attempts < 1 )); then + attempts=1 + fi + if [[ "${EXECUTE}" -eq 0 ]]; then + log "DRY-RUN: skipping live Kubernetes API wait" + return 0 + fi + local i + for i in $(seq 1 "${attempts}"); do + if kubectl version --request-timeout=5s >/dev/null 2>&1; then + return 0 + fi + sleep 5 + done + return 1 +} + +patch_flux_suspend_all() { + local value="$1" + local patch + patch=$(printf '{"spec":{"suspend":%s}}' "${value}") + + local ks_list hr_list + ks_list="$(kubectl -n flux-system get kustomizations.kustomize.toolkit.fluxcd.io -o jsonpath='{range .items[*]}{.metadata.name}{"\n"}{end}' || true)" + hr_list="$(kubectl get helmreleases.helm.toolkit.fluxcd.io -A -o jsonpath='{range .items[*]}{.metadata.namespace}{"/"}{.metadata.name}{"\n"}{end}' || true)" + + while IFS= read -r k; do + [[ -z "${k}" ]] && continue + run kubectl -n flux-system patch kustomization "${k}" --type=merge -p "${patch}" + done <<< "${ks_list}" + + while IFS= read -r hr; do + [[ -z "${hr}" ]] && continue + local ns="${hr%%/*}" + local name="${hr##*/}" + run kubectl -n "${ns}" patch helmrelease "${name}" --type=merge -p "${patch}" + done <<< "${hr_list}" +} + +best_effort_scale_down_apps() { + local excludes='^(kube-system|kube-public|kube-node-lease|flux-system|traefik|metallb-system|cert-manager|longhorn-system|vault|postgres|maintenance)$' + local ns_list + ns_list="$(kubectl get ns -o jsonpath='{range .items[*]}{.metadata.name}{"\n"}{end}')" + while IFS= read -r ns; do + [[ -z "${ns}" ]] && continue + if [[ "${ns}" =~ ${excludes} ]]; then + continue + fi + run_shell "kubectl -n ${ns} scale deployment --all --replicas=0 || true" + run_shell "kubectl -n ${ns} scale statefulset --all --replicas=0 || true" + done <<< "${ns_list}" +} + +discover_workers_csv() { + kubectl get nodes \ + -o custom-columns=NAME:.metadata.name,CP:.metadata.labels.node-role\\.kubernetes\\.io/control-plane,MASTER:.metadata.labels.node-role\\.kubernetes\\.io/master \ + --no-headers \ + | awk '$2=="" && $3=="" {print $1}' \ + | paste -sd, - +} + +as_array_from_csv() { + local csv="$1" + local out_var="$2" + local old_ifs="${IFS}" + IFS=',' read -r -a _tmp <<< "${csv}" + IFS="${old_ifs}" + eval "${out_var}"'=( "${_tmp[@]}" )' +} + +best_effort_drain_workers() { + local timeout_seconds="$1" + shift || true + local workers=("$@") + local node + for node in "${workers[@]}"; do + [[ -z "${node}" ]] && continue + run kubectl cordon "${node}" + if run_shell "kubectl drain ${node} --ignore-daemonsets --delete-emptydir-data --grace-period=30 --timeout=${timeout_seconds}s"; then + continue + fi + warn "Gentle drain timed out for ${node}; retrying with --force." + if run_shell "kubectl drain ${node} --ignore-daemonsets --delete-emptydir-data --grace-period=30 --timeout=${timeout_seconds}s --force"; then + continue + fi + warn "Force drain timed out for ${node}; final attempt with --disable-eviction." 
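+    # --disable-eviction deletes pods directly instead of going through the
+    # eviction API, bypassing PodDisruptionBudgets; last resort before power-off.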
+ run_shell "kubectl drain ${node} --ignore-daemonsets --delete-emptydir-data --grace-period=30 --timeout=${timeout_seconds}s --force --disable-eviction || true" + done +} + +wait_for_rollout() { + local namespace="$1" + local kind="$2" + local name="$3" + local timeout="$4" + if [[ "${EXECUTE}" -eq 0 ]]; then + log "DRY-RUN: kubectl -n ${namespace} rollout status ${kind}/${name} --timeout=${timeout}" + return 0 + fi + kubectl -n "${namespace}" rollout status "${kind}/${name}" --timeout="${timeout}" +} + +check_ingress_stack() { + kubectl get ingressclass traefik >/dev/null + wait_for_rollout traefik deployment traefik 5m +} + +check_longhorn_stack() { + wait_for_rollout longhorn-system daemonset longhorn-manager 10m + wait_for_rollout longhorn-system deployment longhorn-ui 10m +} + +check_vault_stack() { + wait_for_rollout vault statefulset vault 10m + if [[ "${EXECUTE}" -eq 1 ]]; then + kubectl -n vault exec vault-0 -- sh -ceu 'VAULT_ADDR=http://127.0.0.1:8200 vault status >/dev/null' + fi +} + +check_postgres_stack() { + wait_for_rollout postgres statefulset postgres 10m + if [[ "${EXECUTE}" -eq 1 ]]; then + kubectl -n postgres exec postgres-0 -c postgres -- sh -ceu 'pg_isready -h 127.0.0.1 -p 5432 >/dev/null' + fi +} + +check_gitea_stack() { + wait_for_rollout gitea deployment gitea 10m +} + +check_harbor_stack() { + wait_for_rollout harbor statefulset harbor-redis 10m + wait_for_rollout harbor deployment harbor-core 10m + wait_for_rollout harbor deployment harbor-jobservice 10m + wait_for_rollout harbor deployment harbor-portal 10m + wait_for_rollout harbor deployment harbor-registry 10m +} + +check_harbor_endpoint() { + if [[ "${EXECUTE}" -eq 0 ]]; then + log "DRY-RUN: curl -ksS -o /dev/null -w '%{http_code}' https://registry.bstein.dev/v2/" + return 0 + fi + local code + code="$(curl -ksS -o /dev/null -w '%{http_code}' https://registry.bstein.dev/v2/ || true)" + case "${code}" in + 200|401) + log "harbor-endpoint=http-${code}" + ;; + *) + die "Harbor endpoint check failed with HTTP ${code:-unknown}" + ;; + esac +} + +harbor_is_ready() { + kubectl -n harbor get deploy harbor-core harbor-jobservice harbor-portal harbor-registry >/dev/null 2>&1 || return 1 + local code + code="$(curl -ksS -o /dev/null -w '%{http_code}' https://registry.bstein.dev/v2/ || true)" + [[ "${code}" == "200" || "${code}" == "401" ]] +} + +run_harbor_pull_canary() { + local pod="hecate-harbor-canary" + if [[ "${EXECUTE}" -eq 0 ]]; then + log "DRY-RUN: create Harbor pull canary pod with ${HARBOR_CANARY_IMAGE} on ${HARBOR_CANARY_NODE}" + return 0 + fi + timeout 20 kubectl -n "${NODE_HELPER_NAMESPACE}" delete pod "${pod}" --ignore-not-found --wait=false >/dev/null 2>&1 || true + cat </dev/null 2>&1 || true + kubectl -n "${NODE_HELPER_NAMESPACE}" wait --for=jsonpath='{.status.phase}'=Succeeded "pod/${pod}" --timeout=180s + timeout 20 kubectl -n "${NODE_HELPER_NAMESPACE}" logs "${pod}" || true + timeout 20 kubectl -n "${NODE_HELPER_NAMESPACE}" delete pod "${pod}" --ignore-not-found --wait=false >/dev/null 2>&1 || true +} + +run_helper_pod() { + local node="$1" + local purpose="$2" + local timeout_seconds="$3" + local script_content="$4" + local pod="hecate-$(sanitize_name "${purpose}")-$(date +%H%M%S)" + local encoded_script + encoded_script="$(printf '%s' "${script_content}" | base64 -w0)" + + if [[ "${EXECUTE}" -eq 0 ]]; then + log "DRY-RUN: helper pod ${pod} on ${node} for ${purpose}" + return 0 + fi + + cat </tmp/hecate-step.sh + chmod +x /tmp/hecate-step.sh + /tmp/hecate-step.sh +POD + + if ! 
kubectl -n "${NODE_HELPER_NAMESPACE}" wait --for=jsonpath='{.status.phase}'=Succeeded "pod/${pod}" --timeout="${timeout_seconds}s"; then + kubectl -n "${NODE_HELPER_NAMESPACE}" describe pod "${pod}" >&2 || true + timeout 20 kubectl -n "${NODE_HELPER_NAMESPACE}" logs "${pod}" >&2 || true + timeout 20 kubectl -n "${NODE_HELPER_NAMESPACE}" delete pod "${pod}" --ignore-not-found --wait=false >/dev/null 2>&1 || true + return 1 + fi + timeout 20 kubectl -n "${NODE_HELPER_NAMESPACE}" logs "${pod}" || true + timeout 20 kubectl -n "${NODE_HELPER_NAMESPACE}" delete pod "${pod}" --ignore-not-found --wait=false >/dev/null 2>&1 || true +} + +run_host_command_via_helper() { + local node="$1" + local purpose="$2" + local timeout_seconds="$3" + local host_command="$4" + local encoded_command + encoded_command="$(printf '%s' "${host_command}" | base64 -w0)" + local script_content + script_content=$(cat <