From d880fac673fe51451d9eb34e555c037258bda963 Mon Sep 17 00:00:00 2001 From: Brad Stein Date: Mon, 6 Apr 2026 04:47:05 -0300 Subject: [PATCH] hecate: harden titan-24 cleanup and ups telemetry --- scripts/bootstrap/recovery-config.env | 14 + scripts/cluster_power_recovery.sh | 1045 ++++++++++++++++++------- 2 files changed, 775 insertions(+), 284 deletions(-) create mode 100644 scripts/bootstrap/recovery-config.env diff --git a/scripts/bootstrap/recovery-config.env b/scripts/bootstrap/recovery-config.env new file mode 100644 index 00000000..c2f789d9 --- /dev/null +++ b/scripts/bootstrap/recovery-config.env @@ -0,0 +1,14 @@ +CANONICAL_CONTROL_HOST="titan-db" +DEFAULT_FLUX_BRANCH="main" +STATE_SUBDIR=".local/share/hecate" +HARBOR_BUNDLE_BASENAME="harbor-bootstrap-v2.14.1-arm64.tar.zst" +HARBOR_TARGET_NODE="titan-05" +HARBOR_CANARY_NODE="titan-04" +HARBOR_CANARY_IMAGE="registry.bstein.dev/bstein/kubectl:1.35.0" +NODE_HELPER_IMAGE="registry.bstein.dev/bstein/hecate-node-helper:0.1.0" +NODE_HELPER_NAMESPACE="maintenance" +NODE_HELPER_SERVICE_ACCOUNT="default" +REGISTRY_PULL_SECRET="harbor-regcred" +BUNDLE_HTTP_PORT="8877" +UPS_HOST="pyrphoros@localhost" +UPS_BATTERY_KEY="battery.charge" diff --git a/scripts/cluster_power_recovery.sh b/scripts/cluster_power_recovery.sh index c6c806ff..9efafff1 100755 --- a/scripts/cluster_power_recovery.sh +++ b/scripts/cluster_power_recovery.sh @@ -1,27 +1,43 @@ #!/usr/bin/env bash set -euo pipefail +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +REPO_DIR="${HECATE_REPO_DIR:-$(cd "${SCRIPT_DIR}/.." && pwd)}" +BOOTSTRAP_DIR="${SCRIPT_DIR}/bootstrap" +CONFIG_FILE="${BOOTSTRAP_DIR}/recovery-config.env" +if [[ -f "${CONFIG_FILE}" ]]; then + # shellcheck disable=SC1090 + source "${CONFIG_FILE}" +fi +if [[ -z "${KUBECONFIG:-}" && -f "${SCRIPT_DIR}/kubeconfig" ]]; then + export KUBECONFIG="${SCRIPT_DIR}/kubeconfig" +fi + usage() { - cat <<'USAGE' + cat < [options] Options: --execute Actually run commands (default is dry-run) - --ssh-user SSH user for node commands (default: current SSH config user) - --control-planes Control plane hosts (default: titan-0a,titan-0b,titan-0c) - --workers Worker hosts (default: static atlas inventory, with API discovery when available) - --expected-flux-branch Expected Flux source branch during startup checks (default: main) - --skip-etcd-snapshot Skip etcd snapshot before shutdown - --skip-drain Skip worker drain during shutdown + --expected-flux-branch Expected Flux source branch during startup checks (default: ${DEFAULT_FLUX_BRANCH:-main}) + --force-flux-branch Startup: patch flux-system GitRepository branch to this value + --skip-etcd-snapshot Shutdown: skip etcd snapshot before shutdown + --skip-drain Shutdown: skip worker drain during shutdown --skip-local-bootstrap Startup: skip local bootstrap fallback applies --skip-harbor-bootstrap Startup: skip Harbor recovery bootstrap stage - --force-flux-branch Startup: patch flux-system GitRepository branch to this value + --skip-harbor-seed Startup: skip Harbor image seed/import stage + --skip-helper-prewarm Prepare/Shutdown/Startup: skip node-helper prewarm --min-startup-battery Minimum UPS percent required before bootstrap (default: 35) --ups-host UPS identifier for upsc (default: ups@localhost) --ups-battery-key UPS battery key for upsc (default: battery.charge) - --recovery-state-file Recovery state file for second-outage detection + --recovery-state-file Recovery state file for outage-aware restart logic + --harbor-bundle-file Harbor bootstrap bundle on the control host + --harbor-target-node Node that should host Harbor during bootstrap (default: ${HARBOR_TARGET_NODE:-titan-05}) + --harbor-canary-image Harbor-backed image used for pull canary (default: ${HARBOR_CANARY_IMAGE:-registry.bstein.dev/bstein/kubectl:1.35.0}) + --node-helper-image Privileged helper image used for host operations (default: ${NODE_HELPER_IMAGE:-registry.bstein.dev/bstein/hecate-node-helper:0.1.0}) + --bundle-http-port Temporary HTTP port used to serve bootstrap bundles (default: ${BUNDLE_HTTP_PORT:-8877}) + --api-wait-timeout Startup: Kubernetes API wait timeout (default: 600) --drain-timeout Worker drain timeout for normal shutdown (default: 180) --emergency-drain-timeout Worker drain timeout for emergency fallback (default: 45) @@ -29,6 +45,9 @@ Options: -h, --help Show help Examples: + scripts/cluster_power_recovery.sh prepare --execute + scripts/cluster_power_recovery.sh harbor-seed --execute + scripts/cluster_power_recovery.sh status scripts/cluster_power_recovery.sh shutdown --execute scripts/cluster_power_recovery.sh startup --execute --force-flux-branch main USAGE @@ -41,38 +60,48 @@ if [[ -z "${MODE}" || "${MODE}" == "-h" || "${MODE}" == "--help" ]]; then fi shift || true -SCRIPT_DIR="$(cd -- "$(dirname -- "${BASH_SOURCE[0]}")" && pwd)" -if [[ -z "${KUBECONFIG:-}" && -f "${SCRIPT_DIR}/kubeconfig" ]]; then - export KUBECONFIG="${SCRIPT_DIR}/kubeconfig" -fi - -if [[ "${MODE}" != "shutdown" && "${MODE}" != "startup" ]]; then - echo "Unknown mode: ${MODE}" >&2 - usage - exit 1 -fi +case "${MODE}" in + prepare|status|harbor-seed|shutdown|startup) ;; + *) + echo "Unknown mode: ${MODE}" >&2 + usage + exit 1 + ;; +esac EXECUTE=0 -SSH_USER="" -CONTROL_PLANES="titan-0a,titan-0b,titan-0c" -WORKERS="" -DEFAULT_WORKERS="titan-04,titan-05,titan-06,titan-07,titan-08,titan-09,titan-10,titan-11,titan-12,titan-13,titan-14,titan-15,titan-17,titan-18,titan-19,titan-20,titan-21,titan-22,titan-24" -EXPECTED_FLUX_BRANCH="main" +EXPECTED_FLUX_BRANCH="${DEFAULT_FLUX_BRANCH:-main}" +FORCE_FLUX_BRANCH="" SKIP_ETCD_SNAPSHOT=0 SKIP_DRAIN=0 SKIP_LOCAL_BOOTSTRAP=0 SKIP_HARBOR_BOOTSTRAP=0 -FORCE_FLUX_BRANCH="" -UPS_HOST="ups@localhost" -UPS_BATTERY_KEY="battery.charge" -RECOVERY_STATE_FILE="${HOME}/.local/state/cluster_power_recovery.state" -MIN_STARTUP_BATTERY=35 +SKIP_HARBOR_SEED=0 +SKIP_HELPER_PREWARM=0 +UPS_HOST="${UPS_HOST:-ups@localhost}" +UPS_BATTERY_KEY="${UPS_BATTERY_KEY:-battery.charge}" +MIN_STARTUP_BATTERY="${MIN_STARTUP_BATTERY:-35}" +REQUIRE_UPS_BATTERY="${REQUIRE_UPS_BATTERY:-0}" DRAIN_TIMEOUT_SECONDS=180 EMERGENCY_DRAIN_TIMEOUT_SECONDS=45 -REQUIRE_UPS_BATTERY=0 +API_WAIT_TIMEOUT_SECONDS=600 +BUNDLE_HTTP_PORT="${BUNDLE_HTTP_PORT:-8877}" +STATE_ROOT="${HOME}/${STATE_SUBDIR:-.local/share/hecate}" +RECOVERY_STATE_FILE="${STATE_ROOT}/cluster_power_recovery.state" +HARBOR_BUNDLE_FILE="${STATE_ROOT}/bundles/${HARBOR_BUNDLE_BASENAME:-harbor-bootstrap-v2.14.1-arm64.tar.zst}" +HARBOR_TARGET_NODE="${HARBOR_TARGET_NODE:-titan-05}" +HARBOR_CANARY_NODE="${HARBOR_CANARY_NODE:-titan-04}" +HARBOR_CANARY_IMAGE="${HARBOR_CANARY_IMAGE:-registry.bstein.dev/bstein/kubectl:1.35.0}" +NODE_HELPER_IMAGE="${NODE_HELPER_IMAGE:-registry.bstein.dev/bstein/hecate-node-helper:0.1.0}" +NODE_HELPER_NAMESPACE="${NODE_HELPER_NAMESPACE:-maintenance}" +NODE_HELPER_SERVICE_ACCOUNT="${NODE_HELPER_SERVICE_ACCOUNT:-default}" +REGISTRY_PULL_SECRET="${REGISTRY_PULL_SECRET:-harbor-regcred}" RECOVERY_PENDING=0 STARTUP_ATTEMPTED_DURING_OUTAGE=0 +LAST_CHECKPOINT="none" +BUNDLE_SERVER_PID="" +UPS_HOST_IN_USE="" while [[ $# -gt 0 ]]; do case "$1" in @@ -80,20 +109,12 @@ while [[ $# -gt 0 ]]; do EXECUTE=1 shift ;; - --ssh-user) - SSH_USER="${2:-}" - shift 2 - ;; - --control-planes) - CONTROL_PLANES="${2:-}" - shift 2 - ;; - --workers) - WORKERS="${2:-}" - shift 2 - ;; --expected-flux-branch) - EXPECTED_FLUX_BRANCH="${2:-}" + EXPECTED_FLUX_BRANCH="${2:?missing branch}" + shift 2 + ;; + --force-flux-branch) + FORCE_FLUX_BRANCH="${2:?missing branch}" shift 2 ;; --skip-etcd-snapshot) @@ -112,38 +133,66 @@ while [[ $# -gt 0 ]]; do SKIP_HARBOR_BOOTSTRAP=1 shift ;; - --force-flux-branch) - FORCE_FLUX_BRANCH="${2:-}" - shift 2 + --skip-harbor-seed) + SKIP_HARBOR_SEED=1 + shift + ;; + --skip-helper-prewarm) + SKIP_HELPER_PREWARM=1 + shift ;; --ups-host) - UPS_HOST="${2:-}" + UPS_HOST="${2:?missing ups host}" shift 2 ;; --ups-battery-key) - UPS_BATTERY_KEY="${2:-}" + UPS_BATTERY_KEY="${2:?missing ups key}" shift 2 ;; --min-startup-battery) - MIN_STARTUP_BATTERY="${2:-}" - shift 2 - ;; - --recovery-state-file) - RECOVERY_STATE_FILE="${2:-}" - shift 2 - ;; - --drain-timeout) - DRAIN_TIMEOUT_SECONDS="${2:-}" - shift 2 - ;; - --emergency-drain-timeout) - EMERGENCY_DRAIN_TIMEOUT_SECONDS="${2:-}" + MIN_STARTUP_BATTERY="${2:?missing battery threshold}" shift 2 ;; --require-ups-battery) REQUIRE_UPS_BATTERY=1 shift ;; + --recovery-state-file) + RECOVERY_STATE_FILE="${2:?missing state file path}" + shift 2 + ;; + --harbor-bundle-file) + HARBOR_BUNDLE_FILE="${2:?missing bundle file path}" + shift 2 + ;; + --harbor-target-node) + HARBOR_TARGET_NODE="${2:?missing harbor target node}" + shift 2 + ;; + --harbor-canary-image) + HARBOR_CANARY_IMAGE="${2:?missing canary image}" + shift 2 + ;; + --node-helper-image) + NODE_HELPER_IMAGE="${2:?missing node helper image}" + shift 2 + ;; + --bundle-http-port) + BUNDLE_HTTP_PORT="${2:?missing bundle http port}" + shift 2 + ;; + --api-wait-timeout) + API_WAIT_TIMEOUT_SECONDS="${2:?missing api wait timeout}" + shift 2 + ;; + --drain-timeout) + DRAIN_TIMEOUT_SECONDS="${2:?missing drain timeout}" + shift 2 + ;; + --emergency-drain-timeout) + EMERGENCY_DRAIN_TIMEOUT_SECONDS="${2:?missing emergency drain timeout}" + shift 2 + ;; -h|--help) usage exit 0 @@ -165,10 +214,13 @@ require_cmd() { } require_cmd kubectl -require_cmd ssh +require_cmd bash +require_cmd base64 +require_cmd curl log() { echo "[cluster-power] $*"; } warn() { echo "[cluster-power][warn] $*" >&2; } +die() { echo "[cluster-power][error] $*" >&2; exit 1; } run() { if [[ "${EXECUTE}" -eq 1 ]]; then @@ -188,81 +240,101 @@ run_shell() { fi } -as_array_from_csv() { - local csv="$1" - local out_var="$2" - local old_ifs="${IFS}" - IFS=',' read -r -a _tmp <<< "${csv}" - IFS="${old_ifs}" - eval "${out_var}"'=( "${_tmp[@]}" )' -} - -ssh_target() { - local node="$1" - if [[ -n "${SSH_USER}" ]]; then - printf "%s@%s" "${SSH_USER}" "${node}" +apply_kustomization() { + local path="$1" + local full_path="${REPO_DIR}/${path}" + if [[ "${EXECUTE}" -eq 1 ]]; then + log "EXEC: kubectl kustomize ${full_path} --load-restrictor=LoadRestrictionsNone | kubectl apply -f -" + kubectl kustomize "${full_path}" --load-restrictor=LoadRestrictionsNone | kubectl apply -f - else - printf "%s" "${node}" + log "DRY-RUN: kubectl kustomize ${full_path} --load-restrictor=LoadRestrictionsNone | kubectl apply -f -" fi } -discover_workers_csv() { - # Include every non-control-plane node by default (workers + accelerators). - kubectl get nodes \ - -o custom-columns=NAME:.metadata.name,CP:.metadata.labels.node-role\\.kubernetes\\.io/control-plane,MASTER:.metadata.labels.node-role\\.kubernetes\\.io/master \ - --no-headers \ - | awk '$2=="" && $3=="" {print $1}' \ - | paste -sd, - +sanitize_name() { + printf '%s' "$1" | tr '[:upper:]' '[:lower:]' | tr -cs 'a-z0-9-' '-' +} + +state_dir() { + dirname "${RECOVERY_STATE_FILE}" } load_recovery_state() { - if [[ ! -f "${RECOVERY_STATE_FILE}" ]]; then - RECOVERY_PENDING=0 - STARTUP_ATTEMPTED_DURING_OUTAGE=0 - return 0 - fi - + RECOVERY_PENDING=0 + STARTUP_ATTEMPTED_DURING_OUTAGE=0 + LAST_CHECKPOINT="none" + [[ -f "${RECOVERY_STATE_FILE}" ]] || return 0 while IFS='=' read -r key value; do case "${key}" in - recovery_pending) - RECOVERY_PENDING="${value}" - ;; - startup_attempted) - STARTUP_ATTEMPTED_DURING_OUTAGE="${value}" - ;; + recovery_pending) RECOVERY_PENDING="${value}" ;; + startup_attempted) STARTUP_ATTEMPTED_DURING_OUTAGE="${value}" ;; + last_checkpoint) LAST_CHECKPOINT="${value}" ;; esac done < "${RECOVERY_STATE_FILE}" } save_recovery_state() { - mkdir -p "$(dirname "${RECOVERY_STATE_FILE}")" - cat > "${RECOVERY_STATE_FILE}" < "${RECOVERY_STATE_FILE}" </dev/null || true + LAST_CHECKPOINT="none" +} + +sanitize_battery_percent() { + local raw="$1" + raw="${raw##*:}" + raw="${raw//[[:space:]]/}" + raw="${raw%%.*}" + [[ "${raw}" =~ ^[0-9]+$ ]] || return 1 + printf '%s' "${raw}" +} + +candidate_ups_hosts() { + local candidate name + local -A seen=() + if [[ -n "${UPS_HOST}" ]]; then + seen["${UPS_HOST}"]=1 + echo "${UPS_HOST}" fi + while IFS= read -r name; do + [[ -n "${name}" ]] || continue + for candidate in "${name}@localhost" "${name}"; do + [[ -n "${seen[${candidate}]+x}" ]] && continue + seen["${candidate}"]=1 + echo "${candidate}" + done + done < <(upsc -l 2>/dev/null || true) } read_ups_battery() { if ! command -v upsc >/dev/null 2>&1; then return 1 fi - local raw - raw="$(upsc "${UPS_HOST}" "${UPS_BATTERY_KEY}" 2>/dev/null || true)" - if [[ -z "${raw}" ]]; then - return 1 - fi - # battery.charge can include units/decimals in some setups; normalize. - raw="${raw%%.*}" - if ! [[ "${raw}" =~ ^[0-9]+$ ]]; then - return 1 - fi - echo "${raw}" + local host raw parsed + while IFS= read -r host; do + raw="$(upsc "${host}" "${UPS_BATTERY_KEY}" 2>/dev/null || true)" + [[ -n "${raw}" ]] || continue + parsed="$(sanitize_battery_percent "${raw}" || true)" + [[ -n "${parsed}" ]] || continue + UPS_HOST_IN_USE="${host}" + printf '%s' "${parsed}" + return 0 + done < <(candidate_ups_hosts) + return 1 } ensure_minimum_battery_for_bootstrap() { @@ -276,8 +348,7 @@ ensure_minimum_battery_for_bootstrap() { warn "Unable to read UPS battery status; continuing without hard battery gating." return 0 fi - - log "ups-battery=${battery}%" + log "ups-battery=${battery}% host=${UPS_HOST_IN_USE:-${UPS_HOST}}" if (( battery < MIN_STARTUP_BATTERY )); then warn "UPS battery ${battery}% below minimum startup threshold ${MIN_STARTUP_BATTERY}%." return 1 @@ -285,14 +356,36 @@ ensure_minimum_battery_for_bootstrap() { return 0 } -emergency_shutdown_after_outage() { - warn "Entering outage-aware emergency shutdown path due insufficient startup budget." - patch_flux_suspend_all true || true - best_effort_scale_down_apps - # Give the cluster one short chance to drain, then force progress. - best_effort_drain_workers "${EMERGENCY_DRAIN_TIMEOUT_SECONDS}" "${WORKER_NODES[@]}" - stop_workers_agents "${WORKER_NODES[@]}" - stop_control_planes "${CONTROL_PLANE_NODES[@]}" +report_flux_source_state() { + local flux_url flux_branch + flux_url="$(kubectl -n flux-system get gitrepository flux-system -o jsonpath='{.spec.url}' 2>/dev/null || true)" + flux_branch="$(kubectl -n flux-system get gitrepository flux-system -o jsonpath='{.spec.ref.branch}' 2>/dev/null || true)" + [[ -n "${flux_url}" ]] && log "flux-source-url=${flux_url}" + if [[ -n "${flux_branch}" ]]; then + log "flux-source-branch=${flux_branch}" + if [[ "${MODE}" == "startup" && -z "${FORCE_FLUX_BRANCH}" && "${flux_branch}" != "${EXPECTED_FLUX_BRANCH}" ]]; then + warn "Flux source branch is '${flux_branch}'. Expected '${EXPECTED_FLUX_BRANCH}' for canonical recovery." + fi + fi +} + +wait_for_api() { + local attempts=$(( API_WAIT_TIMEOUT_SECONDS / 5 )) + if (( attempts < 1 )); then + attempts=1 + fi + if [[ "${EXECUTE}" -eq 0 ]]; then + log "DRY-RUN: skipping live Kubernetes API wait" + return 0 + fi + local i + for i in $(seq 1 "${attempts}"); do + if kubectl version --request-timeout=5s >/dev/null 2>&1; then + return 0 + fi + sleep 5 + done + return 1 } patch_flux_suspend_all() { @@ -317,39 +410,6 @@ patch_flux_suspend_all() { done <<< "${hr_list}" } -report_flux_source_state() { - local flux_url flux_branch - flux_url="$(kubectl -n flux-system get gitrepository flux-system -o jsonpath='{.spec.url}' 2>/dev/null || true)" - flux_branch="$(kubectl -n flux-system get gitrepository flux-system -o jsonpath='{.spec.ref.branch}' 2>/dev/null || true)" - - if [[ -n "${flux_url}" ]]; then - log "flux-source-url=${flux_url}" - fi - if [[ -n "${flux_branch}" ]]; then - log "flux-source-branch=${flux_branch}" - if [[ "${MODE}" == "startup" && -z "${FORCE_FLUX_BRANCH}" && "${flux_branch}" != "${EXPECTED_FLUX_BRANCH}" ]]; then - warn "Flux source branch is '${flux_branch}'. Expected '${EXPECTED_FLUX_BRANCH}' for canonical cold-start recovery. Use --force-flux-branch ${EXPECTED_FLUX_BRANCH} if needed." - fi - fi -} - -wait_for_api() { - local attempts="${1:-90}" - local sleep_s="${2:-2}" - if [[ "${EXECUTE}" -eq 0 ]]; then - log "DRY-RUN: skipping live Kubernetes API wait" - return 0 - fi - local i - for i in $(seq 1 "${attempts}"); do - if kubectl version --request-timeout=5s >/dev/null 2>&1; then - return 0 - fi - sleep "${sleep_s}" - done - return 1 -} - best_effort_scale_down_apps() { local excludes='^(kube-system|kube-public|kube-node-lease|flux-system|traefik|metallb-system|cert-manager|longhorn-system|vault|postgres|maintenance)$' local ns_list @@ -364,6 +424,23 @@ best_effort_scale_down_apps() { done <<< "${ns_list}" } +discover_workers_csv() { + kubectl get nodes \ + -o custom-columns=NAME:.metadata.name,CP:.metadata.labels.node-role\\.kubernetes\\.io/control-plane,MASTER:.metadata.labels.node-role\\.kubernetes\\.io/master \ + --no-headers \ + | awk '$2=="" && $3=="" {print $1}' \ + | paste -sd, - +} + +as_array_from_csv() { + local csv="$1" + local out_var="$2" + local old_ifs="${IFS}" + IFS=',' read -r -a _tmp <<< "${csv}" + IFS="${old_ifs}" + eval "${out_var}"'=( "${_tmp[@]}" )' +} + best_effort_drain_workers() { local timeout_seconds="$1" shift || true @@ -384,183 +461,583 @@ best_effort_drain_workers() { done } -stop_workers_agents() { - local workers=("$@") - local node target - for node in "${workers[@]}"; do - [[ -z "${node}" ]] && continue - target="$(ssh_target "${node}")" - run ssh -o BatchMode=yes -o ConnectTimeout=8 "${target}" "sudo systemctl stop k3s-agent || true" +wait_for_rollout() { + local namespace="$1" + local kind="$2" + local name="$3" + local timeout="$4" + if [[ "${EXECUTE}" -eq 0 ]]; then + log "DRY-RUN: kubectl -n ${namespace} rollout status ${kind}/${name} --timeout=${timeout}" + return 0 + fi + kubectl -n "${namespace}" rollout status "${kind}/${name}" --timeout="${timeout}" +} + +check_ingress_stack() { + kubectl get ingressclass traefik >/dev/null + wait_for_rollout traefik deployment traefik 5m +} + +check_longhorn_stack() { + wait_for_rollout longhorn-system daemonset longhorn-manager 10m + wait_for_rollout longhorn-system deployment longhorn-ui 10m +} + +check_vault_stack() { + wait_for_rollout vault statefulset vault 10m + if [[ "${EXECUTE}" -eq 1 ]]; then + kubectl -n vault exec vault-0 -- sh -ceu 'VAULT_ADDR=http://127.0.0.1:8200 vault status >/dev/null' + fi +} + +check_postgres_stack() { + wait_for_rollout postgres statefulset postgres 10m + if [[ "${EXECUTE}" -eq 1 ]]; then + kubectl -n postgres exec postgres-0 -c postgres -- sh -ceu 'pg_isready -h 127.0.0.1 -p 5432 >/dev/null' + fi +} + +check_gitea_stack() { + wait_for_rollout gitea deployment gitea 10m +} + +check_harbor_stack() { + wait_for_rollout harbor statefulset harbor-redis 10m + wait_for_rollout harbor deployment harbor-core 10m + wait_for_rollout harbor deployment harbor-jobservice 10m + wait_for_rollout harbor deployment harbor-portal 10m + wait_for_rollout harbor deployment harbor-registry 10m +} + +check_harbor_endpoint() { + if [[ "${EXECUTE}" -eq 0 ]]; then + log "DRY-RUN: curl -ksS -o /dev/null -w '%{http_code}' https://registry.bstein.dev/v2/" + return 0 + fi + local code + code="$(curl -ksS -o /dev/null -w '%{http_code}' https://registry.bstein.dev/v2/ || true)" + case "${code}" in + 200|401) + log "harbor-endpoint=http-${code}" + ;; + *) + die "Harbor endpoint check failed with HTTP ${code:-unknown}" + ;; + esac +} + +wait_for_pod_phase() { + local namespace="$1" + local pod="$2" + local expected_phase="$3" + local timeout_seconds="$4" + local start now phase + start="$(date +%s)" + while true; do + phase="$(kubectl -n "${namespace}" get pod "${pod}" -o jsonpath='{.status.phase}' 2>/dev/null || true)" + if [[ "${phase}" == "${expected_phase}" ]]; then + return 0 + fi + if [[ "${phase}" == "Failed" ]]; then + return 1 + fi + now="$(date +%s)" + if (( now - start >= timeout_seconds )); then + return 1 + fi + sleep 2 done } -start_workers_agents() { - local workers=("$@") - local node target - for node in "${workers[@]}"; do - [[ -z "${node}" ]] && continue - target="$(ssh_target "${node}")" - run ssh -o BatchMode=yes -o ConnectTimeout=8 "${target}" "sudo systemctl start k3s-agent || true" - done +harbor_is_ready() { + kubectl -n harbor get deploy harbor-core harbor-jobservice harbor-portal harbor-registry >/dev/null 2>&1 || return 1 + local code + code="$(curl -ksS -o /dev/null -w '%{http_code}' https://registry.bstein.dev/v2/ || true)" + [[ "${code}" == "200" || "${code}" == "401" ]] } -stop_control_planes() { - local cps=("$@") - local node target - for node in "${cps[@]}"; do - [[ -z "${node}" ]] && continue - target="$(ssh_target "${node}")" - run ssh -o BatchMode=yes -o ConnectTimeout=8 "${target}" "sudo systemctl stop k3s || true" - done +run_harbor_pull_canary() { + local pod="hecate-harbor-canary" + if [[ "${EXECUTE}" -eq 0 ]]; then + log "DRY-RUN: create Harbor pull canary pod with ${HARBOR_CANARY_IMAGE} on ${HARBOR_CANARY_NODE}" + return 0 + fi + timeout 20 kubectl -n "${NODE_HELPER_NAMESPACE}" delete pod "${pod}" --ignore-not-found --wait=false >/dev/null 2>&1 || true + cat <&2 || true + timeout 20 kubectl -n "${NODE_HELPER_NAMESPACE}" logs "${pod}" >&2 || true + timeout 20 kubectl -n "${NODE_HELPER_NAMESPACE}" delete pod "${pod}" --ignore-not-found --wait=false >/dev/null 2>&1 || true + return 1 + fi + timeout 20 kubectl -n "${NODE_HELPER_NAMESPACE}" logs "${pod}" || true + timeout 20 kubectl -n "${NODE_HELPER_NAMESPACE}" delete pod "${pod}" --ignore-not-found --wait=false >/dev/null 2>&1 || true } -start_control_planes() { - local cps=("$@") - local node target - for node in "${cps[@]}"; do - [[ -z "${node}" ]] && continue - target="$(ssh_target "${node}")" - run ssh -o BatchMode=yes -o ConnectTimeout=8 "${target}" "sudo systemctl start k3s || true" - done +run_helper_pod() { + local node="$1" + local purpose="$2" + local timeout_seconds="$3" + local script_content="$4" + local pod="hecate-$(sanitize_name "${purpose}")-$(date +%H%M%S)" + local encoded_script + encoded_script="$(printf '%s' "${script_content}" | base64 -w0)" + + if [[ "${EXECUTE}" -eq 0 ]]; then + log "DRY-RUN: helper pod ${pod} on ${node} for ${purpose}" + return 0 + fi + + cat </tmp/hecate-step.sh + chmod +x /tmp/hecate-step.sh + /tmp/hecate-step.sh +POD + + if ! wait_for_pod_phase "${NODE_HELPER_NAMESPACE}" "${pod}" Succeeded "${timeout_seconds}"; then + kubectl -n "${NODE_HELPER_NAMESPACE}" describe pod "${pod}" >&2 || true + timeout 20 kubectl -n "${NODE_HELPER_NAMESPACE}" logs "${pod}" >&2 || true + timeout 20 kubectl -n "${NODE_HELPER_NAMESPACE}" delete pod "${pod}" --ignore-not-found --wait=false >/dev/null 2>&1 || true + return 1 + fi + timeout 20 kubectl -n "${NODE_HELPER_NAMESPACE}" logs "${pod}" || true + timeout 20 kubectl -n "${NODE_HELPER_NAMESPACE}" delete pod "${pod}" --ignore-not-found --wait=false >/dev/null 2>&1 || true } -take_etcd_snapshot() { - local cp="$1" - local target - target="$(ssh_target "${cp}")" - local ts - ts="$(date +%Y%m%d-%H%M%S)" - run ssh -o BatchMode=yes -o ConnectTimeout=8 "${target}" \ - "sudo k3s etcd-snapshot save --name pre-shutdown-${ts}" +run_host_command_via_helper() { + local node="$1" + local purpose="$2" + local timeout_seconds="$3" + local host_command="$4" + local encoded_command + encoded_command="$(printf '%s' "${host_command}" | base64 -w0)" + local script_content + script_content=$(cat <