#!/usr/bin/env bash
set -euo pipefail
# Print CLI help for both modes and every supported flag.
# The heredoc delimiter is quoted ('USAGE') so the text is emitted literally,
# with no variable or command expansion.
usage() {
cat <<'USAGE'
Usage:
scripts/cluster_power_recovery.sh shutdown [options]
scripts/cluster_power_recovery.sh startup [options]
Options:
--execute Actually run commands (default is dry-run)
--ssh-user <user> SSH user for node commands (default: current SSH config user)
--control-planes <csv> Control plane hosts (default: titan-0a,titan-0b,titan-0c)
--workers <csv> Worker hosts (default: static atlas inventory, with API discovery when available)
--expected-flux-branch <name> Expected Flux source branch during startup checks (default: main)
--skip-etcd-snapshot Skip etcd snapshot before shutdown
--skip-drain Skip worker drain during shutdown
--skip-local-bootstrap Startup: skip local bootstrap fallback applies
--skip-harbor-bootstrap Startup: skip Harbor recovery bootstrap stage
--force-flux-branch <name> Startup: patch flux-system GitRepository branch to this value
--min-startup-battery <pct> Minimum UPS percent required before bootstrap (default: 35)
--ups-host <name> UPS identifier for upsc (default: ups@localhost)
--ups-battery-key <key> UPS battery key for upsc (default: battery.charge)
--recovery-state-file <path> Recovery state file for second-outage detection
--drain-timeout <seconds> Worker drain timeout for normal shutdown (default: 180)
--emergency-drain-timeout <seconds>
Worker drain timeout for emergency fallback (default: 45)
--require-ups-battery Hard-fail startup if UPS battery cannot be read
-h, --help Show help
Examples:
scripts/cluster_power_recovery.sh shutdown --execute
scripts/cluster_power_recovery.sh startup --execute --force-flux-branch main
USAGE
}
# The first positional argument selects the operating mode; everything after
# it is parsed as flags by the option loop further down.
MODE="${1:-}"
if [[ -z "${MODE}" || "${MODE}" == "-h" || "${MODE}" == "--help" ]]; then
usage
exit 0
fi
# Drop the mode argument so the option loop only sees flags.
shift || true
# Resolve the script's own directory so a sibling "kubeconfig" file can be
# picked up automatically when KUBECONFIG is not already exported.
SCRIPT_DIR="$(cd -- "$(dirname -- "${BASH_SOURCE[0]}")" && pwd)"
if [[ -z "${KUBECONFIG:-}" && -f "${SCRIPT_DIR}/kubeconfig" ]]; then
export KUBECONFIG="${SCRIPT_DIR}/kubeconfig"
fi
if [[ "${MODE}" != "shutdown" && "${MODE}" != "startup" ]]; then
echo "Unknown mode: ${MODE}" >&2
usage
exit 1
fi
# ---- Defaults (each overridable via the CLI flags parsed below) ----
EXECUTE=0
SSH_USER=""
CONTROL_PLANES="titan-0a,titan-0b,titan-0c"
WORKERS=""
# Static atlas worker inventory, used when live API discovery is unavailable.
DEFAULT_WORKERS="titan-04,titan-05,titan-06,titan-07,titan-08,titan-09,titan-10,titan-11,titan-12,titan-13,titan-14,titan-15,titan-17,titan-18,titan-19,titan-20,titan-21,titan-22,titan-24"
EXPECTED_FLUX_BRANCH="main"
SKIP_ETCD_SNAPSHOT=0
SKIP_DRAIN=0
SKIP_LOCAL_BOOTSTRAP=0
SKIP_HARBOR_BOOTSTRAP=0
FORCE_FLUX_BRANCH=""
UPS_HOST="ups@localhost"
UPS_BATTERY_KEY="battery.charge"
RECOVERY_STATE_FILE="${HOME}/.local/state/cluster_power_recovery.state"
MIN_STARTUP_BATTERY=35
DRAIN_TIMEOUT_SECONDS=180
EMERGENCY_DRAIN_TIMEOUT_SECONDS=45
REQUIRE_UPS_BATTERY=0
# Outage bookkeeping, refreshed from RECOVERY_STATE_FILE by load_recovery_state.
RECOVERY_PENDING=0
STARTUP_ATTEMPTED_DURING_OUTAGE=0
# Abort with a clear message when a value-taking option has no argument.
# $1 = option name, $2 = number of CLI arguments still unparsed.
# Without this check, `shift 2` on a trailing flag fails and `set -e`
# terminates the script silently, with no hint about what went wrong.
require_opt_value() {
  if (( $2 < 2 )); then
    echo "Option $1 requires a value" >&2
    usage
    exit 1
  fi
}
# ---- CLI option parsing ----
while [[ $# -gt 0 ]]; do
  case "$1" in
    --execute)
      EXECUTE=1
      shift
      ;;
    --ssh-user)
      require_opt_value "$1" $#
      SSH_USER="$2"
      shift 2
      ;;
    --control-planes)
      require_opt_value "$1" $#
      CONTROL_PLANES="$2"
      shift 2
      ;;
    --workers)
      require_opt_value "$1" $#
      WORKERS="$2"
      shift 2
      ;;
    --expected-flux-branch)
      require_opt_value "$1" $#
      EXPECTED_FLUX_BRANCH="$2"
      shift 2
      ;;
    --skip-etcd-snapshot)
      SKIP_ETCD_SNAPSHOT=1
      shift
      ;;
    --skip-drain)
      SKIP_DRAIN=1
      shift
      ;;
    --skip-local-bootstrap)
      SKIP_LOCAL_BOOTSTRAP=1
      shift
      ;;
    --skip-harbor-bootstrap)
      SKIP_HARBOR_BOOTSTRAP=1
      shift
      ;;
    --force-flux-branch)
      require_opt_value "$1" $#
      FORCE_FLUX_BRANCH="$2"
      shift 2
      ;;
    --ups-host)
      require_opt_value "$1" $#
      UPS_HOST="$2"
      shift 2
      ;;
    --ups-battery-key)
      require_opt_value "$1" $#
      UPS_BATTERY_KEY="$2"
      shift 2
      ;;
    --min-startup-battery)
      require_opt_value "$1" $#
      MIN_STARTUP_BATTERY="$2"
      shift 2
      ;;
    --recovery-state-file)
      require_opt_value "$1" $#
      RECOVERY_STATE_FILE="$2"
      shift 2
      ;;
    --drain-timeout)
      require_opt_value "$1" $#
      DRAIN_TIMEOUT_SECONDS="$2"
      shift 2
      ;;
    --emergency-drain-timeout)
      require_opt_value "$1" $#
      EMERGENCY_DRAIN_TIMEOUT_SECONDS="$2"
      shift 2
      ;;
    --require-ups-battery)
      REQUIRE_UPS_BATTERY=1
      shift
      ;;
    -h|--help)
      usage
      exit 0
      ;;
    *)
      echo "Unknown option: $1" >&2
      usage
      exit 1
      ;;
  esac
done
require_cmd() {
  # Verify that a needed executable is on PATH; abort the script otherwise.
  local needed="$1"
  if command -v "${needed}" >/dev/null 2>&1; then
    return 0
  fi
  echo "Missing required command: ${needed}" >&2
  exit 1
}
# Hard prerequisites for both modes; fail fast if either is absent.
require_cmd kubectl
require_cmd ssh
# Informational logger (stdout). printf instead of echo so a message that
# begins with "-n"/"-e" is printed verbatim rather than eaten as an option.
log() { printf '[cluster-power] %s\n' "$*"; }
# Warning logger (stderr). printf instead of echo so a message that begins
# with "-n"/"-e" is printed verbatim rather than eaten as an option.
warn() { printf '[cluster-power][warn] %s\n' "$*" >&2; }
run() {
  # Dry-run aware executor for argv-style commands. Logs either EXEC or
  # DRY-RUN; the command itself only runs when --execute was given.
  if (( EXECUTE == 1 )); then
    log "EXEC: $*"
    "$@"
  else
    log "DRY-RUN: $*"
  fi
}
run_shell() {
  # Dry-run aware executor for commands given as a single shell string
  # (allows pipes, "|| true", etc.). Executes via a login bash shell.
  if (( EXECUTE == 1 )); then
    log "EXEC: $*"
    bash -lc "$*"
  else
    log "DRY-RUN: $*"
  fi
}
as_array_from_csv() {
  # Split a comma-separated string ($1) into the array variable named by $2.
  # Uses a bash nameref instead of eval, so a hostile value can never be
  # executed as code and no temporary array leaks into the global scope.
  local csv="$1"
  local -n __csv_out_ref="$2"
  local -a __csv_parts=()
  # The IFS override is scoped to this one `read`; no save/restore needed.
  IFS=',' read -r -a __csv_parts <<< "${csv}"
  __csv_out_ref=( "${__csv_parts[@]}" )
}
ssh_target() {
  # Build the SSH destination for a node: "user@host" when --ssh-user was
  # supplied, bare "host" otherwise (defers to the user's SSH config).
  local host="$1"
  local prefix=""
  if [[ -n "${SSH_USER}" ]]; then
    prefix="${SSH_USER}@"
  fi
  printf '%s%s' "${prefix}" "${host}"
}
# Emit a comma-separated list of worker node names on stdout, discovered from
# the live API. A node counts as a worker when it carries neither the
# control-plane nor the legacy master role label (kubectl prints "<none>" for
# an absent label). The doubled backslashes keep the label-key dots literal
# inside the custom-columns expression.
discover_workers_csv() {
# Include every non-control-plane node by default (workers + accelerators).
kubectl get nodes \
-o custom-columns=NAME:.metadata.name,CP:.metadata.labels.node-role\\.kubernetes\\.io/control-plane,MASTER:.metadata.labels.node-role\\.kubernetes\\.io/master \
--no-headers \
| awk '$2=="<none>" && $3=="<none>" {print $1}' \
| paste -sd, -
}
load_recovery_state() {
  # Populate RECOVERY_PENDING and STARTUP_ATTEMPTED_DURING_OUTAGE from the
  # persisted state file. Both default to 0 when the file is absent.
  RECOVERY_PENDING=0
  STARTUP_ATTEMPTED_DURING_OUTAGE=0
  if [[ ! -f "${RECOVERY_STATE_FILE}" ]]; then
    return 0
  fi
  local key value
  # "|| [[ -n ... ]]" keeps the final line even without a trailing newline.
  while IFS='=' read -r key value || [[ -n "${key}" ]]; do
    # Accept only a literal "1"; anything else (including a corrupted file)
    # normalizes to 0 so the later numeric [[ -eq ]] comparisons cannot
    # blow up under `set -e`.
    case "${key}" in
      recovery_pending)
        if [[ "${value}" == "1" ]]; then RECOVERY_PENDING=1; else RECOVERY_PENDING=0; fi
        ;;
      startup_attempted)
        if [[ "${value}" == "1" ]]; then STARTUP_ATTEMPTED_DURING_OUTAGE=1; else STARTUP_ATTEMPTED_DURING_OUTAGE=0; fi
        ;;
    esac
  done < "${RECOVERY_STATE_FILE}"
}
save_recovery_state() {
  # Persist outage bookkeeping: $1 = recovery_pending, $2 = startup_attempted.
  local state_dir
  state_dir="$(dirname "${RECOVERY_STATE_FILE}")"
  mkdir -p "${state_dir}"
  printf 'recovery_pending=%s\nstartup_attempted=%s\n' "$1" "$2" > "${RECOVERY_STATE_FILE}"
}
clear_recovery_state() {
  # Drop the persisted outage marker once startup has fully succeeded.
  [[ -f "${RECOVERY_STATE_FILE}" ]] || return 0
  rm -f -- "${RECOVERY_STATE_FILE}"
}
read_ups_battery() {
  # Print the UPS battery percentage as a bare integer on stdout.
  # Fails (status 1, no output) when upsc is unavailable, the query returns
  # nothing, or the reading is not parseable as a whole number.
  command -v upsc >/dev/null 2>&1 || return 1
  local reading
  reading="$(upsc "${UPS_HOST}" "${UPS_BATTERY_KEY}" 2>/dev/null || true)"
  [[ -n "${reading}" ]] || return 1
  # battery.charge can include units/decimals in some setups; normalize by
  # dropping everything after the first dot.
  reading="${reading%%.*}"
  [[ "${reading}" =~ ^[0-9]+$ ]] || return 1
  echo "${reading}"
}
ensure_minimum_battery_for_bootstrap() {
  # Gate the bootstrap on UPS charge. Returns 1 when the charge is known and
  # below MIN_STARTUP_BATTERY, or when it is unreadable and
  # --require-ups-battery was given; returns 0 otherwise.
  local battery
  battery="$(read_ups_battery || true)"
  if [[ -z "${battery}" ]]; then
    if [[ "${REQUIRE_UPS_BATTERY}" -eq 1 ]]; then
      warn "Unable to read UPS battery status and --require-ups-battery is set."
      return 1
    fi
    warn "Unable to read UPS battery status; continuing without hard battery gating."
    return 0
  fi
  log "ups-battery=${battery}%"
  # Force base-10: a zero-padded reading such as "085" would otherwise be
  # treated as (invalid) octal inside (( )) and break the comparison.
  if (( 10#${battery} < MIN_STARTUP_BATTERY )); then
    warn "UPS battery ${battery}% below minimum startup threshold ${MIN_STARTUP_BATTERY}%."
    return 1
  fi
  return 0
}
emergency_shutdown_after_outage() {
  # Second-outage fallback: the cluster came up on an insufficient battery
  # budget, so suspend GitOps, scale workloads down, drain quickly, and power
  # everything back off. Reads the WORKER_NODES / CONTROL_PLANE_NODES globals.
  warn "Entering outage-aware emergency shutdown path due to insufficient startup budget."
  patch_flux_suspend_all true || true
  best_effort_scale_down_apps
  # Give the cluster one short chance to drain, then force progress.
  best_effort_drain_workers "${EMERGENCY_DRAIN_TIMEOUT_SECONDS}" "${WORKER_NODES[@]}"
  stop_workers_agents "${WORKER_NODES[@]}"
  stop_control_planes "${CONTROL_PLANE_NODES[@]}"
}
# Toggle .spec.suspend on every Flux Kustomization and HelmRelease in the
# cluster. $1 is the JSON boolean to set ("true" to suspend, "false" to
# resume). Listing failures are tolerated (|| true) so a partially-up API
# does not abort the caller.
patch_flux_suspend_all() {
local value="$1"
local patch
patch=$(printf '{"spec":{"suspend":%s}}' "${value}")
local ks_list hr_list
ks_list="$(kubectl -n flux-system get kustomizations.kustomize.toolkit.fluxcd.io -o jsonpath='{range .items[*]}{.metadata.name}{"\n"}{end}' || true)"
hr_list="$(kubectl get helmreleases.helm.toolkit.fluxcd.io -A -o jsonpath='{range .items[*]}{.metadata.namespace}{"/"}{.metadata.name}{"\n"}{end}' || true)"
while IFS= read -r k; do
[[ -z "${k}" ]] && continue
run kubectl -n flux-system patch kustomization "${k}" --type=merge -p "${patch}"
done <<< "${ks_list}"
# HelmRelease entries are "namespace/name"; split on the first slash.
while IFS= read -r hr; do
[[ -z "${hr}" ]] && continue
local ns="${hr%%/*}"
local name="${hr##*/}"
run kubectl -n "${ns}" patch helmrelease "${name}" --type=merge -p "${patch}"
done <<< "${hr_list}"
}
# Log the flux-system GitRepository URL and branch for operator visibility.
# During startup, additionally warn when the branch differs from
# EXPECTED_FLUX_BRANCH — unless the operator is already overriding it with
# --force-flux-branch. Read failures are silent (cluster may be down).
report_flux_source_state() {
local flux_url flux_branch
flux_url="$(kubectl -n flux-system get gitrepository flux-system -o jsonpath='{.spec.url}' 2>/dev/null || true)"
flux_branch="$(kubectl -n flux-system get gitrepository flux-system -o jsonpath='{.spec.ref.branch}' 2>/dev/null || true)"
if [[ -n "${flux_url}" ]]; then
log "flux-source-url=${flux_url}"
fi
if [[ -n "${flux_branch}" ]]; then
log "flux-source-branch=${flux_branch}"
if [[ "${MODE}" == "startup" && -z "${FORCE_FLUX_BRANCH}" && "${flux_branch}" != "${EXPECTED_FLUX_BRANCH}" ]]; then
warn "Flux source branch is '${flux_branch}'. Expected '${EXPECTED_FLUX_BRANCH}' for canonical cold-start recovery. Use --force-flux-branch ${EXPECTED_FLUX_BRANCH} if needed."
fi
fi
}
wait_for_api() {
  # Poll the Kubernetes API until it answers, or give up.
  # $1 = max attempts (default 90), $2 = seconds between attempts (default 2).
  # Returns 0 once reachable, 1 after exhausting all attempts.
  local attempts="${1:-90}"
  local sleep_s="${2:-2}"
  if [[ "${EXECUTE}" -eq 0 ]]; then
    log "DRY-RUN: skipping live Kubernetes API wait"
    return 0
  fi
  local i
  # C-style loop instead of $(seq ...): no external process, and works even
  # where seq is not installed.
  for (( i = 1; i <= attempts; i++ )); do
    if kubectl version --request-timeout=5s >/dev/null 2>&1; then
      return 0
    fi
    sleep "${sleep_s}"
  done
  return 1
}
# Scale every Deployment and StatefulSet to zero replicas in all namespaces
# except the platform namespaces matched by the exclusion regex, so app
# workloads quiesce before node drain. Each scale is best-effort ("|| true"
# inside the shell string passed to run_shell).
best_effort_scale_down_apps() {
local excludes='^(kube-system|kube-public|kube-node-lease|flux-system|traefik|metallb-system|cert-manager|longhorn-system|vault|postgres|maintenance)$'
local ns_list
ns_list="$(kubectl get ns -o jsonpath='{range .items[*]}{.metadata.name}{"\n"}{end}')"
while IFS= read -r ns; do
[[ -z "${ns}" ]] && continue
# Unquoted regex variable on the RHS of =~ keeps it a regex, not a literal.
if [[ "${ns}" =~ ${excludes} ]]; then
continue
fi
run_shell "kubectl -n ${ns} scale deployment --all --replicas=0 || true"
run_shell "kubectl -n ${ns} scale statefulset --all --replicas=0 || true"
done <<< "${ns_list}"
}
# Cordon and drain each worker with escalating force:
#   1) normal drain, 2) --force (abandons unmanaged pods),
#   3) --disable-eviction (bypasses PodDisruptionBudgets, best-effort).
# $1 = per-attempt timeout in seconds; remaining args = worker node names.
# NOTE(review): in dry-run mode run_shell returns 0, so only the first
# (gentle) attempt is ever logged per node.
best_effort_drain_workers() {
local timeout_seconds="$1"
shift || true
local workers=("$@")
local node
for node in "${workers[@]}"; do
[[ -z "${node}" ]] && continue
run kubectl cordon "${node}"
if run_shell "kubectl drain ${node} --ignore-daemonsets --delete-emptydir-data --grace-period=30 --timeout=${timeout_seconds}s"; then
continue
fi
warn "Gentle drain timed out for ${node}; retrying with --force."
if run_shell "kubectl drain ${node} --ignore-daemonsets --delete-emptydir-data --grace-period=30 --timeout=${timeout_seconds}s --force"; then
continue
fi
warn "Force drain timed out for ${node}; final attempt with --disable-eviction."
run_shell "kubectl drain ${node} --ignore-daemonsets --delete-emptydir-data --grace-period=30 --timeout=${timeout_seconds}s --force --disable-eviction || true"
done
}
stop_workers_agents() {
  # Best-effort `systemctl stop k3s-agent` over SSH on every worker passed
  # as an argument. Empty entries (e.g. a blank CSV field) are skipped.
  local worker remote
  for worker in "$@"; do
    [[ -n "${worker}" ]] || continue
    remote="$(ssh_target "${worker}")"
    run ssh -o BatchMode=yes -o ConnectTimeout=8 "${remote}" "sudo systemctl stop k3s-agent || true"
  done
}
start_workers_agents() {
  # Best-effort `systemctl start k3s-agent` over SSH on every worker passed
  # as an argument. Empty entries (e.g. a blank CSV field) are skipped.
  local worker remote
  for worker in "$@"; do
    [[ -n "${worker}" ]] || continue
    remote="$(ssh_target "${worker}")"
    run ssh -o BatchMode=yes -o ConnectTimeout=8 "${remote}" "sudo systemctl start k3s-agent || true"
  done
}
stop_control_planes() {
  # Best-effort `systemctl stop k3s` over SSH on every control plane passed
  # as an argument. Empty entries are skipped.
  local cp remote
  for cp in "$@"; do
    [[ -n "${cp}" ]] || continue
    remote="$(ssh_target "${cp}")"
    run ssh -o BatchMode=yes -o ConnectTimeout=8 "${remote}" "sudo systemctl stop k3s || true"
  done
}
start_control_planes() {
  # Best-effort `systemctl start k3s` over SSH on every control plane passed
  # as an argument. Empty entries are skipped.
  local cp remote
  for cp in "$@"; do
    [[ -n "${cp}" ]] || continue
    remote="$(ssh_target "${cp}")"
    run ssh -o BatchMode=yes -o ConnectTimeout=8 "${remote}" "sudo systemctl start k3s || true"
  done
}
take_etcd_snapshot() {
  # Capture a timestamped etcd snapshot on the given control plane via SSH.
  # $1 = control plane host name.
  local cp="$1"
  local remote stamp
  remote="$(ssh_target "${cp}")"
  stamp="$(date +%Y%m%d-%H%M%S)"
  run ssh -o BatchMode=yes -o ConnectTimeout=8 "${remote}" \
    "sudo k3s etcd-snapshot save --name pre-shutdown-${stamp}"
}
bootstrap_local_minimal() {
# Local apply path to break Flux<->Gitea boot deadlock during cold-start recovery.
# Longhorn is applied before stateful workloads so astreae-backed PVCs can bind.
# Order matters: core/sources first, then storage and networking, then the
# stateful services (Vault -> Postgres -> Gitea).
# NOTE(review): the -k paths are relative — assumes the script is run from the
# repo root; confirm against the operator runbook.
run kubectl apply -k infrastructure/core
run kubectl apply -k infrastructure/sources/helm
run kubectl apply -k infrastructure/longhorn/core
run kubectl apply -k infrastructure/metallb
run kubectl apply -k infrastructure/traefik
run kubectl apply -k infrastructure/vault-csi
run kubectl apply -k infrastructure/vault-injector
run kubectl apply -k services/vault
run kubectl apply -k infrastructure/postgres
run kubectl apply -k services/gitea
}
bootstrap_local_harbor() {
# Optional Harbor bootstrap stage for environments where Harbor is authoritative for images.
# Skipped entirely when --skip-harbor-bootstrap is given (see startup flow).
run kubectl apply -k services/harbor
}
resume_flux_and_reconcile() {
  # Unsuspend all Flux objects, then reconcile the core stack in dependency
  # order when the flux CLI is available. Without the CLI, fall back to
  # requesting reconciliation of every Kustomization via the
  # reconcile.fluxcd.io/requestedAt annotation.
  patch_flux_suspend_all false
  if command -v flux >/dev/null 2>&1; then
    run flux reconcile source git flux-system -n flux-system --timeout=3m
    run flux reconcile kustomization core -n flux-system --with-source --timeout=5m
    run flux reconcile kustomization helm -n flux-system --with-source --timeout=5m
    run flux reconcile kustomization longhorn -n flux-system --with-source --timeout=15m
    run flux reconcile kustomization metallb -n flux-system --with-source --timeout=5m
    run flux reconcile kustomization traefik -n flux-system --with-source --timeout=5m
    run flux reconcile kustomization vault-csi -n flux-system --with-source --timeout=5m
    run flux reconcile kustomization vault-injector -n flux-system --with-source --timeout=5m
    run flux reconcile kustomization vault -n flux-system --with-source --timeout=10m
    run flux reconcile kustomization postgres -n flux-system --with-source --timeout=10m
    run flux reconcile kustomization gitea -n flux-system --with-source --timeout=10m
    run flux reconcile kustomization harbor -n flux-system --with-source --timeout=15m
  else
    local now
    # Portable UTC timestamp: `date --iso-8601=seconds` is GNU-only and fails
    # on BSD/BusyBox date. Any value that changes per invocation works for
    # the requestedAt annotation.
    now="$(date -u +%Y-%m-%dT%H:%M:%SZ)"
    run kubectl -n flux-system annotate kustomizations.kustomize.toolkit.fluxcd.io --all reconcile.fluxcd.io/requestedAt="${now}" --overwrite
  fi
}
# ---- Resolve node inventories and current recovery state ----
as_array_from_csv "${CONTROL_PLANES}" CONTROL_PLANE_NODES
if [[ -z "${WORKERS}" ]]; then
# Prefer live API discovery; fall back to the static atlas list when the
# cluster is unreachable (the common case during a cold start).
WORKERS="$(discover_workers_csv 2>/dev/null || true)"
if [[ -z "${WORKERS}" ]]; then
warn "Unable to auto-discover workers from the API; falling back to static atlas worker inventory."
WORKERS="${DEFAULT_WORKERS}"
fi
fi
as_array_from_csv "${WORKERS}" WORKER_NODES
load_recovery_state
# Echo the effective plan before touching anything.
log "mode=${MODE} execute=${EXECUTE}"
log "control-planes=${CONTROL_PLANES}"
log "workers=${WORKERS}"
log "recovery-state-file=${RECOVERY_STATE_FILE}"
log "recovery_pending=${RECOVERY_PENDING} startup_attempted=${STARTUP_ATTEMPTED_DURING_OUTAGE}"
report_flux_source_state
# ---- Shutdown flow ----
if [[ "${MODE}" == "shutdown" ]]; then
# Record that an outage is beginning (recovery_pending=1, startup_attempted=0)
# so the next startup knows it follows a shutdown.
save_recovery_state 1 0
if [[ "${SKIP_ETCD_SNAPSHOT}" -eq 0 ]]; then
# Snapshot is taken on the first control plane only.
take_etcd_snapshot "${CONTROL_PLANE_NODES[0]}"
else
warn "Skipping etcd snapshot by request."
fi
# Suspend GitOps first so Flux does not immediately undo the scale-down.
patch_flux_suspend_all true
best_effort_scale_down_apps
if [[ "${SKIP_DRAIN}" -eq 0 ]]; then
best_effort_drain_workers "${DRAIN_TIMEOUT_SECONDS}" "${WORKER_NODES[@]}"
else
warn "Skipping worker drain by request."
fi
# Power down workers before control planes so the API stays up for drains.
stop_workers_agents "${WORKER_NODES[@]}"
stop_control_planes "${CONTROL_PLANE_NODES[@]}"
log "Shutdown flow complete."
exit 0
fi
# Startup mode
# A pending recovery means this boot follows an outage: gate on UPS battery
# before spending power on a full bootstrap.
if [[ "${RECOVERY_PENDING}" -eq 1 ]]; then
if ! ensure_minimum_battery_for_bootstrap; then
if [[ "${STARTUP_ATTEMPTED_DURING_OUTAGE}" -eq 1 ]]; then
# Second low-battery attempt during the same outage: power back down.
emergency_shutdown_after_outage
exit 1
fi
warn "Startup deferred due low battery after recent outage; marking for second-outage fallback."
save_recovery_state 1 1
exit 1
fi
# Battery OK: remember that a startup attempt was made during this outage.
save_recovery_state 1 1
fi
start_control_planes "${CONTROL_PLANE_NODES[@]}"
start_workers_agents "${WORKER_NODES[@]}"
if ! wait_for_api 120 2; then
warn "Kubernetes API did not become reachable in time."
exit 1
fi
# Optional operator override of the Flux source branch before reconciling.
if [[ -n "${FORCE_FLUX_BRANCH}" ]]; then
run kubectl -n flux-system patch gitrepository flux-system --type=merge \
-p "{\"spec\":{\"ref\":{\"branch\":\"${FORCE_FLUX_BRANCH}\"}}}"
fi
if [[ "${SKIP_LOCAL_BOOTSTRAP}" -eq 0 ]]; then
# If source is not ready, bootstrap critical pieces from local checkout first.
if ! kubectl -n flux-system get gitrepository flux-system -o jsonpath='{.status.conditions[?(@.type=="Ready")].status}' 2>/dev/null | grep -q True; then
warn "Flux source not Ready; executing local bootstrap fallback path."
bootstrap_local_minimal
if [[ "${SKIP_HARBOR_BOOTSTRAP}" -eq 0 ]]; then
bootstrap_local_harbor
else
warn "Skipping Harbor bootstrap fallback by request."
fi
fi
else
warn "Skipping local bootstrap fallback by request."
fi
resume_flux_and_reconcile
# A successful startup ends the outage episode.
clear_recovery_state
log "Startup flow complete."