#!/usr/bin/env bash
#
# cluster_power_recovery.sh — orchestrate graceful shutdown and staged startup
# (prepare / status / harbor-seed / shutdown / startup) for the cluster.
# All actions default to dry-run; pass --execute to actually run commands.
set -euo pipefail

# Resolve where we live; ANANKE_REPO_DIR may override the inferred repo root.
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
REPO_DIR="${ANANKE_REPO_DIR:-$(cd "${SCRIPT_DIR}/.." && pwd)}"
BOOTSTRAP_DIR="${SCRIPT_DIR}/bootstrap"
CONFIG_FILE="${BOOTSTRAP_DIR}/recovery-config.env"

# Optional site-local overrides for the defaults defined further down.
if [[ -f "${CONFIG_FILE}" ]]; then
  # shellcheck disable=SC1090
  source "${CONFIG_FILE}"
fi

# Fall back to a kubeconfig shipped next to the script when none is exported.
if [[ -z "${KUBECONFIG:-}" && -f "${SCRIPT_DIR}/kubeconfig" ]]; then
  export KUBECONFIG="${SCRIPT_DIR}/kubeconfig"
fi

# Print CLI help. Unquoted delimiter on purpose: the ${VAR:-default}
# expansions below show effective defaults (config file already sourced).
# NOTE(review): the original heredoc opener and the "<mode>" placeholder were
# lost to angle-bracket stripping; reconstructed from the surviving USAGE
# delimiter and the mode list parsed below — confirm against VCS history.
usage() {
  cat <<USAGE
Usage: scripts/cluster_power_recovery.sh <mode> [options]

Options:
  --execute                      Actually run commands (default is dry-run)
  --shutdown-mode                Shutdown behavior: host-poweroff or cluster-only (default: ${SHUTDOWN_MODE:-host-poweroff})
  --expected-flux-branch         Expected Flux source branch during startup checks (default: ${DEFAULT_FLUX_BRANCH:-main})
  --expected-flux-url            Expected Flux source URL during startup checks
  --allow-flux-source-mutation   Required to allow --force-flux-url during startup
  --force-flux-url               Startup: patch flux-system GitRepository URL to this value
  --force-flux-branch            Startup: patch flux-system GitRepository branch to this value
  --skip-etcd-snapshot           Shutdown: skip etcd snapshot before shutdown
  --skip-drain                   Shutdown: skip worker drain during shutdown
  --skip-local-bootstrap         Startup: skip local bootstrap fallback applies
  --skip-harbor-bootstrap        Startup: skip Harbor recovery bootstrap stage
  --skip-harbor-seed             Startup: skip Harbor image seed/import stage
  --skip-helper-prewarm          Prepare/Shutdown/Startup: skip node-helper prewarm
  --min-startup-battery          Minimum UPS percent required before bootstrap (default: 35)
  --ups-host                     UPS identifier for upsc (default: ups@localhost)
  --ups-battery-key              UPS battery key for upsc (default: battery.charge)
  --recovery-state-file          Recovery state file for outage-aware restart logic
  --replica-snapshot-file        File used to persist workload replica snapshot across shutdown/startup
  --harbor-bundle-file           Harbor bootstrap bundle on the control host
  --harbor-target-node           Node that should host Harbor during bootstrap (default: auto)
  --harbor-canary-node           Node used for Harbor pull canary (default: auto)
  --harbor-host-label-key        Node label key used to pin Harbor bootstrap workloads (default: ${HARBOR_HOST_LABEL_KEY:-ananke.bstein.dev/harbor-bootstrap})
  --harbor-canary-image          Harbor-backed image used for pull canary (default: ${HARBOR_CANARY_IMAGE:-registry.bstein.dev/bstein/kubectl:1.35.0})
  --node-helper-image            Privileged helper image used for host operations (default: ${NODE_HELPER_IMAGE:-registry.bstein.dev/bstein/ananke-node-helper:0.1.0})
  --bundle-http-port             Temporary HTTP port used to serve bootstrap bundles (default: ${BUNDLE_HTTP_PORT:-8877})
  --api-wait-timeout             Startup: Kubernetes API wait timeout (default: 600)
  --drain-timeout                Worker drain timeout for normal shutdown (default: 180)
  --emergency-drain-timeout      Worker drain timeout for emergency fallback (default: 45)
  --flux-ready-timeout           Startup: max time to wait for Flux kustomizations Ready (default: 1200)
  --startup-checklist-timeout    Startup: max time to wait for external service checklist (default: 900)
  --startup-workload-timeout     Startup: max time to wait for workload readiness checks (default: 900)
  --startup-stability-window     Startup: continuous healthy window required before success (default: 180)
  --startup-stability-timeout    Startup: max time allowed to achieve the healthy window (default: 900)
  --require-ups-battery          Hard-fail startup if UPS battery cannot be read
  -h, --help                     Show help

Examples:
  scripts/cluster_power_recovery.sh prepare --execute
  scripts/cluster_power_recovery.sh harbor-seed --execute
  scripts/cluster_power_recovery.sh status
  scripts/cluster_power_recovery.sh shutdown --execute
  scripts/cluster_power_recovery.sh startup --execute --force-flux-branch main
USAGE
}

# First positional argument selects the mode; help/empty prints usage.
MODE="${1:-}"
if [[ -z "${MODE}" || "${MODE}" == "-h" || "${MODE}" == "--help" ]]; then
  usage
  exit 0
fi
shift || true

case "${MODE}" in
  prepare|status|harbor-seed|shutdown|startup) ;;
  *)
    echo "Unknown mode: ${MODE}" >&2
    usage
    exit 1
    ;;
esac

# Dry-run by default; flipped to 1 by --execute during option parsing below.
EXECUTE=0
# ---------------------------------------------------------------------------
# Defaults for every tunable. Each ${VAR:-default} form honours values that
# recovery-config.env (sourced above) or the environment already provided;
# plain assignments are reset on every run. CLI flags parsed later may
# override any of these.
# ---------------------------------------------------------------------------

# Shutdown behaviour and Flux source-of-truth expectations.
SHUTDOWN_MODE="${SHUTDOWN_MODE:-host-poweroff}"
EXPECTED_FLUX_BRANCH="${DEFAULT_FLUX_BRANCH:-main}"
EXPECTED_FLUX_URL="${EXPECTED_FLUX_URL:-ssh://git@scm.bstein.dev:2242/bstein/titan-iac.git}"
ALLOW_FLUX_SOURCE_MUTATION=0
FORCE_FLUX_URL=""
FORCE_FLUX_BRANCH=""

# Stage-skip switches (all opt-in via --skip-* flags).
SKIP_ETCD_SNAPSHOT=0
SKIP_DRAIN=0
SKIP_LOCAL_BOOTSTRAP=0
SKIP_HARBOR_BOOTSTRAP=0
SKIP_HARBOR_SEED=0
SKIP_HELPER_PREWARM=0

# UPS battery gating for startup.
UPS_HOST="${UPS_HOST:-ups@localhost}"
UPS_BATTERY_KEY="${UPS_BATTERY_KEY:-battery.charge}"
MIN_STARTUP_BATTERY="${MIN_STARTUP_BATTERY:-35}"
REQUIRE_UPS_BATTERY="${REQUIRE_UPS_BATTERY:-0}"

# Timeouts and poll intervals (seconds).
DRAIN_TIMEOUT_SECONDS=180
EMERGENCY_DRAIN_TIMEOUT_SECONDS=45
API_WAIT_TIMEOUT_SECONDS=600
FLUX_READY_TIMEOUT_SECONDS="${FLUX_READY_TIMEOUT_SECONDS:-1200}"
FLUX_READY_POLL_SECONDS="${FLUX_READY_POLL_SECONDS:-10}"
STARTUP_CHECKLIST_TIMEOUT_SECONDS="${STARTUP_CHECKLIST_TIMEOUT_SECONDS:-900}"
STARTUP_CHECKLIST_POLL_SECONDS="${STARTUP_CHECKLIST_POLL_SECONDS:-10}"
STARTUP_WORKLOAD_TIMEOUT_SECONDS="${STARTUP_WORKLOAD_TIMEOUT_SECONDS:-900}"
STARTUP_WORKLOAD_POLL_SECONDS="${STARTUP_WORKLOAD_POLL_SECONDS:-10}"
STARTUP_STABILITY_WINDOW_SECONDS="${STARTUP_STABILITY_WINDOW_SECONDS:-180}"
STARTUP_STABILITY_TIMEOUT_SECONDS="${STARTUP_STABILITY_TIMEOUT_SECONDS:-900}"
STARTUP_STABILITY_POLL_SECONDS="${STARTUP_STABILITY_POLL_SECONDS:-10}"

# Startup readiness filters and the external service checklist.
STARTUP_IGNORE_PODS_REGEX="${STARTUP_IGNORE_PODS_REGEX:-}"
STARTUP_IGNORE_WORKLOADS_REGEX="${STARTUP_IGNORE_WORKLOADS_REGEX:-}"
STARTUP_WORKLOAD_NAMESPACE_EXCLUDES_REGEX="${STARTUP_WORKLOAD_NAMESPACE_EXCLUDES_REGEX:-^(kube-system|kube-public|kube-node-lease|flux-system)$}"
STARTUP_OPTIONAL_KUSTOMIZATIONS="${STARTUP_OPTIONAL_KUSTOMIZATIONS:-}"
STARTUP_SERVICE_CHECK_TIMEOUT_SECONDS="${STARTUP_SERVICE_CHECK_TIMEOUT_SECONDS:-10}"
STARTUP_SERVICE_CHECKLIST="${STARTUP_SERVICE_CHECKLIST:-}"
STARTUP_INCLUDE_INGRESS_CHECKS="${STARTUP_INCLUDE_INGRESS_CHECKS:-1}"
STARTUP_INGRESS_ALLOWED_STATUSES="${STARTUP_INGRESS_ALLOWED_STATUSES:-200,301,302,307,308,401,403,404}"
STARTUP_IGNORE_INGRESS_HOSTS_REGEX="${STARTUP_IGNORE_INGRESS_HOSTS_REGEX:-}"
STARTUP_INGRESS_CHECK_TIMEOUT_SECONDS="${STARTUP_INGRESS_CHECK_TIMEOUT_SECONDS:-10}"

# Shutdown scale-down exclusions and replica-snapshot policy.
SHUTDOWN_NAMESPACE_EXCLUDES_REGEX="${SHUTDOWN_NAMESPACE_EXCLUDES_REGEX:-^(kube-system|kube-public|kube-node-lease|flux-system|traefik|metallb-system|cert-manager|longhorn-system|vault|postgres|maintenance)$}"
REQUIRE_NONEMPTY_REPLICA_SNAPSHOT="${REQUIRE_NONEMPTY_REPLICA_SNAPSHOT:-1}"

# Mail-safeguard checks performed before declaring startup healthy.
STARTUP_REQUIRE_MAIL_SAFEGUARDS="${STARTUP_REQUIRE_MAIL_SAFEGUARDS:-1}"
MAIL_STARTUP_NAMESPACE="${MAIL_STARTUP_NAMESPACE:-mailu-mailserver}"
MAIL_STARTUP_ENDPOINT_SERVICES="${MAIL_STARTUP_ENDPOINT_SERVICES:-mailu-front,mailu-postfix,mailu-dovecot}"
MAIL_STARTUP_HOST="${MAIL_STARTUP_HOST:-mail.bstein.dev}"
MAIL_STARTUP_TCP_PORTS="${MAIL_STARTUP_TCP_PORTS:-25,465,587,993,995}"
MAIL_STARTUP_TCP_TIMEOUT_SECONDS="${MAIL_STARTUP_TCP_TIMEOUT_SECONDS:-3}"

# Persistent state on the control host.
BUNDLE_HTTP_PORT="${BUNDLE_HTTP_PORT:-8877}"
STATE_ROOT="${HOME}/${STATE_SUBDIR:-.local/share/ananke}"
RECOVERY_STATE_FILE="${STATE_ROOT}/cluster_power_recovery.state"
REPLICA_SNAPSHOT_FILE="${STATE_ROOT}/desired_workload_replicas.tsv"

# Harbor bootstrap targets and the privileged node-helper workload.
HARBOR_BUNDLE_FILE="${STATE_ROOT}/bundles/${HARBOR_BUNDLE_BASENAME:-harbor-bootstrap-v2.14.1-arm64.tar.zst}"
HARBOR_TARGET_NODE="${HARBOR_TARGET_NODE:-}"
HARBOR_CANARY_NODE="${HARBOR_CANARY_NODE:-}"
HARBOR_HOST_LABEL_KEY="${HARBOR_HOST_LABEL_KEY:-ananke.bstein.dev/harbor-bootstrap}"
HARBOR_CANARY_IMAGE="${HARBOR_CANARY_IMAGE:-registry.bstein.dev/bstein/kubectl:1.35.0}"
NODE_HELPER_IMAGE="${NODE_HELPER_IMAGE:-registry.bstein.dev/bstein/ananke-node-helper:0.1.0}"
NODE_HELPER_NAMESPACE="${NODE_HELPER_NAMESPACE:-maintenance}"
NODE_HELPER_SERVICE_ACCOUNT="${NODE_HELPER_SERVICE_ACCOUNT:-default}"
NODE_HELPER_PREWARM_DS="${NODE_HELPER_PREWARM_DS:-ananke-node-helper-prewarm}"
REGISTRY_PULL_SECRET="${REGISTRY_PULL_SECRET:-harbor-regcred}"
KEEP_PREWARM_DAEMONSET=0

# Runtime bookkeeping mutated as the script progresses.
RECOVERY_PENDING=0
STARTUP_ATTEMPTED_DURING_OUTAGE=0
LAST_CHECKPOINT="none"
BUNDLE_SERVER_PID=""
UPS_HOST_IN_USE="" while [[ $# -gt 0 ]]; do case "$1" in --execute) EXECUTE=1 shift ;; --shutdown-mode) SHUTDOWN_MODE="${2:?missing shutdown mode}" shift 2 ;; --expected-flux-branch) EXPECTED_FLUX_BRANCH="${2:?missing branch}" shift 2 ;; --expected-flux-url) EXPECTED_FLUX_URL="${2:?missing flux url}" shift 2 ;; --allow-flux-source-mutation) ALLOW_FLUX_SOURCE_MUTATION=1 shift ;; --force-flux-url) FORCE_FLUX_URL="${2:?missing flux url}" shift 2 ;; --force-flux-branch) FORCE_FLUX_BRANCH="${2:?missing branch}" shift 2 ;; --skip-etcd-snapshot) SKIP_ETCD_SNAPSHOT=1 shift ;; --skip-drain) SKIP_DRAIN=1 shift ;; --skip-local-bootstrap) SKIP_LOCAL_BOOTSTRAP=1 shift ;; --skip-harbor-bootstrap) SKIP_HARBOR_BOOTSTRAP=1 shift ;; --skip-harbor-seed) SKIP_HARBOR_SEED=1 shift ;; --skip-helper-prewarm) SKIP_HELPER_PREWARM=1 shift ;; --ups-host) UPS_HOST="${2:?missing ups host}" shift 2 ;; --ups-battery-key) UPS_BATTERY_KEY="${2:?missing ups key}" shift 2 ;; --min-startup-battery) MIN_STARTUP_BATTERY="${2:?missing battery threshold}" shift 2 ;; --require-ups-battery) REQUIRE_UPS_BATTERY=1 shift ;; --recovery-state-file) RECOVERY_STATE_FILE="${2:?missing state file path}" shift 2 ;; --replica-snapshot-file) REPLICA_SNAPSHOT_FILE="${2:?missing replica snapshot file path}" shift 2 ;; --harbor-bundle-file) HARBOR_BUNDLE_FILE="${2:?missing bundle file path}" shift 2 ;; --harbor-target-node) HARBOR_TARGET_NODE="${2:?missing harbor target node}" shift 2 ;; --harbor-canary-node) HARBOR_CANARY_NODE="${2:?missing harbor canary node}" shift 2 ;; --harbor-host-label-key) HARBOR_HOST_LABEL_KEY="${2:?missing harbor host label key}" shift 2 ;; --harbor-canary-image) HARBOR_CANARY_IMAGE="${2:?missing canary image}" shift 2 ;; --node-helper-image) NODE_HELPER_IMAGE="${2:?missing node helper image}" shift 2 ;; --bundle-http-port) BUNDLE_HTTP_PORT="${2:?missing bundle http port}" shift 2 ;; --api-wait-timeout) API_WAIT_TIMEOUT_SECONDS="${2:?missing api wait timeout}" shift 2 ;; --flux-ready-timeout) 
FLUX_READY_TIMEOUT_SECONDS="${2:?missing flux ready timeout}" shift 2 ;; --startup-checklist-timeout) STARTUP_CHECKLIST_TIMEOUT_SECONDS="${2:?missing startup checklist timeout}" shift 2 ;; --startup-workload-timeout) STARTUP_WORKLOAD_TIMEOUT_SECONDS="${2:?missing startup workload timeout}" shift 2 ;; --startup-stability-window) STARTUP_STABILITY_WINDOW_SECONDS="${2:?missing startup stability window}" shift 2 ;; --startup-stability-timeout) STARTUP_STABILITY_TIMEOUT_SECONDS="${2:?missing startup stability timeout}" shift 2 ;; --drain-timeout) DRAIN_TIMEOUT_SECONDS="${2:?missing drain timeout}" shift 2 ;; --emergency-drain-timeout) EMERGENCY_DRAIN_TIMEOUT_SECONDS="${2:?missing emergency drain timeout}" shift 2 ;; -h|--help) usage exit 0 ;; *) echo "Unknown option: $1" >&2 usage exit 1 ;; esac done case "${SHUTDOWN_MODE}" in host-poweroff|cluster-only) ;; *) echo "Invalid --shutdown-mode '${SHUTDOWN_MODE}'. Expected host-poweroff or cluster-only." >&2 exit 1 ;; esac if [[ -n "${FORCE_FLUX_URL}" && "${ALLOW_FLUX_SOURCE_MUTATION}" -ne 1 ]]; then echo "--force-flux-url requires --allow-flux-source-mutation (breakglass)." >&2 exit 1 fi require_cmd() { local cmd="$1" if ! 
command -v "${cmd}" >/dev/null 2>&1; then echo "Missing required command: ${cmd}" >&2 exit 1 fi } require_cmd kubectl require_cmd bash require_cmd base64 require_cmd curl log() { echo "[cluster-power] $*"; } warn() { echo "[cluster-power][warn] $*" >&2; } die() { echo "[cluster-power][error] $*" >&2; exit 1; } run() { if [[ "${EXECUTE}" -eq 1 ]]; then log "EXEC: $*" "$@" else log "DRY-RUN: $*" fi } run_shell() { if [[ "${EXECUTE}" -eq 1 ]]; then log "EXEC: $*" bash -lc "$*" else log "DRY-RUN: $*" fi } apply_kustomization() { local path="$1" local full_path="${REPO_DIR}/${path}" if [[ "${EXECUTE}" -eq 1 ]]; then log "EXEC: kubectl kustomize ${full_path} --load-restrictor=LoadRestrictionsNone | kubectl apply -f -" kubectl kustomize "${full_path}" --load-restrictor=LoadRestrictionsNone | kubectl apply -f - else log "DRY-RUN: kubectl kustomize ${full_path} --load-restrictor=LoadRestrictionsNone | kubectl apply -f -" fi } sanitize_name() { printf '%s' "$1" | tr '[:upper:]' '[:lower:]' | tr -cs 'a-z0-9-' '-' } state_dir() { dirname "${RECOVERY_STATE_FILE}" } load_recovery_state() { RECOVERY_PENDING=0 STARTUP_ATTEMPTED_DURING_OUTAGE=0 LAST_CHECKPOINT="none" [[ -f "${RECOVERY_STATE_FILE}" ]] || return 0 while IFS='=' read -r key value; do case "${key}" in recovery_pending) RECOVERY_PENDING="${value}" ;; startup_attempted) STARTUP_ATTEMPTED_DURING_OUTAGE="${value}" ;; last_checkpoint) LAST_CHECKPOINT="${value}" ;; esac done < "${RECOVERY_STATE_FILE}" } save_recovery_state() { [[ "${EXECUTE}" -eq 1 ]] || return 0 mkdir -p "$(state_dir)" cat > "${RECOVERY_STATE_FILE}" </dev/null || true LAST_CHECKPOINT="none" } sanitize_battery_percent() { local raw="$1" raw="${raw##*:}" raw="${raw//[[:space:]]/}" raw="${raw%%.*}" [[ "${raw}" =~ ^[0-9]+$ ]] || return 1 printf '%s' "${raw}" } candidate_ups_hosts() { local candidate name local -A seen=() if [[ -n "${UPS_HOST}" ]]; then seen["${UPS_HOST}"]=1 echo "${UPS_HOST}" fi while IFS= read -r name; do [[ -n "${name}" ]] || continue for 
candidate in "${name}@localhost" "${name}"; do [[ -n "${seen[${candidate}]+x}" ]] && continue seen["${candidate}"]=1 echo "${candidate}" done done < <(upsc -l 2>/dev/null || true) } read_ups_battery() { if ! command -v upsc >/dev/null 2>&1; then return 1 fi local host raw parsed while IFS= read -r host; do raw="$(upsc "${host}" "${UPS_BATTERY_KEY}" 2>/dev/null || true)" [[ -n "${raw}" ]] || continue parsed="$(sanitize_battery_percent "${raw}" || true)" [[ -n "${parsed}" ]] || continue UPS_HOST_IN_USE="${host}" printf '%s' "${parsed}" return 0 done < <(candidate_ups_hosts) return 1 } ensure_minimum_battery_for_bootstrap() { local battery battery="$(read_ups_battery || true)" if [[ -z "${battery}" ]]; then if [[ "${REQUIRE_UPS_BATTERY}" -eq 1 ]]; then warn "Unable to read UPS battery status and --require-ups-battery is set." return 1 fi warn "Unable to read UPS battery status; continuing without hard battery gating." return 0 fi log "ups-battery=${battery}% host=${UPS_HOST_IN_USE:-${UPS_HOST}}" if (( battery < MIN_STARTUP_BATTERY )); then warn "UPS battery ${battery}% below minimum startup threshold ${MIN_STARTUP_BATTERY}%." 
return 1 fi return 0 } report_flux_source_state() { local flux_url flux_branch flux_url="$(kubectl -n flux-system get gitrepository flux-system -o jsonpath='{.spec.url}' 2>/dev/null || true)" flux_branch="$(kubectl -n flux-system get gitrepository flux-system -o jsonpath='{.spec.ref.branch}' 2>/dev/null || true)" [[ -n "${flux_url}" ]] && log "flux-source-url=${flux_url}" if [[ -n "${flux_branch}" ]]; then log "flux-source-branch=${flux_branch}" fi } csv_has_value() { local csv="$1" local value="$2" local needle=",${value}," local haystack=",${csv}," [[ "${haystack}" == *"${needle}"* ]] } assert_flux_source_expected() { if [[ "${EXECUTE}" -eq 0 ]]; then log "DRY-RUN: skipping strict Flux source drift guard" return 0 fi local flux_url flux_branch flux_url="$(kubectl -n flux-system get gitrepository flux-system -o jsonpath='{.spec.url}' 2>/dev/null || true)" flux_branch="$(kubectl -n flux-system get gitrepository flux-system -o jsonpath='{.spec.ref.branch}' 2>/dev/null || true)" [[ -n "${flux_url}" ]] || die "Unable to read Flux source URL from flux-system/gitrepository." [[ -n "${flux_branch}" ]] || die "Unable to read Flux source branch from flux-system/gitrepository." if [[ -n "${EXPECTED_FLUX_URL}" && "${flux_url}" != "${EXPECTED_FLUX_URL}" ]]; then die "Flux source URL drift detected: got '${flux_url}', expected '${EXPECTED_FLUX_URL}'. Refusing startup." fi if [[ -z "${FORCE_FLUX_BRANCH}" && "${flux_branch}" != "${EXPECTED_FLUX_BRANCH}" ]]; then die "Flux source branch drift detected: got '${flux_branch}', expected '${EXPECTED_FLUX_BRANCH}'. Use --force-flux-branch to correct." 
fi } kustomization_is_optional() { local name="$1" [[ -n "${STARTUP_OPTIONAL_KUSTOMIZATIONS}" ]] || return 1 csv_has_value "${STARTUP_OPTIONAL_KUSTOMIZATIONS}" "${name}" } list_not_ready_kustomizations() { local rows line name ready message rows="$(kubectl -n flux-system get kustomizations.kustomize.toolkit.fluxcd.io \ -o 'custom-columns=NAME:.metadata.name,READY:.status.conditions[?(@.type=="Ready")].status,MESSAGE:.status.conditions[?(@.type=="Ready")].message' \ --no-headers 2>/dev/null || true)" [[ -n "${rows}" ]] || return 0 while IFS= read -r line; do [[ -n "${line}" ]] || continue name="$(awk '{print $1}' <<< "${line}")" ready="$(awk '{print $2}' <<< "${line}")" message="${line#${name} }" message="${message#${ready} }" if kustomization_is_optional "${name}"; then continue fi if [[ "${ready}" != "True" ]]; then printf '%s|%s\n' "${name}" "${message}" fi done <<< "${rows}" } trigger_flux_reconcile_all() { local now now="$(date --iso-8601=seconds)" run kubectl -n flux-system annotate kustomizations.kustomize.toolkit.fluxcd.io --all reconcile.fluxcd.io/requestedAt="${now}" --overwrite if command -v flux >/dev/null 2>&1; then run flux reconcile source git flux-system -n flux-system --timeout=3m fi } heal_failed_flux_jobs() { local rows line ns name failed flux_owner helm_owner healed healed=0 rows="$(kubectl get jobs.batch -A \ -o custom-columns=NS:.metadata.namespace,NAME:.metadata.name,FAILED:.status.failed,FLUX_OWNER:.metadata.labels.kustomize\\.toolkit\\.fluxcd\\.io/name,HELM_OWNER:.metadata.labels.helm\\.toolkit\\.fluxcd\\.io/name \ --no-headers 2>/dev/null || true)" [[ -n "${rows}" ]] || return 1 while IFS= read -r line; do [[ -n "${line}" ]] || continue ns="$(awk '{print $1}' <<< "${line}")" name="$(awk '{print $2}' <<< "${line}")" failed="$(awk '{print $3}' <<< "${line}")" flux_owner="$(awk '{print $4}' <<< "${line}")" helm_owner="$(awk '{print $5}' <<< "${line}")" [[ "${failed}" != "" ]] || continue [[ "${failed}" =~ ^[0-9]+$ ]] || continue (( failed > 0 
)) || continue if [[ "${flux_owner}" == "" && "${helm_owner}" == "" ]]; then continue fi warn "Deleting failed Flux-managed Job ${ns}/${name} to heal immutable-template drift." run kubectl -n "${ns}" delete job "${name}" --ignore-not-found healed=1 done <<< "${rows}" (( healed == 1 )) } wait_for_flux_kustomizations_ready() { if [[ "${EXECUTE}" -eq 0 ]]; then log "DRY-RUN: skipping wait for all Flux kustomizations Ready" return 0 fi local start now not_ready immutable_hits start="$(date +%s)" immutable_hits=0 while true; do not_ready="$(list_not_ready_kustomizations || true)" if [[ -z "${not_ready}" ]]; then log "flux-kustomizations=all-ready" return 0 fi log "flux-kustomizations-not-ready:" while IFS= read -r line; do [[ -n "${line}" ]] || continue log " ${line}" done <<< "${not_ready}" if grep -Eqi 'immutable|field is immutable|cannot patch.*Job|Job.*invalid' <<< "${not_ready}"; then if (( immutable_hits < 3 )); then immutable_hits=$(( immutable_hits + 1 )) warn "Detected immutable Job failure signal in Flux status. Attempting automated Job cleanup (${immutable_hits}/3)." if heal_failed_flux_jobs; then trigger_flux_reconcile_all fi fi fi now="$(date +%s)" if (( now - start >= FLUX_READY_TIMEOUT_SECONDS )); then die "Timed out waiting for Flux kustomizations Ready after ${FLUX_READY_TIMEOUT_SECONDS}s." 
fi sleep "${FLUX_READY_POLL_SECONDS}" done } default_startup_service_checklist() { cat <<'CHECKS' gitea|https://scm.bstein.dev/api/healthz|200|"status":"pass"|| grafana|https://metrics.bstein.dev/api/health|200|"database":"ok"|| harbor|https://registry.bstein.dev/v2/|200,401||| CHECKS } list_ingress_hosts() { kubectl get ingress -A -o jsonpath='{range .items[*]}{range .spec.rules[*]}{.host}{"\n"}{end}{end}' 2>/dev/null \ | sed '/^[[:space:]]*$/d' \ | sort -u } generated_ingress_service_checks() { local host while IFS= read -r host; do [[ -n "${host}" ]] || continue if [[ -n "${STARTUP_IGNORE_INGRESS_HOSTS_REGEX}" ]] && [[ "${host}" =~ ${STARTUP_IGNORE_INGRESS_HOSTS_REGEX} ]]; then continue fi printf 'ingress-%s|https://%s/|%s|||0|%s\n' "${host}" "${host}" "${STARTUP_INGRESS_ALLOWED_STATUSES}" "${STARTUP_INGRESS_CHECK_TIMEOUT_SECONDS}" done < <(list_ingress_hosts) } startup_service_checklist_rows() { local base if [[ -n "${STARTUP_SERVICE_CHECKLIST}" ]]; then base="$(printf '%s' "${STARTUP_SERVICE_CHECKLIST}" | tr ';' '\n')" else base="$(default_startup_service_checklist)" fi printf '%s\n' "${base}" | sed '/^[[:space:]]*$/d' if [[ "${STARTUP_INCLUDE_INGRESS_CHECKS}" == "1" || "${STARTUP_INCLUDE_INGRESS_CHECKS}" == "true" ]]; then generated_ingress_service_checks fi } service_status_allowed() { local expected_csv="$1" local got="$2" local token IFS=',' read -r -a _statuses <<< "${expected_csv}" for token in "${_statuses[@]}"; do if [[ "${token}" == "${got}" ]]; then return 0 fi done return 1 } check_mail_safeguards_once() { local quiet="${1:-0}" local failures=0 namespace service host port ips local -a services=() ports=() if [[ "${STARTUP_REQUIRE_MAIL_SAFEGUARDS}" != "1" && "${STARTUP_REQUIRE_MAIL_SAFEGUARDS}" != "true" ]]; then return 0 fi namespace="${MAIL_STARTUP_NAMESPACE}" as_array_from_csv "${MAIL_STARTUP_ENDPOINT_SERVICES}" services for service in "${services[@]}"; do service="${service//[[:space:]]/}" [[ -n "${service}" ]] || continue ips="$(kubectl -n 
"${namespace}" get endpoints "${service}" -o jsonpath='{.subsets[*].addresses[*].ip}' 2>/dev/null || true)" if [[ -z "${ips//[[:space:]]/}" ]]; then if [[ "${quiet}" != "1" ]]; then warn "startup-check mail-endpoints ${namespace}/${service}: no ready endpoints." fi failures=1 fi done host="${MAIL_STARTUP_HOST}" if [[ -n "${host}" ]]; then as_array_from_csv "${MAIL_STARTUP_TCP_PORTS}" ports for port in "${ports[@]}"; do port="${port//[[:space:]]/}" [[ "${port}" =~ ^[0-9]+$ ]] || continue if ! timeout "${MAIL_STARTUP_TCP_TIMEOUT_SECONDS}" bash -lc "/dev/null 2>&1; then if [[ "${quiet}" != "1" ]]; then warn "startup-check mail-tcp ${host}:${port}: connect failed." fi failures=1 fi done fi (( failures == 0 )) } check_startup_service_checklist_once() { local rows row name url expected body_must body_must_not insecure timeout code rc local body_file failures failures=0 rows="$(startup_service_checklist_rows)" while IFS= read -r row; do [[ -n "${row}" ]] || continue IFS='|' read -r name url expected body_must body_must_not insecure timeout <<< "${row}" [[ -n "${name}" && -n "${url}" && -n "${expected}" ]] || continue [[ -n "${insecure}" ]] || insecure=0 [[ -n "${timeout}" ]] || timeout="${STARTUP_SERVICE_CHECK_TIMEOUT_SECONDS}" body_file="$(mktemp)" rc=0 if [[ "${insecure}" == "1" || "${insecure}" == "true" ]]; then code="$(curl -ksS --max-time "${timeout}" -o "${body_file}" -w '%{http_code}' "${url}" || rc=$?)" else code="$(curl -sS --max-time "${timeout}" -o "${body_file}" -w '%{http_code}' "${url}" || rc=$?)" fi if (( rc != 0 )); then warn "startup-check ${name}: request failed (rc=${rc}) url=${url}" failures=1 rm -f "${body_file}" continue fi if ! service_status_allowed "${expected}" "${code}"; then warn "startup-check ${name}: expected status ${expected}, got ${code} url=${url}" failures=1 rm -f "${body_file}" continue fi if [[ -n "${body_must}" ]] && ! 
grep -Fq -- "${body_must}" "${body_file}"; then warn "startup-check ${name}: missing required body fragment '${body_must}'" failures=1 rm -f "${body_file}" continue fi if [[ -n "${body_must_not}" ]] && grep -Fq -- "${body_must_not}" "${body_file}"; then warn "startup-check ${name}: forbidden body fragment '${body_must_not}' present" failures=1 rm -f "${body_file}" continue fi rm -f "${body_file}" done <<< "${rows}" if ! check_mail_safeguards_once; then failures=1 fi (( failures == 0 )) } wait_for_startup_service_checklist() { if [[ "${EXECUTE}" -eq 0 ]]; then log "DRY-RUN: skipping startup external service checklist wait" return 0 fi local start now checklist_ok workloads_ok start="$(date +%s)" while true; do checklist_ok=0 workloads_ok=0 if check_startup_service_checklist_once; then checklist_ok=1 fi if list_unhealthy_workloads | sed '/^[[:space:]]*$/d' | grep -q .; then workloads_ok=0 else workloads_ok=1 fi if (( checklist_ok == 1 && workloads_ok == 1 )); then log "startup-checklist=all-passed" return 0 fi if (( workloads_ok == 0 )); then warn "startup-checklist: workloads are not fully ready yet." fi now="$(date +%s)" if (( now - start >= STARTUP_CHECKLIST_TIMEOUT_SECONDS )); then die "Timed out waiting for startup external checklist after ${STARTUP_CHECKLIST_TIMEOUT_SECONDS}s." 
fi sleep "${STARTUP_CHECKLIST_POLL_SECONDS}" done } collect_unstable_pods() { local rows rows="$(kubectl get pods -A --no-headers 2>/dev/null \ | awk '$4 ~ /(CrashLoopBackOff|ImagePullBackOff|ErrImagePull|CreateContainerConfigError|RunContainerError|InvalidImageName)/ {print $1 "/" $2 "|" $4}' || true)" if [[ -n "${STARTUP_IGNORE_PODS_REGEX}" ]]; then rows="$(printf '%s\n' "${rows}" | grep -Ev "${STARTUP_IGNORE_PODS_REGEX}" || true)" fi printf '%s' "${rows}" } wait_for_startup_stability_window() { if [[ "${EXECUTE}" -eq 0 ]]; then log "DRY-RUN: skipping startup stability window" return 0 fi local hard_deadline stable_since now unstable pods not_ready unhealthy_workloads stable_since="$(date +%s)" hard_deadline=$(( stable_since + STARTUP_STABILITY_TIMEOUT_SECONDS )) while true; do unstable=0 not_ready="$(list_not_ready_kustomizations || true)" if [[ -n "${not_ready}" ]]; then unstable=1 warn "stability-window: Flux kustomizations not ready." fi pods="$(collect_unstable_pods || true)" if [[ -n "${pods}" ]]; then unstable=1 warn "stability-window: unstable pods detected." while IFS= read -r line; do [[ -n "${line}" ]] || continue warn " ${line}" done <<< "${pods}" fi if ! check_startup_service_checklist_once; then unstable=1 warn "stability-window: external service checklist failed." fi unhealthy_workloads="$(list_unhealthy_workloads || true)" if [[ -n "${unhealthy_workloads}" ]]; then unstable=1 warn "stability-window: workloads not fully ready." while IFS= read -r line; do [[ -n "${line}" ]] || continue warn " ${line}" done <<< "${unhealthy_workloads}" fi now="$(date +%s)" if (( unstable == 0 )); then if (( now - stable_since >= STARTUP_STABILITY_WINDOW_SECONDS )); then log "startup-stability-window=passed (${STARTUP_STABILITY_WINDOW_SECONDS}s)" return 0 fi else stable_since="${now}" fi if (( now >= hard_deadline )); then die "Timed out waiting for startup stability window (${STARTUP_STABILITY_WINDOW_SECONDS}s healthy) within ${STARTUP_STABILITY_TIMEOUT_SECONDS}s." 
fi sleep "${STARTUP_STABILITY_POLL_SECONDS}" done } wait_for_api() { local attempts=$(( API_WAIT_TIMEOUT_SECONDS / 5 )) if (( attempts < 1 )); then attempts=1 fi if [[ "${EXECUTE}" -eq 0 ]]; then log "DRY-RUN: skipping live Kubernetes API wait" return 0 fi local i for i in $(seq 1 "${attempts}"); do if kubectl version --request-timeout=5s >/dev/null 2>&1; then return 0 fi sleep 5 done return 1 } patch_flux_suspend_all() { local value="$1" local patch patch=$(printf '{"spec":{"suspend":%s}}' "${value}") local ks_list hr_list ks_list="$(kubectl -n flux-system get kustomizations.kustomize.toolkit.fluxcd.io -o jsonpath='{range .items[*]}{.metadata.name}{"\n"}{end}' || true)" hr_list="$(kubectl get helmreleases.helm.toolkit.fluxcd.io -A -o jsonpath='{range .items[*]}{.metadata.namespace}{"/"}{.metadata.name}{"\n"}{end}' || true)" while IFS= read -r k; do [[ -z "${k}" ]] && continue run kubectl -n flux-system patch kustomization "${k}" --type=merge -p "${patch}" done <<< "${ks_list}" while IFS= read -r hr; do [[ -z "${hr}" ]] && continue local ns="${hr%%/*}" local name="${hr##*/}" run kubectl -n "${ns}" patch helmrelease "${name}" --type=merge -p "${patch}" done <<< "${hr_list}" } shutdown_namespace_excluded() { local ns="$1" [[ "${ns}" =~ ${SHUTDOWN_NAMESPACE_EXCLUDES_REGEX} ]] } startup_workload_namespace_excluded() { local ns="$1" [[ "${ns}" =~ ${STARTUP_WORKLOAD_NAMESPACE_EXCLUDES_REGEX} ]] } best_effort_scale_down_apps() { local ns_list ns ns_list="$(kubectl get ns -o jsonpath='{range .items[*]}{.metadata.name}{"\n"}{end}')" while IFS= read -r ns; do [[ -z "${ns}" ]] && continue if shutdown_namespace_excluded "${ns}"; then continue fi run_shell "kubectl -n ${ns} scale deployment --all --replicas=0 || true" run_shell "kubectl -n ${ns} scale statefulset --all --replicas=0 || true" done <<< "${ns_list}" } save_workload_replica_snapshot() { local rows line ns kind name replicas if [[ "${EXECUTE}" -eq 0 ]]; then log "DRY-RUN: save workload replica snapshot to 
${REPLICA_SNAPSHOT_FILE}" return 0 fi rows="$( { kubectl get deployment -A -o jsonpath='{range .items[*]}{.metadata.namespace}{"\tdeployment\t"}{.metadata.name}{"\t"}{.spec.replicas}{"\n"}{end}' 2>/dev/null || true kubectl get statefulset -A -o jsonpath='{range .items[*]}{.metadata.namespace}{"\tstatefulset\t"}{.metadata.name}{"\t"}{.spec.replicas}{"\n"}{end}' 2>/dev/null || true } | sed '/^[[:space:]]*$/d' )" mkdir -p "$(dirname "${REPLICA_SNAPSHOT_FILE}")" : > "${REPLICA_SNAPSHOT_FILE}" while IFS=$'\t' read -r ns kind name replicas; do [[ -n "${ns}" && -n "${kind}" && -n "${name}" && -n "${replicas}" ]] || continue shutdown_namespace_excluded "${ns}" && continue [[ "${replicas}" =~ ^[0-9]+$ ]] || continue (( replicas > 0 )) || continue printf '%s\t%s\t%s\t%s\n' "${ns}" "${kind}" "${name}" "${replicas}" >> "${REPLICA_SNAPSHOT_FILE}" done <<< "${rows}" log "replica-snapshot-file=${REPLICA_SNAPSHOT_FILE}" log "replica-snapshot-count=$(replica_snapshot_count)" } replica_snapshot_count() { if [[ -f "${REPLICA_SNAPSHOT_FILE}" ]]; then wc -l < "${REPLICA_SNAPSHOT_FILE}" | tr -d ' ' else printf '0' fi } restore_workload_replica_snapshot() { local ns kind name desired current if [[ "${RECOVERY_PENDING}" -ne 1 ]]; then log "Skipping replica restore because recovery_pending=0." return 0 fi if [[ ! -f "${REPLICA_SNAPSHOT_FILE}" ]]; then warn "Replica snapshot file not found at ${REPLICA_SNAPSHOT_FILE}; skipping replica restore." 
return 0 fi while IFS=$'\t' read -r ns kind name desired; do [[ -n "${ns}" && -n "${kind}" && -n "${name}" && -n "${desired}" ]] || continue [[ "${desired}" =~ ^[0-9]+$ ]] || continue (( desired > 0 )) || continue current="$(kubectl -n "${ns}" get "${kind}" "${name}" -o jsonpath='{.spec.replicas}' 2>/dev/null || true)" [[ -n "${current}" ]] || continue [[ "${current}" =~ ^[0-9]+$ ]] || current=0 if (( current == desired )); then continue fi run kubectl -n "${ns}" scale "${kind}" "${name}" --replicas="${desired}" done < "${REPLICA_SNAPSHOT_FILE}" mark_checkpoint startup_replicas_restored } restore_zero_scaled_helm_workloads() { local rows ns kind name local restored=0 rows="$( { kubectl get deployment -A -o custom-columns=NS:.metadata.namespace,NAME:.metadata.name,REPLICAS:.spec.replicas,HELM:.metadata.annotations.meta\\.helm\\.sh/release-name --no-headers 2>/dev/null \ | awk '$3 ~ /^[0-9]+$/ && $3 == 0 && $4 != "" {printf "%s\tdeployment\t%s\n", $1, $2}' kubectl get statefulset -A -o custom-columns=NS:.metadata.namespace,NAME:.metadata.name,REPLICAS:.spec.replicas,HELM:.metadata.annotations.meta\\.helm\\.sh/release-name --no-headers 2>/dev/null \ | awk '$3 ~ /^[0-9]+$/ && $3 == 0 && $4 != "" {printf "%s\tstatefulset\t%s\n", $1, $2}' } | sed '/^[[:space:]]*$/d' )" while IFS=$'\t' read -r ns kind name; do [[ -n "${ns}" && -n "${kind}" && -n "${name}" ]] || continue startup_workload_namespace_excluded "${ns}" && continue if [[ -n "${STARTUP_IGNORE_WORKLOADS_REGEX}" ]] && [[ "${ns}/${name}" =~ ${STARTUP_IGNORE_WORKLOADS_REGEX} ]]; then continue fi warn "Auto-heal: restoring zero-scaled Helm workload ${ns}/${kind}/${name} to replicas=1." run kubectl -n "${ns}" scale "${kind}" "${name}" --replicas=1 restored=$((restored + 1)) done <<< "${rows}" if (( restored > 0 )); then log "Auto-heal: restored ${restored} zero-scaled Helm workloads." mark_checkpoint startup_zero_scaled_helm_restored else log "Auto-heal: no zero-scaled Helm workloads detected." 
fi } list_unhealthy_workloads() { local rows line ns name desired ready available rows="$(kubectl get deployment -A -o custom-columns=NS:.metadata.namespace,NAME:.metadata.name,DESIRED:.spec.replicas,READY:.status.readyReplicas,AVAILABLE:.status.availableReplicas --no-headers 2>/dev/null || true)" while IFS= read -r line; do [[ -n "${line}" ]] || continue ns="$(awk '{print $1}' <<< "${line}")" name="$(awk '{print $2}' <<< "${line}")" desired="$(awk '{print $3}' <<< "${line}")" ready="$(awk '{print $4}' <<< "${line}")" available="$(awk '{print $5}' <<< "${line}")" startup_workload_namespace_excluded "${ns}" && continue [[ -n "${STARTUP_IGNORE_WORKLOADS_REGEX}" && "${ns}/${name}" =~ ${STARTUP_IGNORE_WORKLOADS_REGEX} ]] && continue [[ "${desired}" =~ ^[0-9]+$ ]] || desired=0 [[ "${ready}" =~ ^[0-9]+$ ]] || ready=0 [[ "${available}" =~ ^[0-9]+$ ]] || available=0 (( desired > 0 )) || continue if (( ready < desired || available < desired )); then printf '%s/deployment/%s|ready=%s available=%s desired=%s\n' "${ns}" "${name}" "${ready}" "${available}" "${desired}" fi done <<< "${rows}" rows="$(kubectl get statefulset -A -o custom-columns=NS:.metadata.namespace,NAME:.metadata.name,DESIRED:.spec.replicas,READY:.status.readyReplicas --no-headers 2>/dev/null || true)" while IFS= read -r line; do [[ -n "${line}" ]] || continue ns="$(awk '{print $1}' <<< "${line}")" name="$(awk '{print $2}' <<< "${line}")" desired="$(awk '{print $3}' <<< "${line}")" ready="$(awk '{print $4}' <<< "${line}")" startup_workload_namespace_excluded "${ns}" && continue [[ -n "${STARTUP_IGNORE_WORKLOADS_REGEX}" && "${ns}/${name}" =~ ${STARTUP_IGNORE_WORKLOADS_REGEX} ]] && continue [[ "${desired}" =~ ^[0-9]+$ ]] || desired=0 [[ "${ready}" =~ ^[0-9]+$ ]] || ready=0 (( desired > 0 )) || continue if (( ready < desired )); then printf '%s/statefulset/%s|ready=%s desired=%s\n' "${ns}" "${name}" "${ready}" "${desired}" fi done <<< "${rows}" } wait_for_startup_workloads_ready() { if [[ "${EXECUTE}" -eq 0 ]]; 
# (continuation of wait_for_startup_workloads_ready: dry-run guard + poll loop)
  then
    log "DRY-RUN: skipping startup workload readiness checks"
    return 0
  fi
  local start now unhealthy
  start="$(date +%s)"
  # Poll until no unhealthy workloads remain, or the timeout elapses.
  while true; do
    unhealthy="$(list_unhealthy_workloads || true)"
    if [[ -z "${unhealthy}" ]]; then
      log "startup-workloads=all-ready"
      return 0
    fi
    warn "startup-workloads-not-ready:"
    while IFS= read -r line; do
      [[ -n "${line}" ]] || continue
      warn "  ${line}"
    done <<< "${unhealthy}"
    now="$(date +%s)"
    if (( now - start >= STARTUP_WORKLOAD_TIMEOUT_SECONDS )); then
      die "Timed out waiting for startup workloads Ready after ${STARTUP_WORKLOAD_TIMEOUT_SECONDS}s."
    fi
    sleep "${STARTUP_WORKLOAD_POLL_SECONDS}"
  done
}

# Print a comma-separated list of Ready worker nodes (nodes without a
# control-plane or master role label).
# BUGFIX: kubectl custom-columns prints "<none>" for an absent label, and awk
# whitespace splitting can never yield an empty middle field, so the previous
# `$2=="" && $3==""` filter matched nothing and the worker list was always
# empty; compare against "<none>" instead.
discover_workers_csv() {
  kubectl get nodes \
    -o 'custom-columns=NAME:.metadata.name,CP:.metadata.labels.node-role\.kubernetes\.io/control-plane,MASTER:.metadata.labels.node-role\.kubernetes\.io/master,READY:.status.conditions[?(@.type=="Ready")].status' \
    --no-headers \
    | awk '$2=="<none>" && $3=="<none>" && $4=="True" {print $1}' \
    | paste -sd, -
}

# Return 0 iff the named node exists and its Ready condition is "True".
node_is_ready() {
  local node="$1"
  [[ -n "${node}" ]] || return 1
  local ready
  ready="$(kubectl get node "${node}" -o jsonpath='{range .status.conditions[?(@.type=="Ready")]}{.status}{end}' 2>/dev/null || true)"
  [[ "${ready}" == "True" ]]
}

# Pick a Ready arm64 worker, preferring hardware=rpi5, then rpi4, then any
# Ready arm64 worker. Prints the node name; returns 1 when none qualifies.
select_ready_arm64_worker() {
  local rows node hw
  rows="$(kubectl get nodes -o 'custom-columns=NAME:.metadata.name,ARCH:.metadata.labels.kubernetes\.io/arch,WORKER:.metadata.labels.node-role\.kubernetes\.io/worker,HARDWARE:.metadata.labels.hardware,READY:.status.conditions[?(@.type=="Ready")].status' --no-headers 2>/dev/null || true)"
  [[ -n "${rows}" ]] || return 1
  # Hardware preference tiers, most desirable first (passed via awk -v so the
  # filter program stays a single static string).
  for hw in rpi5 rpi4; do
    node="$(printf '%s\n' "${rows}" | awk -v hw="${hw}" '$2=="arm64" && $3=="true" && $4==hw && $5=="True" {print $1; exit}')"
    if [[ -n "${node}" ]]; then
      printf '%s' "${node}"
      return 0
    fi
  done
  # Last resort: any Ready arm64 worker, regardless of hardware label.
  node="$(printf '%s\n' "${rows}" | awk '$2=="arm64" && $3=="true" && $5=="True" {print $1; exit}')"
  if [[ -n "${node}" ]]; then
    printf '%s' "${node}"
    return 0
  fi
  return 1
}

# Ensure HARBOR_TARGET_NODE names a Ready node; otherwise fall back to any
# Ready arm64 worker (dying if no candidate exists).
ensure_harbor_target_node() {
  if node_is_ready "${HARBOR_TARGET_NODE}"; then
    return 0
  fi
  local fallback
  fallback="$(select_ready_arm64_worker || true)"
  [[ -n "${fallback}" ]] || die "No Ready arm64 worker available for Harbor bootstrap target."
  if [[ -n "${HARBOR_TARGET_NODE}" ]]; then
    warn "Configured harbor target node '${HARBOR_TARGET_NODE}' is not Ready; using '${fallback}' instead."
  else
    log "harbor-target-node auto-selected: ${fallback}"
  fi
  HARBOR_TARGET_NODE="${fallback}"
}

# Pin HARBOR_HOST_LABEL_KEY=true to exactly HARBOR_TARGET_NODE: strip the
# label from every other node, then (re)apply it to the target.
ensure_harbor_host_label() {
  [[ -n "${HARBOR_TARGET_NODE}" ]] || die "Harbor target node is not set."
  local labeled node
  labeled="$(kubectl get nodes -l "${HARBOR_HOST_LABEL_KEY}=true" -o jsonpath='{range .items[*]}{.metadata.name}{"\n"}{end}' 2>/dev/null || true)"
  while IFS= read -r node; do
    [[ -z "${node}" ]] && continue
    [[ "${node}" == "${HARBOR_TARGET_NODE}" ]] && continue
    run kubectl label node "${node}" "${HARBOR_HOST_LABEL_KEY}-"
  done <<< "${labeled}"
  run kubectl label node "${HARBOR_TARGET_NODE}" "${HARBOR_HOST_LABEL_KEY}=true" --overwrite
}

# Split a comma-separated string into the array variable named by $2.
# Uses eval rather than a nameref for portability with bash < 4.3.
as_array_from_csv() {
  local csv="$1"
  local out_var="$2"
  local old_ifs="${IFS}"
  IFS=',' read -r -a _tmp <<< "${csv}"
  IFS="${old_ifs}"
  eval "${out_var}"'=( "${_tmp[@]}" )'
}

# Cordon then drain each worker, escalating per node:
# gentle drain -> --force -> --force --disable-eviction (best effort).
best_effort_drain_workers() {
  local timeout_seconds="$1"
  shift || true
  local workers=("$@")
  local node
  for node in "${workers[@]}"; do
    [[ -z "${node}" ]] && continue
    run kubectl cordon "${node}"
    if run_shell "kubectl drain ${node} --ignore-daemonsets --delete-emptydir-data --grace-period=30 --timeout=${timeout_seconds}s"; then
      continue
    fi
    warn "Gentle drain timed out for ${node}; retrying with --force."
    if run_shell "kubectl drain ${node} --ignore-daemonsets --delete-emptydir-data --grace-period=30 --timeout=${timeout_seconds}s --force"; then
      continue
    fi
    warn "Force drain timed out for ${node}; final attempt with --disable-eviction."
run_shell "kubectl drain ${node} --ignore-daemonsets --delete-emptydir-data --grace-period=30 --timeout=${timeout_seconds}s --force --disable-eviction || true" done } wait_for_rollout() { local namespace="$1" local kind="$2" local name="$3" local timeout="$4" if [[ "${EXECUTE}" -eq 0 ]]; then log "DRY-RUN: kubectl -n ${namespace} rollout status ${kind}/${name} --timeout=${timeout}" return 0 fi kubectl -n "${namespace}" rollout status "${kind}/${name}" --timeout="${timeout}" } check_ingress_stack() { kubectl get ingressclass traefik >/dev/null wait_for_rollout traefik deployment traefik 5m } check_longhorn_stack() { wait_for_rollout longhorn-system daemonset longhorn-manager 10m wait_for_rollout longhorn-system deployment longhorn-ui 10m } check_vault_stack() { wait_for_rollout vault statefulset vault 10m if [[ "${EXECUTE}" -eq 1 ]]; then kubectl -n vault exec vault-0 -- sh -ceu 'VAULT_ADDR=http://127.0.0.1:8200 vault status >/dev/null' fi } check_postgres_stack() { wait_for_rollout postgres statefulset postgres 10m if [[ "${EXECUTE}" -eq 1 ]]; then kubectl -n postgres exec postgres-0 -c postgres -- sh -ceu 'pg_isready -h 127.0.0.1 -p 5432 >/dev/null' fi } check_gitea_stack() { wait_for_rollout gitea deployment gitea 10m } check_harbor_stack() { wait_for_rollout harbor statefulset harbor-redis 10m wait_for_rollout harbor deployment harbor-core 10m wait_for_rollout harbor deployment harbor-jobservice 10m wait_for_rollout harbor deployment harbor-portal 10m wait_for_rollout harbor deployment harbor-registry 10m } check_harbor_endpoint() { if [[ "${EXECUTE}" -eq 0 ]]; then log "DRY-RUN: curl -ksS -o /dev/null -w '%{http_code}' https://registry.bstein.dev/v2/" return 0 fi local code code="$(curl -ksS -o /dev/null -w '%{http_code}' https://registry.bstein.dev/v2/ || true)" case "${code}" in 200|401) log "harbor-endpoint=http-${code}" ;; *) die "Harbor endpoint check failed with HTTP ${code:-unknown}" ;; esac } wait_for_pod_phase() { local namespace="$1" local pod="$2" 
local expected_phase="$3" local timeout_seconds="$4" local start now phase start="$(date +%s)" while true; do phase="$(kubectl -n "${namespace}" get pod "${pod}" -o jsonpath='{.status.phase}' 2>/dev/null || true)" if [[ "${phase}" == "${expected_phase}" ]]; then return 0 fi if [[ "${phase}" == "Failed" ]]; then return 1 fi now="$(date +%s)" if (( now - start >= timeout_seconds )); then return 1 fi sleep 2 done } harbor_is_ready() { kubectl -n harbor get deploy harbor-core harbor-jobservice harbor-portal harbor-registry >/dev/null 2>&1 || return 1 local code code="$(curl -ksS -o /dev/null -w '%{http_code}' https://registry.bstein.dev/v2/ || true)" [[ "${code}" == "200" || "${code}" == "401" ]] } run_harbor_pull_canary() { local pod="ananke-harbor-canary" local canary_node="${HARBOR_CANARY_NODE}" if ! node_is_ready "${canary_node}"; then ensure_harbor_target_node canary_node="${HARBOR_TARGET_NODE}" if [[ -n "${HARBOR_CANARY_NODE}" ]]; then warn "Configured harbor canary node '${HARBOR_CANARY_NODE}' is not Ready; using '${canary_node}'." 
# NOTE(review): this region appears damaged by extraction — the heredoc bodies
# that `cat <<…` fed to `kubectl apply -f -` (the canary pod manifest and the
# helper pod manifest that decodes ${encoded_script} into /tmp/ananke-step.sh,
# terminated by the POD delimiter) are missing, leaving fragments such as
# `cat <&2` and `cat </tmp/ananke-step.sh`. Left byte-identical on purpose;
# recover the original manifests from version control before editing.
# run_host_command_via_helper is also truncated mid-heredoc below.
fi HARBOR_CANARY_NODE="${canary_node}" fi if [[ "${EXECUTE}" -eq 0 ]]; then log "DRY-RUN: create Harbor pull canary pod with ${HARBOR_CANARY_IMAGE} on ${canary_node}" return 0 fi timeout 20 kubectl -n "${NODE_HELPER_NAMESPACE}" delete pod "${pod}" --ignore-not-found --wait=false >/dev/null 2>&1 || true cat <&2 || true timeout 20 kubectl -n "${NODE_HELPER_NAMESPACE}" logs "${pod}" >&2 || true timeout 20 kubectl -n "${NODE_HELPER_NAMESPACE}" delete pod "${pod}" --ignore-not-found --wait=false >/dev/null 2>&1 || true return 1 fi timeout 20 kubectl -n "${NODE_HELPER_NAMESPACE}" logs "${pod}" || true timeout 20 kubectl -n "${NODE_HELPER_NAMESPACE}" delete pod "${pod}" --ignore-not-found --wait=false >/dev/null 2>&1 || true } run_helper_pod() { local node="$1" local purpose="$2" local timeout_seconds="$3" local script_content="$4" local pod="ananke-$(sanitize_name "${purpose}")-$(date +%H%M%S)" local encoded_script encoded_script="$(printf '%s' "${script_content}" | base64 -w0)" if [[ "${EXECUTE}" -eq 0 ]]; then log "DRY-RUN: helper pod ${pod} on ${node} for ${purpose}" return 0 fi cat </tmp/ananke-step.sh chmod +x /tmp/ananke-step.sh /tmp/ananke-step.sh POD if ! wait_for_pod_phase "${NODE_HELPER_NAMESPACE}" "${pod}" Succeeded "${timeout_seconds}"; then kubectl -n "${NODE_HELPER_NAMESPACE}" describe pod "${pod}" >&2 || true timeout 20 kubectl -n "${NODE_HELPER_NAMESPACE}" logs "${pod}" >&2 || true timeout 20 kubectl -n "${NODE_HELPER_NAMESPACE}" delete pod "${pod}" --ignore-not-found --wait=false >/dev/null 2>&1 || true return 1 fi timeout 20 kubectl -n "${NODE_HELPER_NAMESPACE}" logs "${pod}" || true timeout 20 kubectl -n "${NODE_HELPER_NAMESPACE}" delete pod "${pod}" --ignore-not-found --wait=false >/dev/null 2>&1 || true } run_host_command_via_helper() { local node="$1" local purpose="$2" local timeout_seconds="$3" local host_command="$4" local encoded_command encoded_command="$(printf '%s' "${host_command}" | base64 -w0)" local script_content script_content=$(cat <