From 65de56b2aca1bf06cf99a3d542183251b7e30517 Mon Sep 17 00:00:00 2001
From: Brad Stein
Date: Mon, 6 Apr 2026 04:47:05 -0300
Subject: [PATCH] hecate: harden titan-24 cleanup and ups telemetry

---
 scripts/bootstrap/recovery-config.env |  2 +
 scripts/cluster_power_recovery.sh     | 90 ++++++++++++++++++++++-----
 2 files changed, 78 insertions(+), 14 deletions(-)

diff --git a/scripts/bootstrap/recovery-config.env b/scripts/bootstrap/recovery-config.env
index 2db3deaa..c2f789d9 100644
--- a/scripts/bootstrap/recovery-config.env
+++ b/scripts/bootstrap/recovery-config.env
@@ -10,3 +10,5 @@ NODE_HELPER_NAMESPACE="maintenance"
 NODE_HELPER_SERVICE_ACCOUNT="default"
 REGISTRY_PULL_SECRET="harbor-regcred"
 BUNDLE_HTTP_PORT="8877"
+UPS_HOST="pyrphoros@localhost"
+UPS_BATTERY_KEY="battery.charge"
diff --git a/scripts/cluster_power_recovery.sh b/scripts/cluster_power_recovery.sh
index 5764accd..9efafff1 100755
--- a/scripts/cluster_power_recovery.sh
+++ b/scripts/cluster_power_recovery.sh
@@ -78,10 +78,10 @@ SKIP_LOCAL_BOOTSTRAP=0
 SKIP_HARBOR_BOOTSTRAP=0
 SKIP_HARBOR_SEED=0
 SKIP_HELPER_PREWARM=0
-UPS_HOST="ups@localhost"
-UPS_BATTERY_KEY="battery.charge"
-MIN_STARTUP_BATTERY=35
-REQUIRE_UPS_BATTERY=0
+UPS_HOST="${UPS_HOST:-ups@localhost}"
+UPS_BATTERY_KEY="${UPS_BATTERY_KEY:-battery.charge}"
+MIN_STARTUP_BATTERY="${MIN_STARTUP_BATTERY:-35}"
+REQUIRE_UPS_BATTERY="${REQUIRE_UPS_BATTERY:-0}"
 DRAIN_TIMEOUT_SECONDS=180
 EMERGENCY_DRAIN_TIMEOUT_SECONDS=45
 API_WAIT_TIMEOUT_SECONDS=600
@@ -101,6 +101,7 @@ RECOVERY_PENDING=0
 STARTUP_ATTEMPTED_DURING_OUTAGE=0
 LAST_CHECKPOINT="none"
 BUNDLE_SERVER_PID=""
+UPS_HOST_IN_USE=""

 while [[ $# -gt 0 ]]; do
   case "$1" in
@@ -273,6 +274,7 @@ load_recovery_state() {
 }

 save_recovery_state() {
+  [[ "${EXECUTE}" -eq 1 ]] || return 0
   mkdir -p "$(state_dir)"
   cat > "${RECOVERY_STATE_FILE}" </dev/null || true
   LAST_CHECKPOINT="none"
 }

+sanitize_battery_percent() {
+  local raw="$1"
+  raw="${raw##*:}"
+  raw="${raw//[[:space:]]/}"
+  raw="${raw%%.*}"
+  [[ "${raw}" =~ ^[0-9]+$ ]] || return 1
+  printf '%s' "${raw}"
+}
+
+candidate_ups_hosts() {
+  local candidate name
+  local -A seen=()
+  if [[ -n "${UPS_HOST}" ]]; then
+    seen["${UPS_HOST}"]=1
+    echo "${UPS_HOST}"
+  fi
+  while IFS= read -r name; do
+    [[ -n "${name}" ]] || continue
+    for candidate in "${name}@localhost" "${name}"; do
+      [[ -n "${seen[${candidate}]+x}" ]] && continue
+      seen["${candidate}"]=1
+      echo "${candidate}"
+    done
+  done < <(upsc -l 2>/dev/null || true)
+}
+
 read_ups_battery() {
   if ! command -v upsc >/dev/null 2>&1; then
     return 1
   fi
-  local raw
-  raw="$(upsc "${UPS_HOST}" "${UPS_BATTERY_KEY}" 2>/dev/null || true)"
-  [[ -n "${raw}" ]] || return 1
-  raw="${raw%%.*}"
-  [[ "${raw}" =~ ^[0-9]+$ ]] || return 1
-  printf '%s' "${raw}"
+  local host raw parsed
+  while IFS= read -r host; do
+    raw="$(upsc "${host}" "${UPS_BATTERY_KEY}" 2>/dev/null || true)"
+    [[ -n "${raw}" ]] || continue
+    parsed="$(sanitize_battery_percent "${raw}" || true)"
+    [[ -n "${parsed}" ]] || continue
+    UPS_HOST_IN_USE="${host}"
+    printf '%s' "${parsed}"
+    return 0
+  done < <(candidate_ups_hosts)
+  return 1
 }

 ensure_minimum_battery_for_bootstrap() {
@@ -314,7 +348,7 @@ ensure_minimum_battery_for_bootstrap() {
     warn "Unable to read UPS battery status; continuing without hard battery gating."
     return 0
   fi
-  log "ups-battery=${battery}%"
+  log "ups-battery=${battery}% host=${UPS_HOST_IN_USE:-${UPS_HOST}}"
   if (( battery < MIN_STARTUP_BATTERY )); then
     warn "UPS battery ${battery}% below minimum startup threshold ${MIN_STARTUP_BATTERY}%."
     return 1
@@ -492,6 +526,29 @@ check_harbor_endpoint() {
   esac
 }

+wait_for_pod_phase() {
+  local namespace="$1"
+  local pod="$2"
+  local expected_phase="$3"
+  local timeout_seconds="$4"
+  local start now phase
+  start="$(date +%s)"
+  while true; do
+    phase="$(kubectl -n "${namespace}" get pod "${pod}" -o jsonpath='{.status.phase}' 2>/dev/null || true)"
+    if [[ "${phase}" == "${expected_phase}" ]]; then
+      return 0
+    fi
+    if [[ "${phase}" == "Failed" ]]; then
+      return 1
+    fi
+    now="$(date +%s)"
+    if (( now - start >= timeout_seconds )); then
+      return 1
+    fi
+    sleep 2
+  done
+}
+
 harbor_is_ready() {
   kubectl -n harbor get deploy harbor-core harbor-jobservice harbor-portal harbor-registry >/dev/null 2>&1 || return 1
   local code
@@ -525,8 +582,12 @@ spec:
       imagePullPolicy: Always
       command: ["sh", "-ceu", "echo harbor-canary-ok"]
 CANARY
-  kubectl -n "${NODE_HELPER_NAMESPACE}" wait --for=condition=Ready "pod/${pod}" --timeout=180s >/dev/null 2>&1 || true
-  kubectl -n "${NODE_HELPER_NAMESPACE}" wait --for=jsonpath='{.status.phase}'=Succeeded "pod/${pod}" --timeout=180s
+  if ! wait_for_pod_phase "${NODE_HELPER_NAMESPACE}" "${pod}" Succeeded 180; then
+    kubectl -n "${NODE_HELPER_NAMESPACE}" describe pod "${pod}" >&2 || true
+    timeout 20 kubectl -n "${NODE_HELPER_NAMESPACE}" logs "${pod}" >&2 || true
+    timeout 20 kubectl -n "${NODE_HELPER_NAMESPACE}" delete pod "${pod}" --ignore-not-found --wait=false >/dev/null 2>&1 || true
+    return 1
+  fi
   timeout 20 kubectl -n "${NODE_HELPER_NAMESPACE}" logs "${pod}" || true
   timeout 20 kubectl -n "${NODE_HELPER_NAMESPACE}" delete pod "${pod}" --ignore-not-found --wait=false >/dev/null 2>&1 || true
 }
@@ -575,7 +636,7 @@ spec:
           /tmp/hecate-step.sh
 POD

-  if ! kubectl -n "${NODE_HELPER_NAMESPACE}" wait --for=jsonpath='{.status.phase}'=Succeeded "pod/${pod}" --timeout="${timeout_seconds}s"; then
+  if ! wait_for_pod_phase "${NODE_HELPER_NAMESPACE}" "${pod}" Succeeded "${timeout_seconds}"; then
     kubectl -n "${NODE_HELPER_NAMESPACE}" describe pod "${pod}" >&2 || true
     timeout 20 kubectl -n "${NODE_HELPER_NAMESPACE}" logs "${pod}" >&2 || true
     timeout 20 kubectl -n "${NODE_HELPER_NAMESPACE}" delete pod "${pod}" --ignore-not-found --wait=false >/dev/null 2>&1 || true
@@ -791,6 +852,7 @@ status_report() {
   echo "recovery_pending=${RECOVERY_PENDING}"
   echo "startup_attempted=${STARTUP_ATTEMPTED_DURING_OUTAGE}"
   echo "last_checkpoint=${LAST_CHECKPOINT}"
+  echo "ups_host=${UPS_HOST_IN_USE:-${UPS_HOST}}"
   echo "ups_battery=${battery:-unknown}"
   echo "flux_source_ready=${flux_ready:-unknown}"
   echo "harbor_http=${harbor_code:-unknown}"