hecate: harden titan-24 cleanup and ups telemetry

This commit is contained in:
Brad Stein 2026-04-06 04:47:05 -03:00
parent 31f5709929
commit 65de56b2ac
2 changed files with 78 additions and 14 deletions

View File

@ -10,3 +10,5 @@ NODE_HELPER_NAMESPACE="maintenance"
NODE_HELPER_SERVICE_ACCOUNT="default"
REGISTRY_PULL_SECRET="harbor-regcred"
BUNDLE_HTTP_PORT="8877"
# UPS identifier (name@host) and the UPS variable that holds the charge level.
# NOTE(review): presumably these feed the script's ${UPS_HOST:-...} and
# ${UPS_BATTERY_KEY:-...} defaults and end up as the arguments to `upsc`;
# confirm how this file is loaded.
UPS_HOST="pyrphoros@localhost"
UPS_BATTERY_KEY="battery.charge"

View File

@ -78,10 +78,10 @@ SKIP_LOCAL_BOOTSTRAP=0
SKIP_HARBOR_BOOTSTRAP=0
SKIP_HARBOR_SEED=0
SKIP_HELPER_PREWARM=0
# UPS telemetry settings. Each one keeps a value that is already set (and
# non-empty) when this point is reached and only falls back to the built-in
# default otherwise. They must not be preceded by an unconditional assignment
# of the same name, or the override can never take effect.
#   UPS_HOST             first UPS identifier handed to `upsc`
#   UPS_BATTERY_KEY      UPS variable queried for the charge level
#   MIN_STARTUP_BATTERY  charge (percent) below which bootstrap is refused
#   REQUIRE_UPS_BATTERY  0/1 switch (default 0)
UPS_HOST="${UPS_HOST:-ups@localhost}"
UPS_BATTERY_KEY="${UPS_BATTERY_KEY:-battery.charge}"
MIN_STARTUP_BATTERY="${MIN_STARTUP_BATTERY:-35}"
REQUIRE_UPS_BATTERY="${REQUIRE_UPS_BATTERY:-0}"
DRAIN_TIMEOUT_SECONDS=180
EMERGENCY_DRAIN_TIMEOUT_SECONDS=45
API_WAIT_TIMEOUT_SECONDS=600
@ -101,6 +101,7 @@ RECOVERY_PENDING=0
# Run-state globals, initialised to their "nothing recorded yet" values.
# The first two are echoed by status_report.
STARTUP_ATTEMPTED_DURING_OUTAGE=0
# Reset to "none" again by clear_recovery_state.
LAST_CHECKPOINT="none"
# NOTE(review): presumably the PID of a background bundle HTTP server; its
# use is outside this view, confirm.
BUNDLE_SERVER_PID=""
# UPS identifier that produced the last usable battery reading. Assigned by
# read_ups_battery in whichever shell runs that function; while empty,
# reporting falls back to UPS_HOST.
UPS_HOST_IN_USE=""
while [[ $# -gt 0 ]]; do
case "$1" in
@ -273,6 +274,7 @@ load_recovery_state() {
}
save_recovery_state() {
[[ "${EXECUTE}" -eq 1 ]] || return 0
mkdir -p "$(state_dir)"
cat > "${RECOVERY_STATE_FILE}" <<STATE
recovery_pending=${1}
@ -287,20 +289,52 @@ mark_checkpoint() {
}
# Forget any persisted recovery state.
# Globals:   EXECUTE (read), RECOVERY_STATE_FILE (read), LAST_CHECKPOINT (written)
# Returns:   always 0
clear_recovery_state() {
  # Outside execute mode nothing on disk or in memory is touched.
  if ! [[ "${EXECUTE}" -eq 1 ]]; then
    return 0
  fi

  # Best effort: a failed removal is deliberately ignored.
  if ! rm -f "${RECOVERY_STATE_FILE}" 2>/dev/null; then
    :
  fi

  LAST_CHECKPOINT="none"
}
# Reduce a raw battery reading to a plain non-negative integer.
# Arguments: $1 - raw text, e.g. "87", "87.5" or "battery.charge: 87"
# Outputs:   the integer part on stdout, without a trailing newline
# Returns:   0 on success, 1 when no integer can be extracted
sanitize_battery_percent() {
  local raw="${1-}"
  raw="${raw##*:}"           # drop everything up to the last colon ("key: value")
  raw="${raw//[[:space:]]/}" # remove all whitespace
  raw="${raw%%.*}"           # keep only the integer part
  [[ "${raw}" =~ ^[0-9]+$ ]] || return 1
  # Force base 10 so a reading such as "08" is emitted as "8". Left as-is it
  # would be rejected as invalid octal once the caller uses it in (( ... )).
  printf '%s' "$((10#${raw}))"
}
# List UPS identifiers worth probing, one per line, without duplicates.
# The configured UPS_HOST (when non-empty) comes first. Every name printed by
# `upsc -l` follows, first as "<name>@localhost", then as the bare name.
# Globals:   UPS_HOST (read)
# Outputs:   candidate identifiers on stdout
candidate_ups_hosts() {
  local ups_name probe
  local -A emitted=()

  if [[ -n "${UPS_HOST}" ]]; then
    emitted["${UPS_HOST}"]=1
    echo "${UPS_HOST}"
  fi

  while IFS= read -r ups_name; do
    if [[ -z "${ups_name}" ]]; then
      continue
    fi
    for probe in "${ups_name}@localhost" "${ups_name}"; do
      # Print each identifier only the first time it is seen.
      if [[ -z "${emitted[${probe}]+x}" ]]; then
        emitted["${probe}"]=1
        echo "${probe}"
      fi
    done
  done < <(upsc -l 2>/dev/null || true) # a failing or missing upsc yields no names
}
# Read the UPS charge level from the first candidate that answers.
# Candidates come from candidate_ups_hosts and are tried in order. The first
# one whose reading sanitize_battery_percent accepts wins.
# Globals:   UPS_BATTERY_KEY (read), UPS_HOST_IN_USE (written on success)
# Outputs:   the integer charge on stdout, exactly once, no trailing newline
# Returns:   0 when a reading was obtained; 1 when upsc is unavailable or no
#            candidate produced a usable value
# NOTE(review): if callers capture the value with $(read_ups_battery), the
# UPS_HOST_IN_USE assignment happens in that subshell and is not visible to
# the caller. Confirm against the call sites.
read_ups_battery() {
  if ! command -v upsc >/dev/null 2>&1; then
    return 1
  fi

  local host raw parsed
  while IFS= read -r host; do
    # Query failures are expected while probing; treat them as "no answer".
    raw="$(upsc "${host}" "${UPS_BATTERY_KEY}" 2>/dev/null || true)"
    [[ -n "${raw}" ]] || continue
    parsed="$(sanitize_battery_percent "${raw}" || true)"
    [[ -n "${parsed}" ]] || continue
    UPS_HOST_IN_USE="${host}"
    printf '%s' "${parsed}"
    return 0
  done < <(candidate_ups_hosts)

  return 1
}
ensure_minimum_battery_for_bootstrap() {
@ -314,7 +348,7 @@ ensure_minimum_battery_for_bootstrap() {
warn "Unable to read UPS battery status; continuing without hard battery gating."
return 0
fi
log "ups-battery=${battery}%"
log "ups-battery=${battery}% host=${UPS_HOST_IN_USE:-${UPS_HOST}}"
if (( battery < MIN_STARTUP_BATTERY )); then
warn "UPS battery ${battery}% below minimum startup threshold ${MIN_STARTUP_BATTERY}%."
return 1
@ -492,6 +526,29 @@ check_harbor_endpoint() {
esac
}
# Poll a pod until it reaches the wanted phase.
# Arguments: $1 - namespace
#            $2 - pod name
#            $3 - phase to wait for (compared verbatim with .status.phase)
#            $4 - overall deadline in seconds
# Returns:   0 once the pod reports the wanted phase;
#            1 if it reports "Failed" first or the deadline passes
wait_for_pod_phase() {
  local namespace="$1"
  local pod="$2"
  local expected_phase="$3"
  local timeout_seconds="$4"
  local start now phase
  start="$(date +%s)"
  while true; do
    # Bound every API call: the deadline below is only checked between polls,
    # so an unbounded request against a stalled API server would block past it.
    # A failed or timed-out query reads as an empty phase and is retried.
    phase="$(kubectl -n "${namespace}" --request-timeout=10s get pod "${pod}" -o jsonpath='{.status.phase}' 2>/dev/null || true)"
    if [[ "${phase}" == "${expected_phase}" ]]; then
      return 0
    fi
    if [[ "${phase}" == "Failed" ]]; then
      return 1
    fi
    now="$(date +%s)"
    if (( now - start >= timeout_seconds )); then
      return 1
    fi
    sleep 2
  done
}
harbor_is_ready() {
kubectl -n harbor get deploy harbor-core harbor-jobservice harbor-portal harbor-registry >/dev/null 2>&1 || return 1
local code
@ -525,8 +582,12 @@ spec:
imagePullPolicy: Always
command: ["sh", "-ceu", "echo harbor-canary-ok"]
CANARY
kubectl -n "${NODE_HELPER_NAMESPACE}" wait --for=condition=Ready "pod/${pod}" --timeout=180s >/dev/null 2>&1 || true
kubectl -n "${NODE_HELPER_NAMESPACE}" wait --for=jsonpath='{.status.phase}'=Succeeded "pod/${pod}" --timeout=180s
if ! wait_for_pod_phase "${NODE_HELPER_NAMESPACE}" "${pod}" Succeeded 180; then
kubectl -n "${NODE_HELPER_NAMESPACE}" describe pod "${pod}" >&2 || true
timeout 20 kubectl -n "${NODE_HELPER_NAMESPACE}" logs "${pod}" >&2 || true
timeout 20 kubectl -n "${NODE_HELPER_NAMESPACE}" delete pod "${pod}" --ignore-not-found --wait=false >/dev/null 2>&1 || true
return 1
fi
timeout 20 kubectl -n "${NODE_HELPER_NAMESPACE}" logs "${pod}" || true
timeout 20 kubectl -n "${NODE_HELPER_NAMESPACE}" delete pod "${pod}" --ignore-not-found --wait=false >/dev/null 2>&1 || true
}
@ -575,7 +636,7 @@ spec:
/tmp/hecate-step.sh
POD
if ! kubectl -n "${NODE_HELPER_NAMESPACE}" wait --for=jsonpath='{.status.phase}'=Succeeded "pod/${pod}" --timeout="${timeout_seconds}s"; then
if ! wait_for_pod_phase "${NODE_HELPER_NAMESPACE}" "${pod}" Succeeded "${timeout_seconds}"; then
kubectl -n "${NODE_HELPER_NAMESPACE}" describe pod "${pod}" >&2 || true
timeout 20 kubectl -n "${NODE_HELPER_NAMESPACE}" logs "${pod}" >&2 || true
timeout 20 kubectl -n "${NODE_HELPER_NAMESPACE}" delete pod "${pod}" --ignore-not-found --wait=false >/dev/null 2>&1 || true
@ -791,6 +852,7 @@ status_report() {
echo "recovery_pending=${RECOVERY_PENDING}"
echo "startup_attempted=${STARTUP_ATTEMPTED_DURING_OUTAGE}"
echo "last_checkpoint=${LAST_CHECKPOINT}"
echo "ups_host=${UPS_HOST_IN_USE:-${UPS_HOST}}"
echo "ups_battery=${battery:-unknown}"
echo "flux_source_ready=${flux_ready:-unknown}"
echo "harbor_http=${harbor_code:-unknown}"