hecate: harden titan-24 cleanup and ups telemetry
This commit is contained in:
parent
31f5709929
commit
65de56b2ac
@ -10,3 +10,5 @@ NODE_HELPER_NAMESPACE="maintenance"
|
||||
NODE_HELPER_SERVICE_ACCOUNT="default"
|
||||
REGISTRY_PULL_SECRET="harbor-regcred"
|
||||
BUNDLE_HTTP_PORT="8877"
|
||||
UPS_HOST="pyrphoros@localhost"
|
||||
UPS_BATTERY_KEY="battery.charge"
|
||||
|
||||
@ -78,10 +78,10 @@ SKIP_LOCAL_BOOTSTRAP=0
|
||||
SKIP_HARBOR_BOOTSTRAP=0
|
||||
SKIP_HARBOR_SEED=0
|
||||
SKIP_HELPER_PREWARM=0
|
||||
# UPS / battery gating settings.
# Each value keeps whatever the environment already provides and only falls
# back to the built-in default when the variable is unset or empty. They must
# not be preceded by unconditional assignments of the same names, otherwise
# the environment override can never take effect.
UPS_HOST="${UPS_HOST:-ups@localhost}"
UPS_BATTERY_KEY="${UPS_BATTERY_KEY:-battery.charge}"
MIN_STARTUP_BATTERY="${MIN_STARTUP_BATTERY:-35}"
REQUIRE_UPS_BATTERY="${REQUIRE_UPS_BATTERY:-0}"
|
||||
DRAIN_TIMEOUT_SECONDS=180
|
||||
EMERGENCY_DRAIN_TIMEOUT_SECONDS=45
|
||||
API_WAIT_TIMEOUT_SECONDS=600
|
||||
@ -101,6 +101,7 @@ RECOVERY_PENDING=0
|
||||
STARTUP_ATTEMPTED_DURING_OUTAGE=0
|
||||
LAST_CHECKPOINT="none"
|
||||
BUNDLE_SERVER_PID=""
|
||||
UPS_HOST_IN_USE=""
|
||||
|
||||
while [[ $# -gt 0 ]]; do
|
||||
case "$1" in
|
||||
@ -273,6 +274,7 @@ load_recovery_state() {
|
||||
}
|
||||
|
||||
save_recovery_state() {
|
||||
[[ "${EXECUTE}" -eq 1 ]] || return 0
|
||||
mkdir -p "$(state_dir)"
|
||||
cat > "${RECOVERY_STATE_FILE}" <<STATE
|
||||
recovery_pending=${1}
|
||||
@ -287,20 +289,52 @@ mark_checkpoint() {
|
||||
}
|
||||
|
||||
#######################################
# Remove the persisted recovery state file and reset the in-memory checkpoint.
# Does nothing (and succeeds) unless EXECUTE equals 1.
# Globals:   EXECUTE (read), RECOVERY_STATE_FILE (read),
#            LAST_CHECKPOINT (written)
# Returns:   0 always; a failed removal is deliberately ignored.
#######################################
clear_recovery_state() {
  [[ "${EXECUTE:-0}" -eq 1 ]] || return 0
  # `--` keeps a path that starts with "-" from being parsed as rm options.
  # Best-effort: a missing or undeletable file must not abort the caller.
  rm -f -- "${RECOVERY_STATE_FILE}" 2>/dev/null || true
  LAST_CHECKPOINT="none"
}
|
||||
|
||||
#######################################
# Normalize a raw UPS battery reading to a whole-number percentage.
# Arguments: $1 - raw reading, e.g. "100", "87.5" or "battery.charge: 87"
# Outputs:   the integer part on stdout, without leading zeros and without
#            a trailing newline
# Returns:   0 on success, 1 when no unsigned integer can be extracted
#######################################
sanitize_battery_percent() {
  local raw="${1-}"
  raw="${raw##*:}"            # drop everything up to the last ':' ("key: value")
  raw="${raw//[[:space:]]/}"  # remove all whitespace
  raw="${raw%%.*}"            # truncate any fractional part
  [[ "${raw}" =~ ^[0-9]+$ ]] || return 1
  # Strip leading zeros: the result is compared inside (( )), where a value
  # such as "08" would be rejected as an invalid octal literal.
  raw="${raw#"${raw%%[!0]*}"}"
  printf '%s' "${raw:-0}"
}
|
||||
|
||||
#######################################
# List UPS identifiers worth probing, one per line, without duplicates.
# The configured UPS_HOST (when non-empty) comes first; then, for every
# non-empty line printed by `upsc -l`, both "<name>@localhost" and "<name>".
# Globals:   UPS_HOST (read)
# Outputs:   candidate identifiers on stdout, one per line
# Returns:   0; a missing or failing `upsc -l` simply yields no extra entries.
#######################################
candidate_ups_hosts() {
  local candidate name
  local -A seen=()

  if [[ -n "${UPS_HOST:-}" ]]; then
    seen["${UPS_HOST}"]=1
    # printf rather than echo so values such as "-n" are printed verbatim.
    printf '%s\n' "${UPS_HOST}"
  fi

  while IFS= read -r name; do
    [[ -n "${name}" ]] || continue
    for candidate in "${name}@localhost" "${name}"; do
      if [[ -n "${seen[${candidate}]+x}" ]]; then
        continue
      fi
      seen["${candidate}"]=1
      printf '%s\n' "${candidate}"
    done
  done < <(upsc -l 2>/dev/null || true)  # discovery is best-effort

  return 0
}
|
||||
|
||||
#######################################
# Read the battery charge from the first UPS candidate that returns a
# usable value.
# Candidates come from candidate_ups_hosts; each reply is normalized by
# sanitize_battery_percent, and hosts with an empty or unparsable reply
# are skipped.
# Globals:   UPS_BATTERY_KEY (read), UPS_HOST_IN_USE (written on success)
# Outputs:   the integer battery percentage on stdout, no trailing newline
# Returns:   0 when a value was printed; 1 when upsc is not available or no
#            candidate produced a usable reading
# NOTE(review): when this function is invoked inside $(...), the assignment
# to UPS_HOST_IN_USE happens in a subshell and is not visible to the caller —
# confirm how callers consume it.
#######################################
read_ups_battery() {
  command -v upsc >/dev/null 2>&1 || return 1

  local host raw parsed
  while IFS= read -r host; do
    # A failing probe is not fatal; move on to the next candidate.
    raw="$(upsc "${host}" "${UPS_BATTERY_KEY}" 2>/dev/null || true)"
    [[ -n "${raw}" ]] || continue
    parsed="$(sanitize_battery_percent "${raw}" || true)"
    [[ -n "${parsed}" ]] || continue
    UPS_HOST_IN_USE="${host}"
    printf '%s' "${parsed}"
    return 0
  done < <(candidate_ups_hosts)

  return 1
}
|
||||
|
||||
ensure_minimum_battery_for_bootstrap() {
|
||||
@ -314,7 +348,7 @@ ensure_minimum_battery_for_bootstrap() {
|
||||
warn "Unable to read UPS battery status; continuing without hard battery gating."
|
||||
return 0
|
||||
fi
|
||||
log "ups-battery=${battery}%"
|
||||
log "ups-battery=${battery}% host=${UPS_HOST_IN_USE:-${UPS_HOST}}"
|
||||
if (( battery < MIN_STARTUP_BATTERY )); then
|
||||
warn "UPS battery ${battery}% below minimum startup threshold ${MIN_STARTUP_BATTERY}%."
|
||||
return 1
|
||||
@ -492,6 +526,29 @@ check_harbor_endpoint() {
|
||||
esac
|
||||
}
|
||||
|
||||
#######################################
# Poll a pod until its .status.phase equals the expected phase.
# Arguments: $1 - namespace
#            $2 - pod name
#            $3 - expected phase (e.g. Succeeded, Running)
#            $4 - timeout in seconds
# Returns:   0 once the pod reports the expected phase;
#            1 when the pod reaches a different terminal phase
#            (Succeeded or Failed) or the timeout elapses first.
#######################################
wait_for_pod_phase() {
  local namespace="$1"
  local pod="$2"
  local expected_phase="$3"
  local timeout_seconds="$4"
  local start now phase

  start="$(date +%s)"
  while true; do
    # Lookup errors (pod not created yet, API unavailable) yield an empty
    # phase and are retried. --request-timeout bounds each query so one hung
    # request cannot block the loop far beyond timeout_seconds.
    phase="$(kubectl --request-timeout=10s -n "${namespace}" get pod "${pod}" \
      -o jsonpath='{.status.phase}' 2>/dev/null || true)"

    case "${phase}" in
      "${expected_phase}")
        return 0
        ;;
      Succeeded | Failed)
        # Terminal phases never change again, so waiting longer is pointless.
        return 1
        ;;
    esac

    now="$(date +%s)"
    if (( now - start >= timeout_seconds )); then
      return 1
    fi
    sleep 2
  done
}
|
||||
|
||||
harbor_is_ready() {
|
||||
kubectl -n harbor get deploy harbor-core harbor-jobservice harbor-portal harbor-registry >/dev/null 2>&1 || return 1
|
||||
local code
|
||||
@ -525,8 +582,12 @@ spec:
|
||||
imagePullPolicy: Always
|
||||
command: ["sh", "-ceu", "echo harbor-canary-ok"]
|
||||
CANARY
|
||||
kubectl -n "${NODE_HELPER_NAMESPACE}" wait --for=condition=Ready "pod/${pod}" --timeout=180s >/dev/null 2>&1 || true
|
||||
kubectl -n "${NODE_HELPER_NAMESPACE}" wait --for=jsonpath='{.status.phase}'=Succeeded "pod/${pod}" --timeout=180s
|
||||
if ! wait_for_pod_phase "${NODE_HELPER_NAMESPACE}" "${pod}" Succeeded 180; then
|
||||
kubectl -n "${NODE_HELPER_NAMESPACE}" describe pod "${pod}" >&2 || true
|
||||
timeout 20 kubectl -n "${NODE_HELPER_NAMESPACE}" logs "${pod}" >&2 || true
|
||||
timeout 20 kubectl -n "${NODE_HELPER_NAMESPACE}" delete pod "${pod}" --ignore-not-found --wait=false >/dev/null 2>&1 || true
|
||||
return 1
|
||||
fi
|
||||
timeout 20 kubectl -n "${NODE_HELPER_NAMESPACE}" logs "${pod}" || true
|
||||
timeout 20 kubectl -n "${NODE_HELPER_NAMESPACE}" delete pod "${pod}" --ignore-not-found --wait=false >/dev/null 2>&1 || true
|
||||
}
|
||||
@ -575,7 +636,7 @@ spec:
|
||||
/tmp/hecate-step.sh
|
||||
POD
|
||||
|
||||
if ! kubectl -n "${NODE_HELPER_NAMESPACE}" wait --for=jsonpath='{.status.phase}'=Succeeded "pod/${pod}" --timeout="${timeout_seconds}s"; then
|
||||
if ! wait_for_pod_phase "${NODE_HELPER_NAMESPACE}" "${pod}" Succeeded "${timeout_seconds}"; then
|
||||
kubectl -n "${NODE_HELPER_NAMESPACE}" describe pod "${pod}" >&2 || true
|
||||
timeout 20 kubectl -n "${NODE_HELPER_NAMESPACE}" logs "${pod}" >&2 || true
|
||||
timeout 20 kubectl -n "${NODE_HELPER_NAMESPACE}" delete pod "${pod}" --ignore-not-found --wait=false >/dev/null 2>&1 || true
|
||||
@ -791,6 +852,7 @@ status_report() {
|
||||
echo "recovery_pending=${RECOVERY_PENDING}"
|
||||
echo "startup_attempted=${STARTUP_ATTEMPTED_DURING_OUTAGE}"
|
||||
echo "last_checkpoint=${LAST_CHECKPOINT}"
|
||||
echo "ups_host=${UPS_HOST_IN_USE:-${UPS_HOST}}"
|
||||
echo "ups_battery=${battery:-unknown}"
|
||||
echo "flux_source_ready=${flux_ready:-unknown}"
|
||||
echo "harbor_http=${harbor_code:-unknown}"
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user