harbor/recovery: remove fixed titan-05 pin and auto-select ready arm64 node

This commit is contained in:
Brad Stein 2026-04-06 21:27:23 -03:00
parent 5e387e8e4d
commit d168f02c7f
3 changed files with 134 additions and 36 deletions

View File

@ -1,11 +1,11 @@
CANONICAL_CONTROL_HOST="titan-db"
DEFAULT_FLUX_BRANCH="main"
STATE_SUBDIR=".local/share/hecate"
STATE_SUBDIR=".local/share/ananke"
HARBOR_BUNDLE_BASENAME="harbor-bootstrap-v2.14.1-arm64.tar.zst"
HARBOR_TARGET_NODE="titan-05"
HARBOR_CANARY_NODE="titan-04"
HARBOR_TARGET_NODE=""
HARBOR_CANARY_NODE=""
HARBOR_CANARY_IMAGE="registry.bstein.dev/bstein/kubectl:1.35.0"
NODE_HELPER_IMAGE="registry.bstein.dev/bstein/hecate-node-helper:0.1.0"
NODE_HELPER_IMAGE="registry.bstein.dev/bstein/ananke-node-helper:0.1.0"
NODE_HELPER_NAMESPACE="maintenance"
NODE_HELPER_SERVICE_ACCOUNT="default"
REGISTRY_PULL_SECRET="harbor-regcred"

View File

@ -2,7 +2,7 @@
set -euo pipefail
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
REPO_DIR="${HECATE_REPO_DIR:-$(cd "${SCRIPT_DIR}/.." && pwd)}"
REPO_DIR="${ANANKE_REPO_DIR:-$(cd "${SCRIPT_DIR}/.." && pwd)}"
BOOTSTRAP_DIR="${SCRIPT_DIR}/bootstrap"
CONFIG_FILE="${BOOTSTRAP_DIR}/recovery-config.env"
if [[ -f "${CONFIG_FILE}" ]]; then
@ -33,9 +33,10 @@ Options:
--ups-battery-key <key> UPS battery key for upsc (default: battery.charge)
--recovery-state-file <path> Recovery state file for outage-aware restart logic
--harbor-bundle-file <path> Harbor bootstrap bundle on the control host
--harbor-target-node <name> Node that should host Harbor during bootstrap (default: ${HARBOR_TARGET_NODE:-titan-05})
--harbor-target-node <name> Node that should host Harbor during bootstrap (default: auto)
--harbor-canary-node <name> Node used for Harbor pull canary (default: auto)
--harbor-canary-image <image> Harbor-backed image used for pull canary (default: ${HARBOR_CANARY_IMAGE:-registry.bstein.dev/bstein/kubectl:1.35.0})
--node-helper-image <image> Privileged helper image used for host operations (default: ${NODE_HELPER_IMAGE:-registry.bstein.dev/bstein/hecate-node-helper:0.1.0})
--node-helper-image <image> Privileged helper image used for host operations (default: ${NODE_HELPER_IMAGE:-registry.bstein.dev/bstein/ananke-node-helper:0.1.0})
--bundle-http-port <port> Temporary HTTP port used to serve bootstrap bundles (default: ${BUNDLE_HTTP_PORT:-8877})
--api-wait-timeout <seconds> Startup: Kubernetes API wait timeout (default: 600)
--drain-timeout <seconds> Worker drain timeout for normal shutdown (default: 180)
@ -86,16 +87,18 @@ DRAIN_TIMEOUT_SECONDS=180
EMERGENCY_DRAIN_TIMEOUT_SECONDS=45
API_WAIT_TIMEOUT_SECONDS=600
BUNDLE_HTTP_PORT="${BUNDLE_HTTP_PORT:-8877}"
STATE_ROOT="${HOME}/${STATE_SUBDIR:-.local/share/hecate}"
STATE_ROOT="${HOME}/${STATE_SUBDIR:-.local/share/ananke}"
RECOVERY_STATE_FILE="${STATE_ROOT}/cluster_power_recovery.state"
HARBOR_BUNDLE_FILE="${STATE_ROOT}/bundles/${HARBOR_BUNDLE_BASENAME:-harbor-bootstrap-v2.14.1-arm64.tar.zst}"
HARBOR_TARGET_NODE="${HARBOR_TARGET_NODE:-titan-05}"
HARBOR_CANARY_NODE="${HARBOR_CANARY_NODE:-titan-04}"
HARBOR_TARGET_NODE="${HARBOR_TARGET_NODE:-}"
HARBOR_CANARY_NODE="${HARBOR_CANARY_NODE:-}"
HARBOR_CANARY_IMAGE="${HARBOR_CANARY_IMAGE:-registry.bstein.dev/bstein/kubectl:1.35.0}"
NODE_HELPER_IMAGE="${NODE_HELPER_IMAGE:-registry.bstein.dev/bstein/hecate-node-helper:0.1.0}"
NODE_HELPER_IMAGE="${NODE_HELPER_IMAGE:-registry.bstein.dev/bstein/ananke-node-helper:0.1.0}"
NODE_HELPER_NAMESPACE="${NODE_HELPER_NAMESPACE:-maintenance}"
NODE_HELPER_SERVICE_ACCOUNT="${NODE_HELPER_SERVICE_ACCOUNT:-default}"
NODE_HELPER_PREWARM_DS="${NODE_HELPER_PREWARM_DS:-ananke-node-helper-prewarm}"
REGISTRY_PULL_SECRET="${REGISTRY_PULL_SECRET:-harbor-regcred}"
KEEP_PREWARM_DAEMONSET=0
RECOVERY_PENDING=0
STARTUP_ATTEMPTED_DURING_OUTAGE=0
@ -169,6 +172,10 @@ while [[ $# -gt 0 ]]; do
HARBOR_TARGET_NODE="${2:?missing harbor target node}"
shift 2
;;
--harbor-canary-node)
HARBOR_CANARY_NODE="${2:?missing harbor canary node}"
shift 2
;;
--harbor-canary-image)
HARBOR_CANARY_IMAGE="${2:?missing canary image}"
shift 2
@ -432,6 +439,51 @@ discover_workers_csv() {
| paste -sd, -
}
node_is_ready() {
  # True iff the named node exists and its Ready condition reports "True".
  # Returns 1 for an empty name without ever calling kubectl.
  local candidate="$1"
  if [[ -z "${candidate}" ]]; then
    return 1
  fi
  local condition_status
  condition_status="$(kubectl get node "${candidate}" -o jsonpath='{range .status.conditions[?(@.type=="Ready")]}{.status}{end}' 2>/dev/null || true)"
  [[ "${condition_status}" == "True" ]]
}
select_ready_arm64_worker() {
  # Pick a Ready arm64 worker node for Harbor bootstrap, preferring the
  # hardware=rpi5 label, then hardware=rpi4, then any Ready arm64 worker.
  # Prints the chosen node name on stdout; returns 1 when none qualifies.
  # The three preference tiers were previously three copy-pasted awk
  # pipelines; they are folded into one loop (empty tier = any hardware).
  local rows node tier
  rows="$(kubectl get nodes -o 'custom-columns=NAME:.metadata.name,ARCH:.metadata.labels.kubernetes\.io/arch,WORKER:.metadata.labels.node-role\.kubernetes\.io/worker,HARDWARE:.metadata.labels.hardware,READY:.status.conditions[?(@.type=="Ready")].status' --no-headers 2>/dev/null || true)"
  [[ -n "${rows}" ]] || return 1
  for tier in rpi5 rpi4 ''; do
    if [[ -n "${tier}" ]]; then
      node="$(printf '%s\n' "${rows}" | awk -v hw="${tier}" '$2=="arm64" && $3=="true" && $4==hw && $5=="True" {print $1; exit}')"
    else
      # Last resort: ignore the hardware label entirely.
      node="$(printf '%s\n' "${rows}" | awk '$2=="arm64" && $3=="true" && $5=="True" {print $1; exit}')"
    fi
    if [[ -n "${node}" ]]; then
      printf '%s' "${node}"
      return 0
    fi
  done
  return 1
}
ensure_harbor_target_node() {
  # Guarantee that HARBOR_TARGET_NODE names a Ready node. When the
  # configured node is unset or not Ready, auto-select a Ready arm64
  # worker; die when no candidate exists.
  node_is_ready "${HARBOR_TARGET_NODE}" && return 0
  local replacement
  replacement="$(select_ready_arm64_worker || true)"
  if [[ -z "${replacement}" ]]; then
    die "No Ready arm64 worker available for Harbor bootstrap target."
  fi
  if [[ -z "${HARBOR_TARGET_NODE}" ]]; then
    log "harbor-target-node auto-selected: ${replacement}"
  else
    warn "Configured harbor target node '${HARBOR_TARGET_NODE}' is not Ready; using '${replacement}' instead."
  fi
  HARBOR_TARGET_NODE="${replacement}"
}
as_array_from_csv() {
local csv="$1"
local out_var="$2"
@ -557,9 +609,18 @@ harbor_is_ready() {
}
run_harbor_pull_canary() {
local pod="hecate-harbor-canary"
local pod="ananke-harbor-canary"
local canary_node="${HARBOR_CANARY_NODE}"
if ! node_is_ready "${canary_node}"; then
ensure_harbor_target_node
canary_node="${HARBOR_TARGET_NODE}"
if [[ -n "${HARBOR_CANARY_NODE}" ]]; then
warn "Configured harbor canary node '${HARBOR_CANARY_NODE}' is not Ready; using '${canary_node}'."
fi
HARBOR_CANARY_NODE="${canary_node}"
fi
if [[ "${EXECUTE}" -eq 0 ]]; then
log "DRY-RUN: create Harbor pull canary pod with ${HARBOR_CANARY_IMAGE} on ${HARBOR_CANARY_NODE}"
log "DRY-RUN: create Harbor pull canary pod with ${HARBOR_CANARY_IMAGE} on ${canary_node}"
return 0
fi
timeout 20 kubectl -n "${NODE_HELPER_NAMESPACE}" delete pod "${pod}" --ignore-not-found --wait=false >/dev/null 2>&1 || true
@ -570,7 +631,7 @@ metadata:
name: ${pod}
namespace: ${NODE_HELPER_NAMESPACE}
spec:
nodeName: ${HARBOR_CANARY_NODE}
nodeName: ${canary_node}
restartPolicy: Never
imagePullSecrets:
- name: ${REGISTRY_PULL_SECRET}
@ -597,7 +658,7 @@ run_helper_pod() {
local purpose="$2"
local timeout_seconds="$3"
local script_content="$4"
local pod="hecate-$(sanitize_name "${purpose}")-$(date +%H%M%S)"
local pod="ananke-$(sanitize_name "${purpose}")-$(date +%H%M%S)"
local encoded_script
encoded_script="$(printf '%s' "${script_content}" | base64 -w0)"
@ -631,9 +692,9 @@ spec:
command: ["/bin/bash", "-ceu"]
args:
- |
printf '%s' '${encoded_script}' | base64 -d >/tmp/hecate-step.sh
chmod +x /tmp/hecate-step.sh
/tmp/hecate-step.sh
printf '%s' '${encoded_script}' | base64 -d >/tmp/ananke-step.sh
chmod +x /tmp/ananke-step.sh
/tmp/ananke-step.sh
POD
if ! wait_for_pod_phase "${NODE_HELPER_NAMESPACE}" "${pod}" Succeeded "${timeout_seconds}"; then
@ -663,17 +724,36 @@ SCRIPT
run_helper_pod "${node}" "${purpose}" "${timeout_seconds}" "${script_content}"
}
# Run a host-level command on a node by exec'ing into the prewarm
# DaemonSet pod already scheduled there, then nsenter'ing into PID 1's
# namespaces. Returns 1 when no prewarm pod exists on that node so the
# caller can fall back to launching a one-off helper pod.
# Args: $1 = node name, $2 = shell command to execute on the host.
run_host_command_via_prewarm_pod() {
local node="$1"
local host_command="$2"
local pod encoded_command
# Locate the prewarm pod pinned to this node; empty result means none.
pod="$(kubectl -n "${NODE_HELPER_NAMESPACE}" get pods -l app="${NODE_HELPER_PREWARM_DS}" --field-selector "spec.nodeName=${node}" -o jsonpath='{.items[0].metadata.name}' 2>/dev/null || true)"
if [[ -z "${pod}" ]]; then
return 1
fi
# Base64-encode the command so its quoting survives the kubectl exec boundary.
encoded_command="$(printf '%s' "${host_command}" | base64 -w0)"
if [[ "${EXECUTE}" -eq 0 ]]; then
log "DRY-RUN: helper exec via ${pod} on ${node}"
return 0
fi
# Decode inside the pod, then nsenter targeting PID 1 (mount/uts/ipc/net/pid
# namespaces) so the command runs as if invoked directly on the host.
run kubectl -n "${NODE_HELPER_NAMESPACE}" exec "${pod}" -- /bin/bash -ceu "HOST_COMMAND=\$(printf '%s' '${encoded_command}' | base64 -d); nsenter --target 1 --mount --uts --ipc --net --pid /bin/sh -ceu \"\${HOST_COMMAND}\""
}
schedule_host_shutdown_via_helper() {
  # Schedule a delayed "stop service then poweroff" on a node's host OS
  # via systemd-run. Prefers exec through the existing prewarm DaemonSet
  # pod; falls back to a one-off helper pod when that path is unavailable.
  local target_node="$1"
  local unit_name="$2"
  local delay="$3"
  local cmd
  cmd="/usr/bin/systemd-run --unit ananke-shutdown-${unit_name} --on-active=${delay}s /bin/sh -lc '/usr/bin/systemctl stop ${unit_name} || true; /usr/bin/systemctl poweroff || true'"
  if run_host_command_via_prewarm_pod "${target_node}" "${cmd}"; then
    return 0
  fi
  run_host_command_via_helper "${target_node}" "shutdown-${target_node}-${unit_name}" 120 "${cmd}"
}
prewarm_node_helper_image() {
local name="hecate-node-helper-prewarm"
local name="${NODE_HELPER_PREWARM_DS}"
if [[ "${EXECUTE}" -eq 0 ]]; then
log "DRY-RUN: prewarm ${NODE_HELPER_IMAGE} via temporary DaemonSet"
return 0
@ -711,7 +791,11 @@ DS
[[ -n "${ready}" ]] || ready=0
if [[ "${desired}" != "0" && "${desired}" == "${ready}" ]]; then
log "node-helper-prewarm=${ready}/${desired}"
kubectl -n "${NODE_HELPER_NAMESPACE}" delete ds "${name}" --ignore-not-found >/dev/null 2>&1 || true
if [[ "${KEEP_PREWARM_DAEMONSET}" -eq 0 ]]; then
kubectl -n "${NODE_HELPER_NAMESPACE}" delete ds "${name}" --ignore-not-found >/dev/null 2>&1 || true
else
log "Keeping ${name} DaemonSet running for shutdown helper exec path."
fi
return 0
fi
sleep 2
@ -722,6 +806,14 @@ DS
die "Timed out prewarming node helper image ${NODE_HELPER_IMAGE}"
}
cleanup_prewarm_daemonset() {
  # Delete the helper-image prewarm DaemonSet; in dry-run mode only log
  # the action. Deletion errors are deliberately ignored (best-effort).
  if [[ "${EXECUTE}" -ne 0 ]]; then
    kubectl -n "${NODE_HELPER_NAMESPACE}" delete ds "${NODE_HELPER_PREWARM_DS}" --ignore-not-found >/dev/null 2>&1 || true
    return 0
  fi
  log "DRY-RUN: cleanup ${NODE_HELPER_PREWARM_DS} DaemonSet"
  return 0
}
start_bundle_server() {
[[ -f "${HARBOR_BUNDLE_FILE}" ]] || die "Harbor bundle not found at ${HARBOR_BUNDLE_FILE}"
require_cmd python3
@ -732,7 +824,7 @@ start_bundle_server() {
log "DRY-RUN: serve ${bundle_name} from ${bundle_dir} on port ${BUNDLE_HTTP_PORT}"
return 0
fi
python3 -m http.server "${BUNDLE_HTTP_PORT}" --bind 0.0.0.0 --directory "${bundle_dir}" </dev/null >/tmp/hecate-bundle-server.log 2>&1 &
python3 -m http.server "${BUNDLE_HTTP_PORT}" --bind 0.0.0.0 --directory "${bundle_dir}" </dev/null >/tmp/ananke-bundle-server.log 2>&1 &
BUNDLE_SERVER_PID=$!
for _ in $(seq 1 20); do
if curl -fsS "http://127.0.0.1:${BUNDLE_HTTP_PORT}/${bundle_name}" >/dev/null 2>&1; then
@ -740,7 +832,7 @@ start_bundle_server() {
fi
sleep 1
done
die "Temporary bundle server did not become ready; see /tmp/hecate-bundle-server.log"
die "Temporary bundle server did not become ready; see /tmp/ananke-bundle-server.log"
}
stop_bundle_server() {
@ -762,6 +854,7 @@ control_host_ip() {
seed_harbor_images() {
local images_text control_ip bundle_name script_content seed_rc=0
[[ -f "${HARBOR_BUNDLE_FILE}" ]] || die "Harbor bundle not found at ${HARBOR_BUNDLE_FILE}"
ensure_harbor_target_node
images_text="$(sed '/^[[:space:]]*#/d;/^[[:space:]]*$/d' "${BOOTSTRAP_DIR}/harbor-bootstrap-images.txt")"
[[ -n "${images_text}" ]] || die "No Harbor images listed in ${BOOTSTRAP_DIR}/harbor-bootstrap-images.txt"
bundle_name="$(basename "${HARBOR_BUNDLE_FILE}")"
@ -839,15 +932,25 @@ resume_flux_and_reconcile() {
status_report() {
local battery flux_ready harbor_code workers
local effective_target effective_canary
battery="$(read_ups_battery || true)"
flux_ready="$(kubectl -n flux-system get gitrepository flux-system -o jsonpath='{.status.conditions[?(@.type=="Ready")].status}' 2>/dev/null || true)"
harbor_code="$(curl -ksS -o /dev/null -w '%{http_code}' https://registry.bstein.dev/v2/ || true)"
workers="$(discover_workers_csv 2>/dev/null || true)"
effective_target="${HARBOR_TARGET_NODE}"
if ! node_is_ready "${effective_target}"; then
effective_target="$(select_ready_arm64_worker || true)"
fi
effective_canary="${HARBOR_CANARY_NODE}"
if ! node_is_ready "${effective_canary}"; then
effective_canary="${effective_target}"
fi
echo "mode=status"
echo "bundle_file=${HARBOR_BUNDLE_FILE}"
echo "bundle_present=$([[ -f "${HARBOR_BUNDLE_FILE}" ]] && echo true || echo false)"
echo "node_helper_image=${NODE_HELPER_IMAGE}"
echo "harbor_target_node=${HARBOR_TARGET_NODE}"
echo "harbor_target_node=${effective_target:-unknown}"
echo "harbor_canary_node=${effective_canary:-unknown}"
echo "workers=${workers}"
echo "recovery_pending=${RECOVERY_PENDING}"
echo "startup_attempted=${STARTUP_ATTEMPTED_DURING_OUTAGE}"
@ -876,6 +979,7 @@ planned_shutdown() {
save_recovery_state 1 0 shutdown_started
if [[ "${SKIP_HELPER_PREWARM}" -eq 0 ]]; then
KEEP_PREWARM_DAEMONSET=1
prewarm_node_helper_image
mark_checkpoint shutdown_helper_prewarmed
fi
@ -911,6 +1015,9 @@ planned_shutdown() {
[[ -z "${node}" ]] && continue
schedule_host_shutdown_via_helper "${node}" k3s 45
done
if [[ "${SKIP_HELPER_PREWARM}" -eq 0 ]]; then
cleanup_prewarm_daemonset
fi
mark_checkpoint shutdown_control_planes_scheduled
log "Shutdown actions scheduled on hosts."
}
@ -1022,6 +1129,8 @@ log "mode=${MODE} execute=${EXECUTE}"
log "recovery-state-file=${RECOVERY_STATE_FILE}"
log "bundle-file=${HARBOR_BUNDLE_FILE}"
log "node-helper-image=${NODE_HELPER_IMAGE}"
log "harbor-target-node-config=${HARBOR_TARGET_NODE:-auto}"
log "harbor-canary-node-config=${HARBOR_CANARY_NODE:-auto}"
report_flux_source_state
case "${MODE}" in

View File

@ -41,6 +41,7 @@ spec:
ingress:
className: traefik
annotations:
cert-manager.io/cluster-issuer: letsencrypt
traefik.ingress.kubernetes.io/router.entrypoints: websecure
traefik.ingress.kubernetes.io/router.tls: "true"
hosts:
@ -77,8 +78,6 @@ spec:
image:
repository: registry.bstein.dev/infra/harbor-redis
tag: v2.14.1-arm64 # {"$imagepolicy": "harbor:harbor-redis:tag"}
nodeSelector:
kubernetes.io/hostname: titan-05
affinity:
nodeAffinity:
requiredDuringSchedulingIgnoredDuringExecution:
@ -113,8 +112,6 @@ spec:
image:
repository: registry.bstein.dev/infra/harbor-core
tag: v2.14.1-arm64 # {"$imagepolicy": "harbor:harbor-core:tag"}
nodeSelector:
kubernetes.io/hostname: titan-05
serviceAccountName: harbor-vault-sync
automountServiceAccountToken: true
existingSecret: harbor-core
@ -174,8 +171,6 @@ spec:
image:
repository: registry.bstein.dev/infra/harbor-jobservice
tag: v2.14.1-arm64 # {"$imagepolicy": "harbor:harbor-jobservice:tag"}
nodeSelector:
kubernetes.io/hostname: titan-05
serviceAccountName: harbor-vault-sync
automountServiceAccountToken: true
existingSecret: harbor-jobservice
@ -216,8 +211,6 @@ spec:
image:
repository: registry.bstein.dev/infra/harbor-portal
tag: v2.14.1-arm64 # {"$imagepolicy": "harbor:harbor-portal:tag"}
nodeSelector:
kubernetes.io/hostname: titan-05
affinity:
nodeAffinity:
requiredDuringSchedulingIgnoredDuringExecution:
@ -294,8 +287,6 @@ spec:
{{- with secret "kv/data/atlas/harbor/harbor-registry-htpasswd" -}}
{{ .Data.data.REGISTRY_HTPASSWD }}
{{- end }}
nodeSelector:
kubernetes.io/hostname: titan-05
affinity:
nodeAffinity:
requiredDuringSchedulingIgnoredDuringExecution:
@ -321,8 +312,6 @@ spec:
image:
repository: registry.bstein.dev/infra/harbor-nginx
tag: v2.14.1-arm64 # {"$imagepolicy": "harbor:harbor-nginx:tag"}
nodeSelector:
kubernetes.io/hostname: titan-05
affinity:
nodeAffinity:
requiredDuringSchedulingIgnoredDuringExecution: