diff --git a/scripts/bootstrap/recovery-config.env b/scripts/bootstrap/recovery-config.env index c2f789d9..91227fbf 100644 --- a/scripts/bootstrap/recovery-config.env +++ b/scripts/bootstrap/recovery-config.env @@ -1,11 +1,11 @@ CANONICAL_CONTROL_HOST="titan-db" DEFAULT_FLUX_BRANCH="main" -STATE_SUBDIR=".local/share/hecate" +STATE_SUBDIR=".local/share/ananke" HARBOR_BUNDLE_BASENAME="harbor-bootstrap-v2.14.1-arm64.tar.zst" -HARBOR_TARGET_NODE="titan-05" -HARBOR_CANARY_NODE="titan-04" +HARBOR_TARGET_NODE="" +HARBOR_CANARY_NODE="" HARBOR_CANARY_IMAGE="registry.bstein.dev/bstein/kubectl:1.35.0" -NODE_HELPER_IMAGE="registry.bstein.dev/bstein/hecate-node-helper:0.1.0" +NODE_HELPER_IMAGE="registry.bstein.dev/bstein/ananke-node-helper:0.1.0" NODE_HELPER_NAMESPACE="maintenance" NODE_HELPER_SERVICE_ACCOUNT="default" REGISTRY_PULL_SECRET="harbor-regcred" diff --git a/scripts/cluster_power_recovery.sh b/scripts/cluster_power_recovery.sh index 9efafff1..62be81e9 100755 --- a/scripts/cluster_power_recovery.sh +++ b/scripts/cluster_power_recovery.sh @@ -2,7 +2,7 @@ set -euo pipefail SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" -REPO_DIR="${HECATE_REPO_DIR:-$(cd "${SCRIPT_DIR}/.." && pwd)}" +REPO_DIR="${ANANKE_REPO_DIR:-$(cd "${SCRIPT_DIR}/.." 
&& pwd)}" BOOTSTRAP_DIR="${SCRIPT_DIR}/bootstrap" CONFIG_FILE="${BOOTSTRAP_DIR}/recovery-config.env" if [[ -f "${CONFIG_FILE}" ]]; then @@ -33,9 +33,10 @@ Options: --ups-battery-key UPS battery key for upsc (default: battery.charge) --recovery-state-file Recovery state file for outage-aware restart logic --harbor-bundle-file Harbor bootstrap bundle on the control host - --harbor-target-node Node that should host Harbor during bootstrap (default: ${HARBOR_TARGET_NODE:-titan-05}) + --harbor-target-node Node that should host Harbor during bootstrap (default: auto) + --harbor-canary-node Node used for Harbor pull canary (default: auto) --harbor-canary-image Harbor-backed image used for pull canary (default: ${HARBOR_CANARY_IMAGE:-registry.bstein.dev/bstein/kubectl:1.35.0}) - --node-helper-image Privileged helper image used for host operations (default: ${NODE_HELPER_IMAGE:-registry.bstein.dev/bstein/hecate-node-helper:0.1.0}) + --node-helper-image Privileged helper image used for host operations (default: ${NODE_HELPER_IMAGE:-registry.bstein.dev/bstein/ananke-node-helper:0.1.0}) --bundle-http-port Temporary HTTP port used to serve bootstrap bundles (default: ${BUNDLE_HTTP_PORT:-8877}) --api-wait-timeout Startup: Kubernetes API wait timeout (default: 600) --drain-timeout Worker drain timeout for normal shutdown (default: 180) @@ -86,16 +87,18 @@ DRAIN_TIMEOUT_SECONDS=180 EMERGENCY_DRAIN_TIMEOUT_SECONDS=45 API_WAIT_TIMEOUT_SECONDS=600 BUNDLE_HTTP_PORT="${BUNDLE_HTTP_PORT:-8877}" -STATE_ROOT="${HOME}/${STATE_SUBDIR:-.local/share/hecate}" +STATE_ROOT="${HOME}/${STATE_SUBDIR:-.local/share/ananke}" RECOVERY_STATE_FILE="${STATE_ROOT}/cluster_power_recovery.state" HARBOR_BUNDLE_FILE="${STATE_ROOT}/bundles/${HARBOR_BUNDLE_BASENAME:-harbor-bootstrap-v2.14.1-arm64.tar.zst}" -HARBOR_TARGET_NODE="${HARBOR_TARGET_NODE:-titan-05}" -HARBOR_CANARY_NODE="${HARBOR_CANARY_NODE:-titan-04}" +HARBOR_TARGET_NODE="${HARBOR_TARGET_NODE:-}" +HARBOR_CANARY_NODE="${HARBOR_CANARY_NODE:-}" 
HARBOR_CANARY_IMAGE="${HARBOR_CANARY_IMAGE:-registry.bstein.dev/bstein/kubectl:1.35.0}" -NODE_HELPER_IMAGE="${NODE_HELPER_IMAGE:-registry.bstein.dev/bstein/hecate-node-helper:0.1.0}" +NODE_HELPER_IMAGE="${NODE_HELPER_IMAGE:-registry.bstein.dev/bstein/ananke-node-helper:0.1.0}" NODE_HELPER_NAMESPACE="${NODE_HELPER_NAMESPACE:-maintenance}" NODE_HELPER_SERVICE_ACCOUNT="${NODE_HELPER_SERVICE_ACCOUNT:-default}" +NODE_HELPER_PREWARM_DS="${NODE_HELPER_PREWARM_DS:-ananke-node-helper-prewarm}" REGISTRY_PULL_SECRET="${REGISTRY_PULL_SECRET:-harbor-regcred}" +KEEP_PREWARM_DAEMONSET=0 RECOVERY_PENDING=0 STARTUP_ATTEMPTED_DURING_OUTAGE=0 @@ -169,6 +172,10 @@ while [[ $# -gt 0 ]]; do HARBOR_TARGET_NODE="${2:?missing harbor target node}" shift 2 ;; + --harbor-canary-node) + HARBOR_CANARY_NODE="${2:?missing harbor canary node}" + shift 2 + ;; --harbor-canary-image) HARBOR_CANARY_IMAGE="${2:?missing canary image}" shift 2 @@ -432,6 +439,51 @@ discover_workers_csv() { | paste -sd, - } +node_is_ready() { + local node="$1" + [[ -n "${node}" ]] || return 1 + local ready + ready="$(kubectl get node "${node}" -o jsonpath='{range .status.conditions[?(@.type=="Ready")]}{.status}{end}' 2>/dev/null || true)" + [[ "${ready}" == "True" ]] +} + +select_ready_arm64_worker() { + local rows node + rows="$(kubectl get nodes -o 'custom-columns=NAME:.metadata.name,ARCH:.metadata.labels.kubernetes\.io/arch,WORKER:.metadata.labels.node-role\.kubernetes\.io/worker,HARDWARE:.metadata.labels.hardware,READY:.status.conditions[?(@.type=="Ready")].status' --no-headers 2>/dev/null || true)" + [[ -n "${rows}" ]] || return 1 + node="$(printf '%s\n' "${rows}" | awk '$2=="arm64" && $3=="true" && $4=="rpi5" && $5=="True" {print $1; exit}')" + if [[ -n "${node}" ]]; then + printf '%s' "${node}" + return 0 + fi + node="$(printf '%s\n' "${rows}" | awk '$2=="arm64" && $3=="true" && $4=="rpi4" && $5=="True" {print $1; exit}')" + if [[ -n "${node}" ]]; then + printf '%s' "${node}" + return 0 + fi + node="$(printf '%s\n' 
"${rows}" | awk '$2=="arm64" && $3=="true" && $5=="True" {print $1; exit}')" + if [[ -n "${node}" ]]; then + printf '%s' "${node}" + return 0 + fi + return 1 +} + +ensure_harbor_target_node() { + if node_is_ready "${HARBOR_TARGET_NODE}"; then + return 0 + fi + local fallback + fallback="$(select_ready_arm64_worker || true)" + [[ -n "${fallback}" ]] || die "No Ready arm64 worker available for Harbor bootstrap target." + if [[ -n "${HARBOR_TARGET_NODE}" ]]; then + warn "Configured harbor target node '${HARBOR_TARGET_NODE}' is not Ready; using '${fallback}' instead." + else + log "harbor-target-node auto-selected: ${fallback}" + fi + HARBOR_TARGET_NODE="${fallback}" +} + as_array_from_csv() { local csv="$1" local out_var="$2" @@ -557,9 +609,18 @@ harbor_is_ready() { } run_harbor_pull_canary() { - local pod="hecate-harbor-canary" + local pod="ananke-harbor-canary" + local canary_node="${HARBOR_CANARY_NODE}" + if ! node_is_ready "${canary_node}"; then + ensure_harbor_target_node + canary_node="${HARBOR_TARGET_NODE}" + if [[ -n "${HARBOR_CANARY_NODE}" ]]; then + warn "Configured harbor canary node '${HARBOR_CANARY_NODE}' is not Ready; using '${canary_node}'." 
+ fi + HARBOR_CANARY_NODE="${canary_node}" + fi if [[ "${EXECUTE}" -eq 0 ]]; then - log "DRY-RUN: create Harbor pull canary pod with ${HARBOR_CANARY_IMAGE} on ${HARBOR_CANARY_NODE}" + log "DRY-RUN: create Harbor pull canary pod with ${HARBOR_CANARY_IMAGE} on ${canary_node}" return 0 fi timeout 20 kubectl -n "${NODE_HELPER_NAMESPACE}" delete pod "${pod}" --ignore-not-found --wait=false >/dev/null 2>&1 || true @@ -570,7 +631,7 @@ metadata: name: ${pod} namespace: ${NODE_HELPER_NAMESPACE} spec: - nodeName: ${HARBOR_CANARY_NODE} + nodeName: ${canary_node} restartPolicy: Never imagePullSecrets: - name: ${REGISTRY_PULL_SECRET} @@ -597,7 +658,7 @@ run_helper_pod() { local purpose="$2" local timeout_seconds="$3" local script_content="$4" - local pod="hecate-$(sanitize_name "${purpose}")-$(date +%H%M%S)" + local pod="ananke-$(sanitize_name "${purpose}")-$(date +%H%M%S)" local encoded_script encoded_script="$(printf '%s' "${script_content}" | base64 -w0)" @@ -631,9 +692,9 @@ spec: command: ["/bin/bash", "-ceu"] args: - | - printf '%s' '${encoded_script}' | base64 -d >/tmp/hecate-step.sh - chmod +x /tmp/hecate-step.sh - /tmp/hecate-step.sh + printf '%s' '${encoded_script}' | base64 -d >/tmp/ananke-step.sh + chmod +x /tmp/ananke-step.sh + /tmp/ananke-step.sh POD if ! 
wait_for_pod_phase "${NODE_HELPER_NAMESPACE}" "${pod}" Succeeded "${timeout_seconds}"; then @@ -663,17 +724,36 @@ SCRIPT run_helper_pod "${node}" "${purpose}" "${timeout_seconds}" "${script_content}" } +run_host_command_via_prewarm_pod() { + local node="$1" + local host_command="$2" + local pod encoded_command + pod="$(kubectl -n "${NODE_HELPER_NAMESPACE}" get pods -l app="${NODE_HELPER_PREWARM_DS}" --field-selector "spec.nodeName=${node}" -o jsonpath='{.items[0].metadata.name}' 2>/dev/null || true)" + if [[ -z "${pod}" ]]; then + return 1 + fi + encoded_command="$(printf '%s' "${host_command}" | base64 -w0)" + if [[ "${EXECUTE}" -eq 0 ]]; then + log "DRY-RUN: helper exec via ${pod} on ${node}" + return 0 + fi + run kubectl -n "${NODE_HELPER_NAMESPACE}" exec "${pod}" -- /bin/bash -ceu "HOST_COMMAND=\$(printf '%s' '${encoded_command}' | base64 -d); nsenter --target 1 --mount --uts --ipc --net --pid /bin/sh -ceu \"\${HOST_COMMAND}\"" +} + schedule_host_shutdown_via_helper() { local node="$1" local service_name="$2" local delay_seconds="$3" local host_command - host_command="/usr/bin/systemd-run --unit hecate-shutdown-${service_name} --on-active=${delay_seconds}s /bin/sh -lc '/usr/bin/systemctl stop ${service_name} || true; /usr/bin/systemctl poweroff || true'" + host_command="/usr/bin/systemd-run --unit ananke-shutdown-${service_name} --on-active=${delay_seconds}s /bin/sh -lc '/usr/bin/systemctl stop ${service_name} || true; /usr/bin/systemctl poweroff || true'" + if run_host_command_via_prewarm_pod "${node}" "${host_command}"; then + return 0 + fi run_host_command_via_helper "${node}" "shutdown-${node}-${service_name}" 120 "${host_command}" } prewarm_node_helper_image() { - local name="hecate-node-helper-prewarm" + local name="${NODE_HELPER_PREWARM_DS}" if [[ "${EXECUTE}" -eq 0 ]]; then log "DRY-RUN: prewarm ${NODE_HELPER_IMAGE} via temporary DaemonSet" return 0 @@ -711,7 +791,11 @@ DS [[ -n "${ready}" ]] || ready=0 if [[ "${desired}" != "0" && "${desired}" == 
"${ready}" ]]; then log "node-helper-prewarm=${ready}/${desired}" - kubectl -n "${NODE_HELPER_NAMESPACE}" delete ds "${name}" --ignore-not-found >/dev/null 2>&1 || true + if [[ "${KEEP_PREWARM_DAEMONSET}" -eq 0 ]]; then + kubectl -n "${NODE_HELPER_NAMESPACE}" delete ds "${name}" --ignore-not-found >/dev/null 2>&1 || true + else + log "Keeping ${name} DaemonSet running for shutdown helper exec path." + fi return 0 fi sleep 2 @@ -722,6 +806,14 @@ DS die "Timed out prewarming node helper image ${NODE_HELPER_IMAGE}" } +cleanup_prewarm_daemonset() { + if [[ "${EXECUTE}" -eq 0 ]]; then + log "DRY-RUN: cleanup ${NODE_HELPER_PREWARM_DS} DaemonSet" + return 0 + fi + kubectl -n "${NODE_HELPER_NAMESPACE}" delete ds "${NODE_HELPER_PREWARM_DS}" --ignore-not-found >/dev/null 2>&1 || true +} + start_bundle_server() { [[ -f "${HARBOR_BUNDLE_FILE}" ]] || die "Harbor bundle not found at ${HARBOR_BUNDLE_FILE}" require_cmd python3 @@ -732,7 +824,7 @@ start_bundle_server() { log "DRY-RUN: serve ${bundle_name} from ${bundle_dir} on port ${BUNDLE_HTTP_PORT}" return 0 fi - python3 -m http.server "${BUNDLE_HTTP_PORT}" --bind 0.0.0.0 --directory "${bundle_dir}" >/tmp/hecate-bundle-server.log 2>&1 & + python3 -m http.server "${BUNDLE_HTTP_PORT}" --bind 0.0.0.0 --directory "${bundle_dir}" >/tmp/ananke-bundle-server.log 2>&1 & BUNDLE_SERVER_PID=$!
for _ in $(seq 1 20); do if curl -fsS "http://127.0.0.1:${BUNDLE_HTTP_PORT}/${bundle_name}" >/dev/null 2>&1; then @@ -740,7 +832,7 @@ start_bundle_server() { fi sleep 1 done - die "Temporary bundle server did not become ready; see /tmp/hecate-bundle-server.log" + die "Temporary bundle server did not become ready; see /tmp/ananke-bundle-server.log" } stop_bundle_server() { @@ -762,6 +854,7 @@ control_host_ip() { seed_harbor_images() { local images_text control_ip bundle_name script_content seed_rc=0 [[ -f "${HARBOR_BUNDLE_FILE}" ]] || die "Harbor bundle not found at ${HARBOR_BUNDLE_FILE}" + ensure_harbor_target_node images_text="$(sed '/^[[:space:]]*#/d;/^[[:space:]]*$/d' "${BOOTSTRAP_DIR}/harbor-bootstrap-images.txt")" [[ -n "${images_text}" ]] || die "No Harbor images listed in ${BOOTSTRAP_DIR}/harbor-bootstrap-images.txt" bundle_name="$(basename "${HARBOR_BUNDLE_FILE}")" @@ -839,15 +932,25 @@ resume_flux_and_reconcile() { status_report() { local battery flux_ready harbor_code workers + local effective_target effective_canary battery="$(read_ups_battery || true)" flux_ready="$(kubectl -n flux-system get gitrepository flux-system -o jsonpath='{.status.conditions[?(@.type=="Ready")].status}' 2>/dev/null || true)" harbor_code="$(curl -ksS -o /dev/null -w '%{http_code}' https://registry.bstein.dev/v2/ || true)" workers="$(discover_workers_csv 2>/dev/null || true)" + effective_target="${HARBOR_TARGET_NODE}" + if ! node_is_ready "${effective_target}"; then + effective_target="$(select_ready_arm64_worker || true)" + fi + effective_canary="${HARBOR_CANARY_NODE}" + if ! 
node_is_ready "${effective_canary}"; then + effective_canary="${effective_target}" + fi echo "mode=status" echo "bundle_file=${HARBOR_BUNDLE_FILE}" echo "bundle_present=$([[ -f "${HARBOR_BUNDLE_FILE}" ]] && echo true || echo false)" echo "node_helper_image=${NODE_HELPER_IMAGE}" - echo "harbor_target_node=${HARBOR_TARGET_NODE}" + echo "harbor_target_node=${effective_target:-unknown}" + echo "harbor_canary_node=${effective_canary:-unknown}" echo "workers=${workers}" echo "recovery_pending=${RECOVERY_PENDING}" echo "startup_attempted=${STARTUP_ATTEMPTED_DURING_OUTAGE}" @@ -876,6 +979,7 @@ planned_shutdown() { save_recovery_state 1 0 shutdown_started if [[ "${SKIP_HELPER_PREWARM}" -eq 0 ]]; then + KEEP_PREWARM_DAEMONSET=1 prewarm_node_helper_image mark_checkpoint shutdown_helper_prewarmed fi @@ -911,6 +1015,9 @@ planned_shutdown() { [[ -z "${node}" ]] && continue schedule_host_shutdown_via_helper "${node}" k3s 45 done + if [[ "${SKIP_HELPER_PREWARM}" -eq 0 ]]; then + cleanup_prewarm_daemonset + fi mark_checkpoint shutdown_control_planes_scheduled log "Shutdown actions scheduled on hosts." 
} @@ -1022,6 +1129,8 @@ log "mode=${MODE} execute=${EXECUTE}" log "recovery-state-file=${RECOVERY_STATE_FILE}" log "bundle-file=${HARBOR_BUNDLE_FILE}" log "node-helper-image=${NODE_HELPER_IMAGE}" +log "harbor-target-node-config=${HARBOR_TARGET_NODE:-auto}" +log "harbor-canary-node-config=${HARBOR_CANARY_NODE:-auto}" report_flux_source_state case "${MODE}" in diff --git a/services/harbor/helmrelease.yaml b/services/harbor/helmrelease.yaml index bec28eeb..371bd4fd 100644 --- a/services/harbor/helmrelease.yaml +++ b/services/harbor/helmrelease.yaml @@ -41,6 +41,7 @@ spec: ingress: className: traefik annotations: + cert-manager.io/cluster-issuer: letsencrypt traefik.ingress.kubernetes.io/router.entrypoints: websecure traefik.ingress.kubernetes.io/router.tls: "true" hosts: @@ -77,8 +78,6 @@ spec: image: repository: registry.bstein.dev/infra/harbor-redis tag: v2.14.1-arm64 # {"$imagepolicy": "harbor:harbor-redis:tag"} - nodeSelector: - kubernetes.io/hostname: titan-05 affinity: nodeAffinity: requiredDuringSchedulingIgnoredDuringExecution: @@ -113,8 +112,6 @@ spec: image: repository: registry.bstein.dev/infra/harbor-core tag: v2.14.1-arm64 # {"$imagepolicy": "harbor:harbor-core:tag"} - nodeSelector: - kubernetes.io/hostname: titan-05 serviceAccountName: harbor-vault-sync automountServiceAccountToken: true existingSecret: harbor-core @@ -174,8 +171,6 @@ spec: image: repository: registry.bstein.dev/infra/harbor-jobservice tag: v2.14.1-arm64 # {"$imagepolicy": "harbor:harbor-jobservice:tag"} - nodeSelector: - kubernetes.io/hostname: titan-05 serviceAccountName: harbor-vault-sync automountServiceAccountToken: true existingSecret: harbor-jobservice @@ -216,8 +211,6 @@ spec: image: repository: registry.bstein.dev/infra/harbor-portal tag: v2.14.1-arm64 # {"$imagepolicy": "harbor:harbor-portal:tag"} - nodeSelector: - kubernetes.io/hostname: titan-05 affinity: nodeAffinity: requiredDuringSchedulingIgnoredDuringExecution: @@ -294,8 +287,6 @@ spec: {{- with secret 
"kv/data/atlas/harbor/harbor-registry-htpasswd" -}} {{ .Data.data.REGISTRY_HTPASSWD }} {{- end }} - nodeSelector: - kubernetes.io/hostname: titan-05 affinity: nodeAffinity: requiredDuringSchedulingIgnoredDuringExecution: @@ -321,8 +312,6 @@ spec: image: repository: registry.bstein.dev/infra/harbor-nginx tag: v2.14.1-arm64 # {"$imagepolicy": "harbor:harbor-nginx:tag"} - nodeSelector: - kubernetes.io/hostname: titan-05 affinity: nodeAffinity: requiredDuringSchedulingIgnoredDuringExecution: