harbor/recovery: remove fixed titan-05 pin and auto-select ready arm64 node
This commit is contained in:
parent
5e387e8e4d
commit
d168f02c7f
@ -1,11 +1,11 @@
|
|||||||
CANONICAL_CONTROL_HOST="titan-db"
|
CANONICAL_CONTROL_HOST="titan-db"
|
||||||
DEFAULT_FLUX_BRANCH="main"
|
DEFAULT_FLUX_BRANCH="main"
|
||||||
STATE_SUBDIR=".local/share/hecate"
|
STATE_SUBDIR=".local/share/ananke"
|
||||||
HARBOR_BUNDLE_BASENAME="harbor-bootstrap-v2.14.1-arm64.tar.zst"
|
HARBOR_BUNDLE_BASENAME="harbor-bootstrap-v2.14.1-arm64.tar.zst"
|
||||||
HARBOR_TARGET_NODE="titan-05"
|
HARBOR_TARGET_NODE=""
|
||||||
HARBOR_CANARY_NODE="titan-04"
|
HARBOR_CANARY_NODE=""
|
||||||
HARBOR_CANARY_IMAGE="registry.bstein.dev/bstein/kubectl:1.35.0"
|
HARBOR_CANARY_IMAGE="registry.bstein.dev/bstein/kubectl:1.35.0"
|
||||||
NODE_HELPER_IMAGE="registry.bstein.dev/bstein/hecate-node-helper:0.1.0"
|
NODE_HELPER_IMAGE="registry.bstein.dev/bstein/ananke-node-helper:0.1.0"
|
||||||
NODE_HELPER_NAMESPACE="maintenance"
|
NODE_HELPER_NAMESPACE="maintenance"
|
||||||
NODE_HELPER_SERVICE_ACCOUNT="default"
|
NODE_HELPER_SERVICE_ACCOUNT="default"
|
||||||
REGISTRY_PULL_SECRET="harbor-regcred"
|
REGISTRY_PULL_SECRET="harbor-regcred"
|
||||||
|
|||||||
@ -2,7 +2,7 @@
|
|||||||
set -euo pipefail
|
set -euo pipefail
|
||||||
|
|
||||||
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
|
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
|
||||||
REPO_DIR="${HECATE_REPO_DIR:-$(cd "${SCRIPT_DIR}/.." && pwd)}"
|
REPO_DIR="${ANANKE_REPO_DIR:-$(cd "${SCRIPT_DIR}/.." && pwd)}"
|
||||||
BOOTSTRAP_DIR="${SCRIPT_DIR}/bootstrap"
|
BOOTSTRAP_DIR="${SCRIPT_DIR}/bootstrap"
|
||||||
CONFIG_FILE="${BOOTSTRAP_DIR}/recovery-config.env"
|
CONFIG_FILE="${BOOTSTRAP_DIR}/recovery-config.env"
|
||||||
if [[ -f "${CONFIG_FILE}" ]]; then
|
if [[ -f "${CONFIG_FILE}" ]]; then
|
||||||
@ -33,9 +33,10 @@ Options:
|
|||||||
--ups-battery-key <key> UPS battery key for upsc (default: battery.charge)
|
--ups-battery-key <key> UPS battery key for upsc (default: battery.charge)
|
||||||
--recovery-state-file <path> Recovery state file for outage-aware restart logic
|
--recovery-state-file <path> Recovery state file for outage-aware restart logic
|
||||||
--harbor-bundle-file <path> Harbor bootstrap bundle on the control host
|
--harbor-bundle-file <path> Harbor bootstrap bundle on the control host
|
||||||
--harbor-target-node <name> Node that should host Harbor during bootstrap (default: ${HARBOR_TARGET_NODE:-titan-05})
|
--harbor-target-node <name> Node that should host Harbor during bootstrap (default: auto)
|
||||||
|
--harbor-canary-node <name> Node used for Harbor pull canary (default: auto)
|
||||||
--harbor-canary-image <image> Harbor-backed image used for pull canary (default: ${HARBOR_CANARY_IMAGE:-registry.bstein.dev/bstein/kubectl:1.35.0})
|
--harbor-canary-image <image> Harbor-backed image used for pull canary (default: ${HARBOR_CANARY_IMAGE:-registry.bstein.dev/bstein/kubectl:1.35.0})
|
||||||
--node-helper-image <image> Privileged helper image used for host operations (default: ${NODE_HELPER_IMAGE:-registry.bstein.dev/bstein/hecate-node-helper:0.1.0})
|
--node-helper-image <image> Privileged helper image used for host operations (default: ${NODE_HELPER_IMAGE:-registry.bstein.dev/bstein/ananke-node-helper:0.1.0})
|
||||||
--bundle-http-port <port> Temporary HTTP port used to serve bootstrap bundles (default: ${BUNDLE_HTTP_PORT:-8877})
|
--bundle-http-port <port> Temporary HTTP port used to serve bootstrap bundles (default: ${BUNDLE_HTTP_PORT:-8877})
|
||||||
--api-wait-timeout <seconds> Startup: Kubernetes API wait timeout (default: 600)
|
--api-wait-timeout <seconds> Startup: Kubernetes API wait timeout (default: 600)
|
||||||
--drain-timeout <seconds> Worker drain timeout for normal shutdown (default: 180)
|
--drain-timeout <seconds> Worker drain timeout for normal shutdown (default: 180)
|
||||||
@ -86,16 +87,18 @@ DRAIN_TIMEOUT_SECONDS=180
|
|||||||
EMERGENCY_DRAIN_TIMEOUT_SECONDS=45
|
EMERGENCY_DRAIN_TIMEOUT_SECONDS=45
|
||||||
API_WAIT_TIMEOUT_SECONDS=600
|
API_WAIT_TIMEOUT_SECONDS=600
|
||||||
BUNDLE_HTTP_PORT="${BUNDLE_HTTP_PORT:-8877}"
|
BUNDLE_HTTP_PORT="${BUNDLE_HTTP_PORT:-8877}"
|
||||||
STATE_ROOT="${HOME}/${STATE_SUBDIR:-.local/share/hecate}"
|
STATE_ROOT="${HOME}/${STATE_SUBDIR:-.local/share/ananke}"
|
||||||
RECOVERY_STATE_FILE="${STATE_ROOT}/cluster_power_recovery.state"
|
RECOVERY_STATE_FILE="${STATE_ROOT}/cluster_power_recovery.state"
|
||||||
HARBOR_BUNDLE_FILE="${STATE_ROOT}/bundles/${HARBOR_BUNDLE_BASENAME:-harbor-bootstrap-v2.14.1-arm64.tar.zst}"
|
HARBOR_BUNDLE_FILE="${STATE_ROOT}/bundles/${HARBOR_BUNDLE_BASENAME:-harbor-bootstrap-v2.14.1-arm64.tar.zst}"
|
||||||
HARBOR_TARGET_NODE="${HARBOR_TARGET_NODE:-titan-05}"
|
HARBOR_TARGET_NODE="${HARBOR_TARGET_NODE:-}"
|
||||||
HARBOR_CANARY_NODE="${HARBOR_CANARY_NODE:-titan-04}"
|
HARBOR_CANARY_NODE="${HARBOR_CANARY_NODE:-}"
|
||||||
HARBOR_CANARY_IMAGE="${HARBOR_CANARY_IMAGE:-registry.bstein.dev/bstein/kubectl:1.35.0}"
|
HARBOR_CANARY_IMAGE="${HARBOR_CANARY_IMAGE:-registry.bstein.dev/bstein/kubectl:1.35.0}"
|
||||||
NODE_HELPER_IMAGE="${NODE_HELPER_IMAGE:-registry.bstein.dev/bstein/hecate-node-helper:0.1.0}"
|
NODE_HELPER_IMAGE="${NODE_HELPER_IMAGE:-registry.bstein.dev/bstein/ananke-node-helper:0.1.0}"
|
||||||
NODE_HELPER_NAMESPACE="${NODE_HELPER_NAMESPACE:-maintenance}"
|
NODE_HELPER_NAMESPACE="${NODE_HELPER_NAMESPACE:-maintenance}"
|
||||||
NODE_HELPER_SERVICE_ACCOUNT="${NODE_HELPER_SERVICE_ACCOUNT:-default}"
|
NODE_HELPER_SERVICE_ACCOUNT="${NODE_HELPER_SERVICE_ACCOUNT:-default}"
|
||||||
|
NODE_HELPER_PREWARM_DS="${NODE_HELPER_PREWARM_DS:-ananke-node-helper-prewarm}"
|
||||||
REGISTRY_PULL_SECRET="${REGISTRY_PULL_SECRET:-harbor-regcred}"
|
REGISTRY_PULL_SECRET="${REGISTRY_PULL_SECRET:-harbor-regcred}"
|
||||||
|
KEEP_PREWARM_DAEMONSET=0
|
||||||
|
|
||||||
RECOVERY_PENDING=0
|
RECOVERY_PENDING=0
|
||||||
STARTUP_ATTEMPTED_DURING_OUTAGE=0
|
STARTUP_ATTEMPTED_DURING_OUTAGE=0
|
||||||
@ -169,6 +172,10 @@ while [[ $# -gt 0 ]]; do
|
|||||||
HARBOR_TARGET_NODE="${2:?missing harbor target node}"
|
HARBOR_TARGET_NODE="${2:?missing harbor target node}"
|
||||||
shift 2
|
shift 2
|
||||||
;;
|
;;
|
||||||
|
--harbor-canary-node)
|
||||||
|
HARBOR_CANARY_NODE="${2:?missing harbor canary node}"
|
||||||
|
shift 2
|
||||||
|
;;
|
||||||
--harbor-canary-image)
|
--harbor-canary-image)
|
||||||
HARBOR_CANARY_IMAGE="${2:?missing canary image}"
|
HARBOR_CANARY_IMAGE="${2:?missing canary image}"
|
||||||
shift 2
|
shift 2
|
||||||
@ -432,6 +439,51 @@ discover_workers_csv() {
|
|||||||
| paste -sd, -
|
| paste -sd, -
|
||||||
}
|
}
|
||||||
|
|
||||||
|
node_is_ready() {
|
||||||
|
local node="$1"
|
||||||
|
[[ -n "${node}" ]] || return 1
|
||||||
|
local ready
|
||||||
|
ready="$(kubectl get node "${node}" -o jsonpath='{range .status.conditions[?(@.type=="Ready")]}{.status}{end}' 2>/dev/null || true)"
|
||||||
|
[[ "${ready}" == "True" ]]
|
||||||
|
}
|
||||||
|
|
||||||
|
select_ready_arm64_worker() {
|
||||||
|
local rows node
|
||||||
|
rows="$(kubectl get nodes -o 'custom-columns=NAME:.metadata.name,ARCH:.metadata.labels.kubernetes\.io/arch,WORKER:.metadata.labels.node-role\.kubernetes\.io/worker,HARDWARE:.metadata.labels.hardware,READY:.status.conditions[?(@.type=="Ready")].status' --no-headers 2>/dev/null || true)"
|
||||||
|
[[ -n "${rows}" ]] || return 1
|
||||||
|
node="$(printf '%s\n' "${rows}" | awk '$2=="arm64" && $3=="true" && $4=="rpi5" && $5=="True" {print $1; exit}')"
|
||||||
|
if [[ -n "${node}" ]]; then
|
||||||
|
printf '%s' "${node}"
|
||||||
|
return 0
|
||||||
|
fi
|
||||||
|
node="$(printf '%s\n' "${rows}" | awk '$2=="arm64" && $3=="true" && $4=="rpi4" && $5=="True" {print $1; exit}')"
|
||||||
|
if [[ -n "${node}" ]]; then
|
||||||
|
printf '%s' "${node}"
|
||||||
|
return 0
|
||||||
|
fi
|
||||||
|
node="$(printf '%s\n' "${rows}" | awk '$2=="arm64" && $3=="true" && $5=="True" {print $1; exit}')"
|
||||||
|
if [[ -n "${node}" ]]; then
|
||||||
|
printf '%s' "${node}"
|
||||||
|
return 0
|
||||||
|
fi
|
||||||
|
return 1
|
||||||
|
}
|
||||||
|
|
||||||
|
ensure_harbor_target_node() {
|
||||||
|
if node_is_ready "${HARBOR_TARGET_NODE}"; then
|
||||||
|
return 0
|
||||||
|
fi
|
||||||
|
local fallback
|
||||||
|
fallback="$(select_ready_arm64_worker || true)"
|
||||||
|
[[ -n "${fallback}" ]] || die "No Ready arm64 worker available for Harbor bootstrap target."
|
||||||
|
if [[ -n "${HARBOR_TARGET_NODE}" ]]; then
|
||||||
|
warn "Configured harbor target node '${HARBOR_TARGET_NODE}' is not Ready; using '${fallback}' instead."
|
||||||
|
else
|
||||||
|
log "harbor-target-node auto-selected: ${fallback}"
|
||||||
|
fi
|
||||||
|
HARBOR_TARGET_NODE="${fallback}"
|
||||||
|
}
|
||||||
|
|
||||||
as_array_from_csv() {
|
as_array_from_csv() {
|
||||||
local csv="$1"
|
local csv="$1"
|
||||||
local out_var="$2"
|
local out_var="$2"
|
||||||
@ -557,9 +609,18 @@ harbor_is_ready() {
|
|||||||
}
|
}
|
||||||
|
|
||||||
run_harbor_pull_canary() {
|
run_harbor_pull_canary() {
|
||||||
local pod="hecate-harbor-canary"
|
local pod="ananke-harbor-canary"
|
||||||
|
local canary_node="${HARBOR_CANARY_NODE}"
|
||||||
|
if ! node_is_ready "${canary_node}"; then
|
||||||
|
ensure_harbor_target_node
|
||||||
|
canary_node="${HARBOR_TARGET_NODE}"
|
||||||
|
if [[ -n "${HARBOR_CANARY_NODE}" ]]; then
|
||||||
|
warn "Configured harbor canary node '${HARBOR_CANARY_NODE}' is not Ready; using '${canary_node}'."
|
||||||
|
fi
|
||||||
|
HARBOR_CANARY_NODE="${canary_node}"
|
||||||
|
fi
|
||||||
if [[ "${EXECUTE}" -eq 0 ]]; then
|
if [[ "${EXECUTE}" -eq 0 ]]; then
|
||||||
log "DRY-RUN: create Harbor pull canary pod with ${HARBOR_CANARY_IMAGE} on ${HARBOR_CANARY_NODE}"
|
log "DRY-RUN: create Harbor pull canary pod with ${HARBOR_CANARY_IMAGE} on ${canary_node}"
|
||||||
return 0
|
return 0
|
||||||
fi
|
fi
|
||||||
timeout 20 kubectl -n "${NODE_HELPER_NAMESPACE}" delete pod "${pod}" --ignore-not-found --wait=false >/dev/null 2>&1 || true
|
timeout 20 kubectl -n "${NODE_HELPER_NAMESPACE}" delete pod "${pod}" --ignore-not-found --wait=false >/dev/null 2>&1 || true
|
||||||
@ -570,7 +631,7 @@ metadata:
|
|||||||
name: ${pod}
|
name: ${pod}
|
||||||
namespace: ${NODE_HELPER_NAMESPACE}
|
namespace: ${NODE_HELPER_NAMESPACE}
|
||||||
spec:
|
spec:
|
||||||
nodeName: ${HARBOR_CANARY_NODE}
|
nodeName: ${canary_node}
|
||||||
restartPolicy: Never
|
restartPolicy: Never
|
||||||
imagePullSecrets:
|
imagePullSecrets:
|
||||||
- name: ${REGISTRY_PULL_SECRET}
|
- name: ${REGISTRY_PULL_SECRET}
|
||||||
@ -597,7 +658,7 @@ run_helper_pod() {
|
|||||||
local purpose="$2"
|
local purpose="$2"
|
||||||
local timeout_seconds="$3"
|
local timeout_seconds="$3"
|
||||||
local script_content="$4"
|
local script_content="$4"
|
||||||
local pod="hecate-$(sanitize_name "${purpose}")-$(date +%H%M%S)"
|
local pod="ananke-$(sanitize_name "${purpose}")-$(date +%H%M%S)"
|
||||||
local encoded_script
|
local encoded_script
|
||||||
encoded_script="$(printf '%s' "${script_content}" | base64 -w0)"
|
encoded_script="$(printf '%s' "${script_content}" | base64 -w0)"
|
||||||
|
|
||||||
@ -631,9 +692,9 @@ spec:
|
|||||||
command: ["/bin/bash", "-ceu"]
|
command: ["/bin/bash", "-ceu"]
|
||||||
args:
|
args:
|
||||||
- |
|
- |
|
||||||
printf '%s' '${encoded_script}' | base64 -d >/tmp/hecate-step.sh
|
printf '%s' '${encoded_script}' | base64 -d >/tmp/ananke-step.sh
|
||||||
chmod +x /tmp/hecate-step.sh
|
chmod +x /tmp/ananke-step.sh
|
||||||
/tmp/hecate-step.sh
|
/tmp/ananke-step.sh
|
||||||
POD
|
POD
|
||||||
|
|
||||||
if ! wait_for_pod_phase "${NODE_HELPER_NAMESPACE}" "${pod}" Succeeded "${timeout_seconds}"; then
|
if ! wait_for_pod_phase "${NODE_HELPER_NAMESPACE}" "${pod}" Succeeded "${timeout_seconds}"; then
|
||||||
@ -663,17 +724,36 @@ SCRIPT
|
|||||||
run_helper_pod "${node}" "${purpose}" "${timeout_seconds}" "${script_content}"
|
run_helper_pod "${node}" "${purpose}" "${timeout_seconds}" "${script_content}"
|
||||||
}
|
}
|
||||||
|
|
||||||
|
run_host_command_via_prewarm_pod() {
|
||||||
|
local node="$1"
|
||||||
|
local host_command="$2"
|
||||||
|
local pod encoded_command
|
||||||
|
pod="$(kubectl -n "${NODE_HELPER_NAMESPACE}" get pods -l app="${NODE_HELPER_PREWARM_DS}" --field-selector "spec.nodeName=${node}" -o jsonpath='{.items[0].metadata.name}' 2>/dev/null || true)"
|
||||||
|
if [[ -z "${pod}" ]]; then
|
||||||
|
return 1
|
||||||
|
fi
|
||||||
|
encoded_command="$(printf '%s' "${host_command}" | base64 -w0)"
|
||||||
|
if [[ "${EXECUTE}" -eq 0 ]]; then
|
||||||
|
log "DRY-RUN: helper exec via ${pod} on ${node}"
|
||||||
|
return 0
|
||||||
|
fi
|
||||||
|
run kubectl -n "${NODE_HELPER_NAMESPACE}" exec "${pod}" -- /bin/bash -ceu "HOST_COMMAND=\$(printf '%s' '${encoded_command}' | base64 -d); nsenter --target 1 --mount --uts --ipc --net --pid /bin/sh -ceu \"\${HOST_COMMAND}\""
|
||||||
|
}
|
||||||
|
|
||||||
schedule_host_shutdown_via_helper() {
|
schedule_host_shutdown_via_helper() {
|
||||||
local node="$1"
|
local node="$1"
|
||||||
local service_name="$2"
|
local service_name="$2"
|
||||||
local delay_seconds="$3"
|
local delay_seconds="$3"
|
||||||
local host_command
|
local host_command
|
||||||
host_command="/usr/bin/systemd-run --unit hecate-shutdown-${service_name} --on-active=${delay_seconds}s /bin/sh -lc '/usr/bin/systemctl stop ${service_name} || true; /usr/bin/systemctl poweroff || true'"
|
host_command="/usr/bin/systemd-run --unit ananke-shutdown-${service_name} --on-active=${delay_seconds}s /bin/sh -lc '/usr/bin/systemctl stop ${service_name} || true; /usr/bin/systemctl poweroff || true'"
|
||||||
|
if run_host_command_via_prewarm_pod "${node}" "${host_command}"; then
|
||||||
|
return 0
|
||||||
|
fi
|
||||||
run_host_command_via_helper "${node}" "shutdown-${node}-${service_name}" 120 "${host_command}"
|
run_host_command_via_helper "${node}" "shutdown-${node}-${service_name}" 120 "${host_command}"
|
||||||
}
|
}
|
||||||
|
|
||||||
prewarm_node_helper_image() {
|
prewarm_node_helper_image() {
|
||||||
local name="hecate-node-helper-prewarm"
|
local name="${NODE_HELPER_PREWARM_DS}"
|
||||||
if [[ "${EXECUTE}" -eq 0 ]]; then
|
if [[ "${EXECUTE}" -eq 0 ]]; then
|
||||||
log "DRY-RUN: prewarm ${NODE_HELPER_IMAGE} via temporary DaemonSet"
|
log "DRY-RUN: prewarm ${NODE_HELPER_IMAGE} via temporary DaemonSet"
|
||||||
return 0
|
return 0
|
||||||
@ -711,7 +791,11 @@ DS
|
|||||||
[[ -n "${ready}" ]] || ready=0
|
[[ -n "${ready}" ]] || ready=0
|
||||||
if [[ "${desired}" != "0" && "${desired}" == "${ready}" ]]; then
|
if [[ "${desired}" != "0" && "${desired}" == "${ready}" ]]; then
|
||||||
log "node-helper-prewarm=${ready}/${desired}"
|
log "node-helper-prewarm=${ready}/${desired}"
|
||||||
kubectl -n "${NODE_HELPER_NAMESPACE}" delete ds "${name}" --ignore-not-found >/dev/null 2>&1 || true
|
if [[ "${KEEP_PREWARM_DAEMONSET}" -eq 0 ]]; then
|
||||||
|
kubectl -n "${NODE_HELPER_NAMESPACE}" delete ds "${name}" --ignore-not-found >/dev/null 2>&1 || true
|
||||||
|
else
|
||||||
|
log "Keeping ${name} DaemonSet running for shutdown helper exec path."
|
||||||
|
fi
|
||||||
return 0
|
return 0
|
||||||
fi
|
fi
|
||||||
sleep 2
|
sleep 2
|
||||||
@ -722,6 +806,14 @@ DS
|
|||||||
die "Timed out prewarming node helper image ${NODE_HELPER_IMAGE}"
|
die "Timed out prewarming node helper image ${NODE_HELPER_IMAGE}"
|
||||||
}
|
}
|
||||||
|
|
||||||
|
cleanup_prewarm_daemonset() {
|
||||||
|
if [[ "${EXECUTE}" -eq 0 ]]; then
|
||||||
|
log "DRY-RUN: cleanup ${NODE_HELPER_PREWARM_DS} DaemonSet"
|
||||||
|
return 0
|
||||||
|
fi
|
||||||
|
kubectl -n "${NODE_HELPER_NAMESPACE}" delete ds "${NODE_HELPER_PREWARM_DS}" --ignore-not-found >/dev/null 2>&1 || true
|
||||||
|
}
|
||||||
|
|
||||||
start_bundle_server() {
|
start_bundle_server() {
|
||||||
[[ -f "${HARBOR_BUNDLE_FILE}" ]] || die "Harbor bundle not found at ${HARBOR_BUNDLE_FILE}"
|
[[ -f "${HARBOR_BUNDLE_FILE}" ]] || die "Harbor bundle not found at ${HARBOR_BUNDLE_FILE}"
|
||||||
require_cmd python3
|
require_cmd python3
|
||||||
@ -732,7 +824,7 @@ start_bundle_server() {
|
|||||||
log "DRY-RUN: serve ${bundle_name} from ${bundle_dir} on port ${BUNDLE_HTTP_PORT}"
|
log "DRY-RUN: serve ${bundle_name} from ${bundle_dir} on port ${BUNDLE_HTTP_PORT}"
|
||||||
return 0
|
return 0
|
||||||
fi
|
fi
|
||||||
python3 -m http.server "${BUNDLE_HTTP_PORT}" --bind 0.0.0.0 --directory "${bundle_dir}" </dev/null >/tmp/hecate-bundle-server.log 2>&1 &
|
python3 -m http.server "${BUNDLE_HTTP_PORT}" --bind 0.0.0.0 --directory "${bundle_dir}" </dev/null >/tmp/ananke-bundle-server.log 2>&1 &
|
||||||
BUNDLE_SERVER_PID=$!
|
BUNDLE_SERVER_PID=$!
|
||||||
for _ in $(seq 1 20); do
|
for _ in $(seq 1 20); do
|
||||||
if curl -fsS "http://127.0.0.1:${BUNDLE_HTTP_PORT}/${bundle_name}" >/dev/null 2>&1; then
|
if curl -fsS "http://127.0.0.1:${BUNDLE_HTTP_PORT}/${bundle_name}" >/dev/null 2>&1; then
|
||||||
@ -740,7 +832,7 @@ start_bundle_server() {
|
|||||||
fi
|
fi
|
||||||
sleep 1
|
sleep 1
|
||||||
done
|
done
|
||||||
die "Temporary bundle server did not become ready; see /tmp/hecate-bundle-server.log"
|
die "Temporary bundle server did not become ready; see /tmp/ananke-bundle-server.log"
|
||||||
}
|
}
|
||||||
|
|
||||||
stop_bundle_server() {
|
stop_bundle_server() {
|
||||||
@ -762,6 +854,7 @@ control_host_ip() {
|
|||||||
seed_harbor_images() {
|
seed_harbor_images() {
|
||||||
local images_text control_ip bundle_name script_content seed_rc=0
|
local images_text control_ip bundle_name script_content seed_rc=0
|
||||||
[[ -f "${HARBOR_BUNDLE_FILE}" ]] || die "Harbor bundle not found at ${HARBOR_BUNDLE_FILE}"
|
[[ -f "${HARBOR_BUNDLE_FILE}" ]] || die "Harbor bundle not found at ${HARBOR_BUNDLE_FILE}"
|
||||||
|
ensure_harbor_target_node
|
||||||
images_text="$(sed '/^[[:space:]]*#/d;/^[[:space:]]*$/d' "${BOOTSTRAP_DIR}/harbor-bootstrap-images.txt")"
|
images_text="$(sed '/^[[:space:]]*#/d;/^[[:space:]]*$/d' "${BOOTSTRAP_DIR}/harbor-bootstrap-images.txt")"
|
||||||
[[ -n "${images_text}" ]] || die "No Harbor images listed in ${BOOTSTRAP_DIR}/harbor-bootstrap-images.txt"
|
[[ -n "${images_text}" ]] || die "No Harbor images listed in ${BOOTSTRAP_DIR}/harbor-bootstrap-images.txt"
|
||||||
bundle_name="$(basename "${HARBOR_BUNDLE_FILE}")"
|
bundle_name="$(basename "${HARBOR_BUNDLE_FILE}")"
|
||||||
@ -839,15 +932,25 @@ resume_flux_and_reconcile() {
|
|||||||
|
|
||||||
status_report() {
|
status_report() {
|
||||||
local battery flux_ready harbor_code workers
|
local battery flux_ready harbor_code workers
|
||||||
|
local effective_target effective_canary
|
||||||
battery="$(read_ups_battery || true)"
|
battery="$(read_ups_battery || true)"
|
||||||
flux_ready="$(kubectl -n flux-system get gitrepository flux-system -o jsonpath='{.status.conditions[?(@.type=="Ready")].status}' 2>/dev/null || true)"
|
flux_ready="$(kubectl -n flux-system get gitrepository flux-system -o jsonpath='{.status.conditions[?(@.type=="Ready")].status}' 2>/dev/null || true)"
|
||||||
harbor_code="$(curl -ksS -o /dev/null -w '%{http_code}' https://registry.bstein.dev/v2/ || true)"
|
harbor_code="$(curl -ksS -o /dev/null -w '%{http_code}' https://registry.bstein.dev/v2/ || true)"
|
||||||
workers="$(discover_workers_csv 2>/dev/null || true)"
|
workers="$(discover_workers_csv 2>/dev/null || true)"
|
||||||
|
effective_target="${HARBOR_TARGET_NODE}"
|
||||||
|
if ! node_is_ready "${effective_target}"; then
|
||||||
|
effective_target="$(select_ready_arm64_worker || true)"
|
||||||
|
fi
|
||||||
|
effective_canary="${HARBOR_CANARY_NODE}"
|
||||||
|
if ! node_is_ready "${effective_canary}"; then
|
||||||
|
effective_canary="${effective_target}"
|
||||||
|
fi
|
||||||
echo "mode=status"
|
echo "mode=status"
|
||||||
echo "bundle_file=${HARBOR_BUNDLE_FILE}"
|
echo "bundle_file=${HARBOR_BUNDLE_FILE}"
|
||||||
echo "bundle_present=$([[ -f "${HARBOR_BUNDLE_FILE}" ]] && echo true || echo false)"
|
echo "bundle_present=$([[ -f "${HARBOR_BUNDLE_FILE}" ]] && echo true || echo false)"
|
||||||
echo "node_helper_image=${NODE_HELPER_IMAGE}"
|
echo "node_helper_image=${NODE_HELPER_IMAGE}"
|
||||||
echo "harbor_target_node=${HARBOR_TARGET_NODE}"
|
echo "harbor_target_node=${effective_target:-unknown}"
|
||||||
|
echo "harbor_canary_node=${effective_canary:-unknown}"
|
||||||
echo "workers=${workers}"
|
echo "workers=${workers}"
|
||||||
echo "recovery_pending=${RECOVERY_PENDING}"
|
echo "recovery_pending=${RECOVERY_PENDING}"
|
||||||
echo "startup_attempted=${STARTUP_ATTEMPTED_DURING_OUTAGE}"
|
echo "startup_attempted=${STARTUP_ATTEMPTED_DURING_OUTAGE}"
|
||||||
@ -876,6 +979,7 @@ planned_shutdown() {
|
|||||||
save_recovery_state 1 0 shutdown_started
|
save_recovery_state 1 0 shutdown_started
|
||||||
|
|
||||||
if [[ "${SKIP_HELPER_PREWARM}" -eq 0 ]]; then
|
if [[ "${SKIP_HELPER_PREWARM}" -eq 0 ]]; then
|
||||||
|
KEEP_PREWARM_DAEMONSET=1
|
||||||
prewarm_node_helper_image
|
prewarm_node_helper_image
|
||||||
mark_checkpoint shutdown_helper_prewarmed
|
mark_checkpoint shutdown_helper_prewarmed
|
||||||
fi
|
fi
|
||||||
@ -911,6 +1015,9 @@ planned_shutdown() {
|
|||||||
[[ -z "${node}" ]] && continue
|
[[ -z "${node}" ]] && continue
|
||||||
schedule_host_shutdown_via_helper "${node}" k3s 45
|
schedule_host_shutdown_via_helper "${node}" k3s 45
|
||||||
done
|
done
|
||||||
|
if [[ "${SKIP_HELPER_PREWARM}" -eq 0 ]]; then
|
||||||
|
cleanup_prewarm_daemonset
|
||||||
|
fi
|
||||||
mark_checkpoint shutdown_control_planes_scheduled
|
mark_checkpoint shutdown_control_planes_scheduled
|
||||||
log "Shutdown actions scheduled on hosts."
|
log "Shutdown actions scheduled on hosts."
|
||||||
}
|
}
|
||||||
@ -1022,6 +1129,8 @@ log "mode=${MODE} execute=${EXECUTE}"
|
|||||||
log "recovery-state-file=${RECOVERY_STATE_FILE}"
|
log "recovery-state-file=${RECOVERY_STATE_FILE}"
|
||||||
log "bundle-file=${HARBOR_BUNDLE_FILE}"
|
log "bundle-file=${HARBOR_BUNDLE_FILE}"
|
||||||
log "node-helper-image=${NODE_HELPER_IMAGE}"
|
log "node-helper-image=${NODE_HELPER_IMAGE}"
|
||||||
|
log "harbor-target-node-config=${HARBOR_TARGET_NODE:-auto}"
|
||||||
|
log "harbor-canary-node-config=${HARBOR_CANARY_NODE:-auto}"
|
||||||
report_flux_source_state
|
report_flux_source_state
|
||||||
|
|
||||||
case "${MODE}" in
|
case "${MODE}" in
|
||||||
|
|||||||
@ -41,6 +41,7 @@ spec:
|
|||||||
ingress:
|
ingress:
|
||||||
className: traefik
|
className: traefik
|
||||||
annotations:
|
annotations:
|
||||||
|
cert-manager.io/cluster-issuer: letsencrypt
|
||||||
traefik.ingress.kubernetes.io/router.entrypoints: websecure
|
traefik.ingress.kubernetes.io/router.entrypoints: websecure
|
||||||
traefik.ingress.kubernetes.io/router.tls: "true"
|
traefik.ingress.kubernetes.io/router.tls: "true"
|
||||||
hosts:
|
hosts:
|
||||||
@ -77,8 +78,6 @@ spec:
|
|||||||
image:
|
image:
|
||||||
repository: registry.bstein.dev/infra/harbor-redis
|
repository: registry.bstein.dev/infra/harbor-redis
|
||||||
tag: v2.14.1-arm64 # {"$imagepolicy": "harbor:harbor-redis:tag"}
|
tag: v2.14.1-arm64 # {"$imagepolicy": "harbor:harbor-redis:tag"}
|
||||||
nodeSelector:
|
|
||||||
kubernetes.io/hostname: titan-05
|
|
||||||
affinity:
|
affinity:
|
||||||
nodeAffinity:
|
nodeAffinity:
|
||||||
requiredDuringSchedulingIgnoredDuringExecution:
|
requiredDuringSchedulingIgnoredDuringExecution:
|
||||||
@ -113,8 +112,6 @@ spec:
|
|||||||
image:
|
image:
|
||||||
repository: registry.bstein.dev/infra/harbor-core
|
repository: registry.bstein.dev/infra/harbor-core
|
||||||
tag: v2.14.1-arm64 # {"$imagepolicy": "harbor:harbor-core:tag"}
|
tag: v2.14.1-arm64 # {"$imagepolicy": "harbor:harbor-core:tag"}
|
||||||
nodeSelector:
|
|
||||||
kubernetes.io/hostname: titan-05
|
|
||||||
serviceAccountName: harbor-vault-sync
|
serviceAccountName: harbor-vault-sync
|
||||||
automountServiceAccountToken: true
|
automountServiceAccountToken: true
|
||||||
existingSecret: harbor-core
|
existingSecret: harbor-core
|
||||||
@ -174,8 +171,6 @@ spec:
|
|||||||
image:
|
image:
|
||||||
repository: registry.bstein.dev/infra/harbor-jobservice
|
repository: registry.bstein.dev/infra/harbor-jobservice
|
||||||
tag: v2.14.1-arm64 # {"$imagepolicy": "harbor:harbor-jobservice:tag"}
|
tag: v2.14.1-arm64 # {"$imagepolicy": "harbor:harbor-jobservice:tag"}
|
||||||
nodeSelector:
|
|
||||||
kubernetes.io/hostname: titan-05
|
|
||||||
serviceAccountName: harbor-vault-sync
|
serviceAccountName: harbor-vault-sync
|
||||||
automountServiceAccountToken: true
|
automountServiceAccountToken: true
|
||||||
existingSecret: harbor-jobservice
|
existingSecret: harbor-jobservice
|
||||||
@ -216,8 +211,6 @@ spec:
|
|||||||
image:
|
image:
|
||||||
repository: registry.bstein.dev/infra/harbor-portal
|
repository: registry.bstein.dev/infra/harbor-portal
|
||||||
tag: v2.14.1-arm64 # {"$imagepolicy": "harbor:harbor-portal:tag"}
|
tag: v2.14.1-arm64 # {"$imagepolicy": "harbor:harbor-portal:tag"}
|
||||||
nodeSelector:
|
|
||||||
kubernetes.io/hostname: titan-05
|
|
||||||
affinity:
|
affinity:
|
||||||
nodeAffinity:
|
nodeAffinity:
|
||||||
requiredDuringSchedulingIgnoredDuringExecution:
|
requiredDuringSchedulingIgnoredDuringExecution:
|
||||||
@ -294,8 +287,6 @@ spec:
|
|||||||
{{- with secret "kv/data/atlas/harbor/harbor-registry-htpasswd" -}}
|
{{- with secret "kv/data/atlas/harbor/harbor-registry-htpasswd" -}}
|
||||||
{{ .Data.data.REGISTRY_HTPASSWD }}
|
{{ .Data.data.REGISTRY_HTPASSWD }}
|
||||||
{{- end }}
|
{{- end }}
|
||||||
nodeSelector:
|
|
||||||
kubernetes.io/hostname: titan-05
|
|
||||||
affinity:
|
affinity:
|
||||||
nodeAffinity:
|
nodeAffinity:
|
||||||
requiredDuringSchedulingIgnoredDuringExecution:
|
requiredDuringSchedulingIgnoredDuringExecution:
|
||||||
@ -321,8 +312,6 @@ spec:
|
|||||||
image:
|
image:
|
||||||
repository: registry.bstein.dev/infra/harbor-nginx
|
repository: registry.bstein.dev/infra/harbor-nginx
|
||||||
tag: v2.14.1-arm64 # {"$imagepolicy": "harbor:harbor-nginx:tag"}
|
tag: v2.14.1-arm64 # {"$imagepolicy": "harbor:harbor-nginx:tag"}
|
||||||
nodeSelector:
|
|
||||||
kubernetes.io/hostname: titan-05
|
|
||||||
affinity:
|
affinity:
|
||||||
nodeAffinity:
|
nodeAffinity:
|
||||||
requiredDuringSchedulingIgnoredDuringExecution:
|
requiredDuringSchedulingIgnoredDuringExecution:
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user