recovery(ananke): handle longhorn harbor deadlock

This commit is contained in:
jenkins 2026-06-18 18:02:32 -03:00
parent cf5caedd56
commit 0f58aa16a9
6 changed files with 1219 additions and 54 deletions

View File

@ -37,7 +37,7 @@ spec:
createSecret: false
registrySecret: longhorn-registry
image:
pullPolicy: Always
pullPolicy: IfNotPresent
longhorn:
engine:
repository: registry.bstein.dev/infra/longhorn-engine
@ -80,7 +80,7 @@ spec:
repository: registry.bstein.dev/infra/longhorn-livenessprobe
tag: v2.16.0
defaultSettings:
systemManagedPodsImagePullPolicy: Always
systemManagedPodsImagePullPolicy: if-not-present
taintToleration: veles.bstein.dev/simulation=true:NoSchedule
longhornManager:
tolerations:

View File

@ -1,4 +1,4 @@
# Harbor cold-start bootstrap images.
# Harbor and Longhorn cold-start bootstrap images.
registry.bstein.dev/infra/harbor-core:v2.14.1-arm64
registry.bstein.dev/infra/harbor-jobservice:v2.14.1-arm64
registry.bstein.dev/infra/harbor-portal:v2.14.1-arm64
@ -7,3 +7,18 @@ registry.bstein.dev/infra/harbor-registryctl:v2.14.1-arm64
registry.bstein.dev/infra/harbor-redis:v2.14.1-arm64
registry.bstein.dev/infra/harbor-nginx:v2.14.1-arm64
registry.bstein.dev/infra/harbor-prepare:v2.14.1-arm64
# Longhorn must be able to start before Harbor is fully healthy.
registry.bstein.dev/infra/longhorn-engine:v1.8.2
registry.bstein.dev/infra/longhorn-manager:v1.8.2
registry.bstein.dev/infra/longhorn-ui:v1.8.2
registry.bstein.dev/infra/longhorn-instance-manager:v1.8.2
registry.bstein.dev/infra/longhorn-share-manager:v1.8.2
registry.bstein.dev/infra/longhorn-backing-image-manager:v1.8.2
registry.bstein.dev/infra/longhorn-support-bundle-kit:v0.0.56
registry.bstein.dev/infra/longhorn-csi-attacher:v4.9.0
registry.bstein.dev/infra/longhorn-csi-provisioner:v5.3.0
registry.bstein.dev/infra/longhorn-csi-node-driver-registrar:v2.14.0
registry.bstein.dev/infra/longhorn-csi-resizer:v1.13.2
registry.bstein.dev/infra/longhorn-csi-snapshotter:v8.2.0
registry.bstein.dev/infra/longhorn-livenessprobe:v2.16.0

View File

@ -0,0 +1,14 @@
# Longhorn images needed when Harbor is unhealthy during storage recovery.
registry.bstein.dev/infra/longhorn-engine:v1.8.2
registry.bstein.dev/infra/longhorn-manager:v1.8.2
registry.bstein.dev/infra/longhorn-ui:v1.8.2
registry.bstein.dev/infra/longhorn-instance-manager:v1.8.2
registry.bstein.dev/infra/longhorn-share-manager:v1.8.2
registry.bstein.dev/infra/longhorn-backing-image-manager:v1.8.2
registry.bstein.dev/infra/longhorn-support-bundle-kit:v0.0.56
registry.bstein.dev/infra/longhorn-csi-attacher:v4.9.0
registry.bstein.dev/infra/longhorn-csi-provisioner:v5.3.0
registry.bstein.dev/infra/longhorn-csi-node-driver-registrar:v2.14.0
registry.bstein.dev/infra/longhorn-csi-resizer:v1.13.2
registry.bstein.dev/infra/longhorn-csi-snapshotter:v8.2.0
registry.bstein.dev/infra/longhorn-livenessprobe:v2.16.0

View File

@ -4,7 +4,9 @@ EXPECTED_FLUX_URL="ssh://git@scm.bstein.dev:2242/bstein/titan-iac.git"
SHUTDOWN_MODE="host-poweroff"
STATE_SUBDIR=".local/share/ananke"
HARBOR_BUNDLE_BASENAME="harbor-bootstrap-v2.14.1-arm64.tar.zst"
HARBOR_TARGET_NODE=""
BOOTSTRAP_BUNDLE_ARCH="arm64"
RECOVERY_UNCORDON_DENYLIST="titan-18,titan-22,titan-24"
HARBOR_TARGET_NODE="titan-11"
HARBOR_CANARY_NODE=""
HARBOR_HOST_LABEL_KEY="ananke.bstein.dev/harbor-bootstrap"
HARBOR_CANARY_IMAGE="registry.bstein.dev/bstein/kubectl:1.35.0"
@ -33,4 +35,4 @@ STARTUP_INCLUDE_INGRESS_CHECKS="1"
STARTUP_INGRESS_ALLOWED_STATUSES="200,301,302,307,308,401,403,404"
STARTUP_IGNORE_INGRESS_HOSTS_REGEX=""
STARTUP_INGRESS_CHECK_TIMEOUT_SECONDS="10"
STARTUP_SERVICE_CHECKLIST='gitea|https://scm.bstein.dev/api/healthz|200|"status":"pass"||;grafana|https://metrics.bstein.dev/api/health|200|"database":"ok"||;harbor|https://registry.bstein.dev/v2/|200,401|||'
STARTUP_SERVICE_CHECKLIST='gitea|https://scm.bstein.dev/api/healthz|200|"status":"pass"||;grafana|https://metrics.bstein.dev/api/health|200|"database":"ok"||;harbor|https://registry.bstein.dev/v2/|401|unauthorized|<html|'

View File

@ -5,6 +5,7 @@ IMAGES_FILE="scripts/bootstrap/harbor-bootstrap-images.txt"
BUNDLE_FILE="artifacts/harbor-bootstrap-v2.14.1-arm64.tar.zst"
DOCKER_CONFIG_PATH=""
PLATFORM="linux/arm64"
ZSTD_LEVEL="${ZSTD_LEVEL:-19}"
while [[ $# -gt 0 ]]; do
case "$1" in
@ -24,9 +25,13 @@ while [[ $# -gt 0 ]]; do
PLATFORM="${2:?missing platform}"
shift 2
;;
--zstd-level)
ZSTD_LEVEL="${2:?missing zstd compression level}"
shift 2
;;
-h|--help)
cat <<USAGE
Usage: scripts/build_harbor_bootstrap_bundle.sh [--images-file <path>] [--bundle-file <path>] [--docker-config <path>] [--platform <linux/arm64>]
Usage: scripts/build_harbor_bootstrap_bundle.sh [--images-file <path>] [--bundle-file <path>] [--docker-config <path>] [--platform <linux/arm64>] [--zstd-level <level>]
USAGE
exit 0
;;
@ -47,12 +52,54 @@ if [[ ${#IMAGES[@]} -eq 0 ]]; then
exit 1
fi
#######################################
# Translate a private-registry Longhorn alias into the upstream image
# reference it mirrors, keeping the same tag.
# Arguments: $1 - image reference, e.g. registry.bstein.dev/infra/<name>:<tag>
# Outputs:   upstream reference on stdout for the known longhorn-* aliases;
#            any other reference is echoed back unchanged.
#######################################
source_image_for_alias() {
  local image="$1"
  local mirror_prefix="registry.bstein.dev/infra/"
  local remainder name upstream_repo

  # Only "<mirror_prefix><name>:<tag>" references can be aliases.
  if [[ "${image}" != "${mirror_prefix}"*:* ]]; then
    echo "${image}"
    return 0
  fi

  # Everything between the mirror prefix and the first colon is the name.
  remainder="${image#"${mirror_prefix}"}"
  name="${remainder%%:*}"

  case "${name}" in
    longhorn-engine | longhorn-manager | longhorn-ui | \
      longhorn-instance-manager | longhorn-share-manager)
      # Same repository name under the longhornio namespace.
      upstream_repo="docker.io/longhornio/${name}"
      ;;
    longhorn-backing-image-manager | longhorn-support-bundle-kit)
      # Upstream name has no "longhorn-" prefix.
      upstream_repo="docker.io/longhornio/${name#longhorn-}"
      ;;
    longhorn-csi-attacher | longhorn-csi-provisioner | \
      longhorn-csi-node-driver-registrar | longhorn-csi-resizer | \
      longhorn-csi-snapshotter | longhorn-livenessprobe)
      # CSI sidecars map to registry.k8s.io/sig-storage without the prefix.
      upstream_repo="registry.k8s.io/sig-storage/${name#longhorn-}"
      ;;
    *)
      echo "${image}"
      return 0
      ;;
  esac

  # The tag is whatever follows the last colon of the original reference.
  echo "${upstream_repo}:${image##*:}"
}
#######################################
# Make sure an image reference exists in the local Docker image store.
# Tries, in order: an already-present local image, a pull of the reference
# itself, then a pull of the upstream image reported by
# source_image_for_alias, re-tagged under the requested reference.
# Globals:   PLATFORM (read) - passed to `docker pull --platform`
# Arguments: $1 - image reference to provide locally
# Outputs:   progress messages on stderr
# Returns:   0 when the image is available locally; non-zero when the direct
#            pull failed and there is no distinct upstream alias, or when the
#            upstream pull or the re-tag failed.
#######################################
pull_or_tag_image() {
  local image="$1"
  local source_image

  if docker image inspect "${image}" >/dev/null 2>&1; then
    echo "Using cached ${image}" >&2
    return 0
  fi

  echo "Pulling ${image}" >&2
  if docker pull --platform "${PLATFORM}" "${image}" >/dev/null; then
    return 0
  fi

  # Declared separately above so a failing lookup is not masked by `local`.
  source_image="$(source_image_for_alias "${image}")" || return
  if [[ "${source_image}" == "${image}" ]]; then
    # No distinct upstream to fall back to.
    return 1
  fi

  echo "Pulling ${source_image} for ${image}" >&2
  # Fail explicitly: `set -e` is ignored when this function is called from a
  # condition (`if`, `||`, `&&`), and tagging after a failed pull could
  # silently reuse a stale local copy of the upstream image.
  docker pull --platform "${PLATFORM}" "${source_image}" >/dev/null || return
  docker tag "${source_image}" "${image}"
}
mkdir -p "$(dirname "${BUNDLE_FILE}")"
for image in "${IMAGES[@]}"; do
echo "Pulling ${image}" >&2
docker pull --platform "${PLATFORM}" "${image}" >/dev/null
pull_or_tag_image "${image}"
done
docker save "${IMAGES[@]}" | zstd -T0 -19 -o "${BUNDLE_FILE}"
tmp_bundle="${BUNDLE_FILE}.tmp"
rm -f "${tmp_bundle}"
docker save "${IMAGES[@]}" | zstd -T0 -"${ZSTD_LEVEL}" -o "${tmp_bundle}"
mv "${tmp_bundle}" "${BUNDLE_FILE}"
echo "Wrote ${BUNDLE_FILE}" >&2

File diff suppressed because it is too large Load Diff