From 0c2b59f7ccf4c7104b49e8aba16c9495a28c4da2 Mon Sep 17 00:00:00 2001
From: jenkins
Date: Thu, 18 Jun 2026 18:20:22 -0300
Subject: [PATCH] recovery(ananke): avoid unnecessary longhorn sidecar churn

---
Review notes (this section is discarded by git am):
- Line structure was restored from a whitespace-collapsed copy of this patch.
  Context-line indentation is assumed to be two spaces per level; if a hunk
  does not match, apply with --ignore-whitespace.
- The sidecar probe now reports "status unknown" (exit 2) separately from
  "not failing" (exit 1); the caller falls back to removal when unknown.
- The post-image blob id on the index line predates these edits.

 scripts/cluster_power_recovery.sh | 60 ++++++++++++++++++++++++++++++-
 1 file changed, 59 insertions(+), 1 deletion(-)

diff --git a/scripts/cluster_power_recovery.sh b/scripts/cluster_power_recovery.sh
index eb8d1451..fedec2ab 100755
--- a/scripts/cluster_power_recovery.sh
+++ b/scripts/cluster_power_recovery.sh
@@ -1048,6 +1048,64 @@ remove_longhorn_manager_prepull_sidecar() {
   done <<< "${indexes}"
 }
 
+# Report whether any longhorn-manager pod has its
+# "pre-pull-share-manager-image" container waiting in an image-pull or
+# container-start failure state.
+# Returns:
+#   0  at least one such container is waiting with a failure reason
+#   1  pod status was read and no such container is failing
+#   2  pod status could not be fetched or parsed (state unknown)
+longhorn_manager_prepull_sidecar_has_pull_failures() {
+  local pods_json jq_rc=0
+  # Run kubectl on its own so an API failure is reported as "unknown"
+  # instead of being folded into jq's "no match" result.
+  if ! pods_json="$(kubectl -n longhorn-system get pods -l app=longhorn-manager -o json)"; then
+    return 2
+  fi
+  jq -e '
+      [
+        .items[].status.containerStatuses[]?
+        | select(.name == "pre-pull-share-manager-image")
+        | select(((.state.waiting.reason // "") | test("ImagePullBackOff|ErrImagePull|CreateContainerError|RunContainerError|InvalidImageName")))
+      ]
+      | length > 0' <<< "${pods_json}" >/dev/null 2>&1 || jq_rc=$?
+  # jq -e exits 0 for true and 1 for false; any other status means the
+  # JSON could not be processed.
+  case "${jq_rc}" in
+    0 | 1) return "${jq_rc}" ;;
+    *) return 2 ;;
+  esac
+}
+
+# Remove the Longhorn manager pre-pull sidecar only when it is likely to be
+# harmful: Harbor is unhealthy, the sidecar is already failing, or its state
+# cannot be determined (falls back to the earlier unconditional removal).
+# Otherwise the sidecar is kept to avoid needless churn.
+remove_longhorn_manager_prepull_sidecar_if_needed() {
+  local probe_rc=0
+  if ! harbor_endpoint_is_ready 1; then
+    warn "Removing Longhorn manager pre-pull sidecar because Harbor registry API is unhealthy."
+    remove_longhorn_manager_prepull_sidecar
+    return 0
+  fi
+  longhorn_manager_prepull_sidecar_has_pull_failures || probe_rc=$?
+  case "${probe_rc}" in
+    0)
+      warn "Removing Longhorn manager pre-pull sidecar because it is in image/runtime failure."
+      remove_longhorn_manager_prepull_sidecar
+      return 0
+      ;;
+    1)
+      log "longhorn-manager-prepull-sidecar=retained harbor=healthy pull_failures=false"
+      ;;
+    *)
+      warn "Removing Longhorn manager pre-pull sidecar because its pod status could not be read."
+      remove_longhorn_manager_prepull_sidecar
+      return 0
+      ;;
+  esac
+}
+
 save_longhorn_unlock_optional_replica_snapshot() {
   if [[ "${EXECUTE}" -eq 0 ]]; then
     log "DRY-RUN: save optional workload snapshot to ${LONGHORN_UNLOCK_REPLICA_SNAPSHOT_FILE}"
@@ -2984,7 +3042,7 @@ longhorn_unlock_flow() {
   REFRESH_BOOTSTRAP_IMAGE_ALIASES=1
   freeze_longhorn_deadlock_automation
   ensure_longhorn_cache_first_policy
-  remove_longhorn_manager_prepull_sidecar
+  remove_longhorn_manager_prepull_sidecar_if_needed
   free_longhorn_instance_manager_headroom
   delete_failed_nonstorage_pods_for_headroom
   repair_longhorn_manager_cache_deadlock || warn "Surgical Longhorn manager cache repair did not complete on every affected node."