2026-04-06 00:22:54 -03:00
#!/usr/bin/env bash
# Resolve where this script and its repository live, then load optional
# site-specific overrides before any defaults are computed.
set -euo pipefail

# Absolute directory that contains this script.
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"

# Repository root: ANANKE_REPO_DIR wins, otherwise the parent of SCRIPT_DIR.
REPO_DIR="${ANANKE_REPO_DIR:-$(cd "${SCRIPT_DIR}/.." && pwd)}"

BOOTSTRAP_DIR="${SCRIPT_DIR}/bootstrap"
CONFIG_FILE="${BOOTSTRAP_DIR}/recovery-config.env"

# The config file is optional; when present it is sourced into this shell.
if [[ -f "${CONFIG_FILE}" ]]; then
  # shellcheck disable=SC1090
  source "${CONFIG_FILE}"
fi

# Only fall back to a kubeconfig next to the script when the caller has not
# already provided one through the environment.
if [[ -z "${KUBECONFIG:-}" ]] && [[ -f "${SCRIPT_DIR}/kubeconfig" ]]; then
  KUBECONFIG="${SCRIPT_DIR}/kubeconfig"
  export KUBECONFIG
fi
2026-04-06 00:22:54 -03:00
# Print the command-line help text to stdout.
#
# Defaults that can be overridden through the environment (or the sourced
# recovery config) are rendered from the corresponding variable so the help
# always shows the value that would actually be used. Defaults that are fixed
# in the script are printed literally.
# Globals (read, all optional): SHUTDOWN_MODE, DEFAULT_FLUX_BRANCH,
#   MIN_STARTUP_BATTERY, UPS_HOST, UPS_BATTERY_KEY, BOOTSTRAP_BUNDLE_ARCH,
#   HARBOR_HOST_LABEL_KEY, HARBOR_CANARY_IMAGE, NODE_HELPER_IMAGE,
#   BUNDLE_HTTP_PORT, FLUX_READY_TIMEOUT_SECONDS,
#   STARTUP_CHECKLIST_TIMEOUT_SECONDS, STARTUP_WORKLOAD_TIMEOUT_SECONDS,
#   STARTUP_STABILITY_WINDOW_SECONDS, STARTUP_STABILITY_TIMEOUT_SECONDS
# Every expansion carries its own fallback, so this is safe to call under
# `set -u` before the defaults section of the script has run.
usage() {
  cat <<USAGE
Usage:
  scripts/cluster_power_recovery.sh <prepare|status|bootstrap-seed|harbor-seed|longhorn-seed|longhorn-unlock|shutdown|startup> [options]

Options:
  --execute                       Actually run commands (default is dry-run)
  --shutdown-mode <mode>          Shutdown behavior: host-poweroff or cluster-only (default: ${SHUTDOWN_MODE:-host-poweroff})
  --expected-flux-branch <name>   Expected Flux source branch during startup checks (default: ${DEFAULT_FLUX_BRANCH:-main})
  --expected-flux-url <url>       Expected Flux source URL during startup checks
  --allow-flux-source-mutation    Required to allow --force-flux-url during startup
  --force-flux-url <url>          Startup: patch flux-system GitRepository URL to this value
  --force-flux-branch <name>      Startup: patch flux-system GitRepository branch to this value
  --skip-etcd-snapshot            Shutdown: skip etcd snapshot before shutdown
  --skip-drain                    Shutdown: skip worker drain during shutdown
  --skip-local-bootstrap          Startup: skip local bootstrap fallback applies
  --skip-harbor-bootstrap         Startup: skip Harbor recovery bootstrap stage
  --skip-harbor-seed              Startup: skip bootstrap image seed/import stage
  --skip-helper-prewarm           Prepare/Shutdown/Startup: skip node-helper prewarm
  --refresh-bootstrap-image-aliases
                                  Remove bootstrap image aliases before import, to clear poisoned registry pulls
  --min-startup-battery <pct>     Minimum UPS percent required before bootstrap (default: ${MIN_STARTUP_BATTERY:-35})
  --ups-host <name>               UPS identifier for upsc (default: ${UPS_HOST:-ups@localhost})
  --ups-battery-key <key>         UPS battery key for upsc (default: ${UPS_BATTERY_KEY:-battery.charge})
  --recovery-state-file <path>    Recovery state file for outage-aware restart logic
  --replica-snapshot-file <path>
                                  File used to persist workload replica snapshot across shutdown/startup
  --bootstrap-images-file <path>
                                  Image list expected inside the bootstrap bundle
  --harbor-bundle-file <path>     Bootstrap bundle on the control host
  --longhorn-unlock-bundle-file <path>
                                  Longhorn-only bundle for Harbor-deadlock recovery
  --longhorn-unlock-images-file <path>
                                  Longhorn-only image list for Harbor-deadlock recovery
  --longhorn-manager-cache-bundle-file <path>
                                  Single-image Longhorn manager cache repair archive
  --skip-longhorn-unlock-bundle-seed
                                  Longhorn unlock: skip full Longhorn bundle seed and run surgical repairs only
  --bootstrap-bundle-arch <arch>
                                  Node architecture expected by the bootstrap bundle (default: ${BOOTSTRAP_BUNDLE_ARCH:-arm64})
  --harbor-target-node <name>     Node that should host Harbor during bootstrap (default: auto)
  --harbor-canary-node <name>     Node used for Harbor pull canary (default: auto)
  --harbor-host-label-key <key>   Node label key used to pin Harbor bootstrap workloads (default: ${HARBOR_HOST_LABEL_KEY:-ananke.bstein.dev/harbor-bootstrap})
  --harbor-canary-image <image>   Harbor-backed image used for pull canary (default: ${HARBOR_CANARY_IMAGE:-registry.bstein.dev/bstein/kubectl:1.35.0})
  --node-helper-image <image>     Privileged helper image used for host operations (default: ${NODE_HELPER_IMAGE:-registry.bstein.dev/bstein/ananke-node-helper:0.1.0})
  --bundle-http-port <port>       Temporary HTTP port used to serve bootstrap bundles (default: ${BUNDLE_HTTP_PORT:-8877})
  --api-wait-timeout <seconds>    Startup: Kubernetes API wait timeout (default: 600)
  --drain-timeout <seconds>       Worker drain timeout for normal shutdown (default: 180)
  --emergency-drain-timeout <seconds>
                                  Worker drain timeout for emergency fallback (default: 45)
  --flux-ready-timeout <seconds>
                                  Startup: max time to wait for Flux kustomizations Ready (default: ${FLUX_READY_TIMEOUT_SECONDS:-1200})
  --startup-checklist-timeout <seconds>
                                  Startup: max time to wait for external service checklist (default: ${STARTUP_CHECKLIST_TIMEOUT_SECONDS:-900})
  --startup-workload-timeout <seconds>
                                  Startup: max time to wait for workload readiness checks (default: ${STARTUP_WORKLOAD_TIMEOUT_SECONDS:-900})
  --startup-stability-window <seconds>
                                  Startup: continuous healthy window required before success (default: ${STARTUP_STABILITY_WINDOW_SECONDS:-180})
  --startup-stability-timeout <seconds>
                                  Startup: max time allowed to achieve the healthy window (default: ${STARTUP_STABILITY_TIMEOUT_SECONDS:-900})
  --require-ups-battery           Hard-fail startup if UPS battery cannot be read
  -h, --help                      Show help

Examples:
  scripts/cluster_power_recovery.sh prepare --execute
  scripts/cluster_power_recovery.sh bootstrap-seed --execute
  scripts/cluster_power_recovery.sh harbor-seed --execute
  scripts/cluster_power_recovery.sh longhorn-unlock --execute
  scripts/cluster_power_recovery.sh status
  scripts/cluster_power_recovery.sh shutdown --execute
  scripts/cluster_power_recovery.sh startup --execute --force-flux-branch main
USAGE
}
# The first positional argument selects the operating mode. No mode at all, or
# an explicit help request, prints the usage text and exits successfully; an
# unrecognized mode prints an error plus the usage text and exits with 1.
MODE="${1:-}"
case "${MODE}" in
  "" | -h | --help)
    usage
    exit 0
    ;;
  prepare | status | bootstrap-seed | harbor-seed | longhorn-seed | longhorn-unlock | shutdown | startup)
    ;;
  *)
    echo "Unknown mode: ${MODE}" >&2
    usage
    exit 1
    ;;
esac
# Drop the mode so that only options remain in "$@".
shift || true
2026-04-06 00:22:54 -03:00
# Default settings.
#
# Plain assignments are script-fixed starting values (several can still be
# changed by command-line options). The `: "${NAME:=value}"` form keeps a
# non-empty value that is already set (environment or sourced config) and
# otherwise assigns the default.

# --- run mode and Flux source expectations ---
EXECUTE=0
: "${SHUTDOWN_MODE:=host-poweroff}"
EXPECTED_FLUX_BRANCH="${DEFAULT_FLUX_BRANCH:-main}"
: "${EXPECTED_FLUX_URL:=ssh://git@scm.bstein.dev:2242/bstein/titan-iac.git}"
ALLOW_FLUX_SOURCE_MUTATION=0
FORCE_FLUX_URL=""
FORCE_FLUX_BRANCH=""

# --- stage skip switches ---
SKIP_ETCD_SNAPSHOT=0
SKIP_DRAIN=0
SKIP_LOCAL_BOOTSTRAP=0
SKIP_HARBOR_BOOTSTRAP=0
SKIP_HARBOR_SEED=0
SKIP_HELPER_PREWARM=0

# --- UPS battery gating ---
: "${UPS_HOST:=ups@localhost}"
: "${UPS_BATTERY_KEY:=battery.charge}"
: "${MIN_STARTUP_BATTERY:=35}"
: "${REQUIRE_UPS_BATTERY:=0}"

# --- timeouts and polling ---
DRAIN_TIMEOUT_SECONDS=180
EMERGENCY_DRAIN_TIMEOUT_SECONDS=45
API_WAIT_TIMEOUT_SECONDS=600
: "${FLUX_READY_TIMEOUT_SECONDS:=1200}"
: "${FLUX_READY_POLL_SECONDS:=10}"
: "${STARTUP_CHECKLIST_TIMEOUT_SECONDS:=900}"
: "${STARTUP_CHECKLIST_POLL_SECONDS:=10}"
: "${STARTUP_WORKLOAD_TIMEOUT_SECONDS:=900}"
: "${STARTUP_WORKLOAD_POLL_SECONDS:=10}"
: "${STARTUP_STABILITY_WINDOW_SECONDS:=180}"
: "${STARTUP_STABILITY_TIMEOUT_SECONDS:=900}"
: "${STARTUP_STABILITY_POLL_SECONDS:=10}"

# --- startup readiness filters ---
: "${STARTUP_IGNORE_PODS_REGEX:=}"
: "${STARTUP_IGNORE_WORKLOADS_REGEX:=}"
: "${STARTUP_WORKLOAD_NAMESPACE_EXCLUDES_REGEX:=^(kube-system|kube-public|kube-node-lease|flux-system)$}"
: "${STARTUP_OPTIONAL_KUSTOMIZATIONS:=}"

# --- Flux kustomization groups used during recovery ---
: "${RECOVERY_FLUX_OPTIONAL_KUSTOMIZATIONS:=ai-llm,descheduler,finance,game-stream,gitops-ui,health,jellyfin,jenkins,longhorn-ui,mailu,nextcloud,nextcloud-mail-sync,outline,planka,quality,resource-guardrails,typhon,vaultwarden,veles,wallet-monero-temp,xmr-miner}"
: "${RECOVERY_FLUX_CRITICAL_KUSTOMIZATIONS:=flux-system,core,helm,cert-manager,longhorn-adopt,longhorn,metallb,traefik,vault-csi,vault-injector,vault,postgres,harbor,gitea,keycloak,oauth2-proxy,openldap,openclaw,monitoring,bstein-dev-home,bstein-dev-home-migrations,comms,crypto,logging,maintenance,monerod,sui-metrics}"
# Spelled out in full because STATE_ROOT is only assigned further down.
: "${RECOVERY_FLUX_OPTIONAL_SNAPSHOT_FILE:=${HOME}/${STATE_SUBDIR:-.local/share/ananke}/longhorn_unlock_optional_flux.tsv}"
: "${RECOVERY_FLUX_RESTART_KUSTOMIZE_CONTROLLER:=1}"

# --- external service and ingress checks ---
: "${STARTUP_SERVICE_CHECK_TIMEOUT_SECONDS:=10}"
: "${STARTUP_SERVICE_CHECKLIST:=}"
: "${STARTUP_INCLUDE_INGRESS_CHECKS:=1}"
: "${STARTUP_INGRESS_ALLOWED_STATUSES:=200,301,302,307,308,401,403,404}"
: "${STARTUP_IGNORE_INGRESS_HOSTS_REGEX:=}"
: "${STARTUP_INGRESS_CHECK_TIMEOUT_SECONDS:=10}"
: "${SHUTDOWN_NAMESPACE_EXCLUDES_REGEX:=^(kube-system|kube-public|kube-node-lease|flux-system|traefik|metallb-system|cert-manager|longhorn-system|vault|postgres|maintenance)$}"
: "${REQUIRE_NONEMPTY_REPLICA_SNAPSHOT:=1}"

# --- mail safeguards ---
: "${STARTUP_REQUIRE_MAIL_SAFEGUARDS:=1}"
: "${MAIL_STARTUP_NAMESPACE:=mailu-mailserver}"
: "${MAIL_STARTUP_ENDPOINT_SERVICES:=mailu-front,mailu-postfix,mailu-dovecot}"
: "${MAIL_STARTUP_HOST:=mail.bstein.dev}"
: "${MAIL_STARTUP_TCP_PORTS:=25,465,587,993,995}"
: "${MAIL_STARTUP_TCP_TIMEOUT_SECONDS:=3}"

# --- state files and bootstrap bundles ---
: "${BUNDLE_HTTP_PORT:=8877}"
STATE_ROOT="${HOME}/${STATE_SUBDIR:-.local/share/ananke}"
RECOVERY_STATE_FILE="${STATE_ROOT}/cluster_power_recovery.state"
REPLICA_SNAPSHOT_FILE="${STATE_ROOT}/desired_workload_replicas.tsv"
HARBOR_BUNDLE_FILE="${STATE_ROOT}/bundles/${HARBOR_BUNDLE_BASENAME:-harbor-bootstrap-v2.14.1-arm64.tar.zst}"
: "${BOOTSTRAP_IMAGES_FILE:=${BOOTSTRAP_DIR}/harbor-bootstrap-images.txt}"
: "${LONGHORN_UNLOCK_IMAGES_FILE:=${BOOTSTRAP_DIR}/longhorn-unlock-images.txt}"
: "${LONGHORN_UNLOCK_BUNDLE_FILE:=${STATE_ROOT}/bundles/longhorn-unlock-v1.8.2-${BOOTSTRAP_BUNDLE_ARCH:-arm64}.tar.zst}"
: "${LONGHORN_MANAGER_IMAGE:=registry.bstein.dev/infra/longhorn-manager:v1.8.2}"
: "${LONGHORN_MANAGER_CACHE_BUNDLE_FILE:=${STATE_ROOT}/bundles/longhorn-manager-v1.8.2-${BOOTSTRAP_BUNDLE_ARCH:-arm64}.tar}"
: "${LONGHORN_UNLOCK_SSH_KNOWN_HOSTS:=/tmp/ananke_longhorn_unlock_known_hosts}"
: "${BOOTSTRAP_BUNDLE_ARCH:=arm64}"

# --- node recovery policy ---
: "${RECOVERY_UNCORDON_DENYLIST:=titan-18,titan-22,titan-24}"
: "${STALE_TERMINATING_POD_SECONDS:=300}"
: "${RECOVERY_NODE_RUNTIME_RESTART_ENABLED:=1}"
# Defaults to the uncordon denylist, so it must come after that assignment.
: "${RECOVERY_NODE_RUNTIME_RESTART_DENYLIST:=${RECOVERY_UNCORDON_DENYLIST}}"
: "${RECOVERY_NODE_RUNTIME_RESTART_MAX_NODES:=3}"
: "${RECOVERY_NODE_RUNTIME_RESTART_WAIT_SECONDS:=300}"

# --- Harbor bootstrap and node helper ---
: "${HARBOR_TARGET_NODE:=}"
: "${HARBOR_CANARY_NODE:=}"
: "${HARBOR_HOST_LABEL_KEY:=ananke.bstein.dev/harbor-bootstrap}"
: "${HARBOR_CANARY_IMAGE:=registry.bstein.dev/bstein/kubectl:1.35.0}"
: "${NODE_HELPER_IMAGE:=registry.bstein.dev/bstein/ananke-node-helper:0.1.0}"
: "${NODE_HELPER_NAMESPACE:=maintenance}"
: "${NODE_HELPER_SERVICE_ACCOUNT:=default}"
: "${NODE_HELPER_PREWARM_DS:=ananke-node-helper-prewarm}"
: "${REGISTRY_PULL_SECRET:=harbor-regcred}"
: "${REFRESH_BOOTSTRAP_IMAGE_ALIASES:=0}"
: "${SKIP_LONGHORN_UNLOCK_BUNDLE_SEED:=0}"
: "${LONGHORN_UNLOCK_REPLICA_SNAPSHOT_FILE:=${STATE_ROOT}/longhorn_unlock_optional_replicas.tsv}"

# --- runtime bookkeeping (always reset at startup) ---
KEEP_PREWARM_DAEMONSET=0
BOOTSTRAP_IMAGES_SEEDED=0
RECOVERY_PENDING=0
STARTUP_ATTEMPTED_DURING_OUTAGE=0
LAST_CHECKPOINT="none"
BUNDLE_SERVER_PID=""
UPS_HOST_IN_USE=""
2026-04-06 00:22:54 -03:00
# Parse command-line options into the global settings variables.
#
# Each option is mapped to the variable it controls. Switches set that
# variable to 1; value options store the following argument.
# Arguments: the option words (normally "$@" after the mode was shifted off).
# Globals:   writes the settings variable associated with each option given.
# Exits:     0 after printing usage for -h/--help; 1 (message on stderr) for
#            an unknown option, a missing or empty value, or a non-integer
#            value for an option that is used in shell arithmetic.
parse_cli_options() {
  local opt target what numeric
  while (($# > 0)); do
    opt="$1"
    target=""
    what=""     # non-empty marks an option that consumes a value
    numeric=0   # 1 marks a value that must be a non-negative integer
    case "${opt}" in
      # Switches.
      --execute) target=EXECUTE ;;
      --allow-flux-source-mutation) target=ALLOW_FLUX_SOURCE_MUTATION ;;
      --skip-etcd-snapshot) target=SKIP_ETCD_SNAPSHOT ;;
      --skip-drain) target=SKIP_DRAIN ;;
      --skip-local-bootstrap) target=SKIP_LOCAL_BOOTSTRAP ;;
      --skip-harbor-bootstrap) target=SKIP_HARBOR_BOOTSTRAP ;;
      --skip-harbor-seed) target=SKIP_HARBOR_SEED ;;
      --skip-helper-prewarm) target=SKIP_HELPER_PREWARM ;;
      --refresh-bootstrap-image-aliases) target=REFRESH_BOOTSTRAP_IMAGE_ALIASES ;;
      --require-ups-battery) target=REQUIRE_UPS_BATTERY ;;
      --skip-longhorn-unlock-bundle-seed) target=SKIP_LONGHORN_UNLOCK_BUNDLE_SEED ;;

      # Options taking a free-form value.
      --shutdown-mode) target=SHUTDOWN_MODE what="shutdown mode" ;;
      --expected-flux-branch) target=EXPECTED_FLUX_BRANCH what="branch" ;;
      --expected-flux-url) target=EXPECTED_FLUX_URL what="flux url" ;;
      --force-flux-url) target=FORCE_FLUX_URL what="flux url" ;;
      --force-flux-branch) target=FORCE_FLUX_BRANCH what="branch" ;;
      --ups-host) target=UPS_HOST what="ups host" ;;
      --ups-battery-key) target=UPS_BATTERY_KEY what="ups key" ;;
      --recovery-state-file) target=RECOVERY_STATE_FILE what="state file path" ;;
      --replica-snapshot-file) target=REPLICA_SNAPSHOT_FILE what="replica snapshot file path" ;;
      --harbor-bundle-file) target=HARBOR_BUNDLE_FILE what="bundle file path" ;;
      --longhorn-unlock-bundle-file) target=LONGHORN_UNLOCK_BUNDLE_FILE what="Longhorn unlock bundle file path" ;;
      --bootstrap-images-file) target=BOOTSTRAP_IMAGES_FILE what="bootstrap image list path" ;;
      --longhorn-unlock-images-file) target=LONGHORN_UNLOCK_IMAGES_FILE what="Longhorn unlock image list path" ;;
      --longhorn-manager-cache-bundle-file) target=LONGHORN_MANAGER_CACHE_BUNDLE_FILE what="Longhorn manager cache bundle file path" ;;
      --bootstrap-bundle-arch) target=BOOTSTRAP_BUNDLE_ARCH what="bootstrap bundle architecture" ;;
      --harbor-target-node) target=HARBOR_TARGET_NODE what="harbor target node" ;;
      --harbor-canary-node) target=HARBOR_CANARY_NODE what="harbor canary node" ;;
      --harbor-host-label-key) target=HARBOR_HOST_LABEL_KEY what="harbor host label key" ;;
      --harbor-canary-image) target=HARBOR_CANARY_IMAGE what="canary image" ;;
      --node-helper-image) target=NODE_HELPER_IMAGE what="node helper image" ;;
      --bundle-http-port) target=BUNDLE_HTTP_PORT what="bundle http port" ;;
      --api-wait-timeout) target=API_WAIT_TIMEOUT_SECONDS what="api wait timeout" ;;
      --startup-checklist-timeout) target=STARTUP_CHECKLIST_TIMEOUT_SECONDS what="startup checklist timeout" ;;
      --startup-workload-timeout) target=STARTUP_WORKLOAD_TIMEOUT_SECONDS what="startup workload timeout" ;;
      --startup-stability-window) target=STARTUP_STABILITY_WINDOW_SECONDS what="startup stability window" ;;
      --startup-stability-timeout) target=STARTUP_STABILITY_TIMEOUT_SECONDS what="startup stability timeout" ;;
      --drain-timeout) target=DRAIN_TIMEOUT_SECONDS what="drain timeout" ;;
      --emergency-drain-timeout) target=EMERGENCY_DRAIN_TIMEOUT_SECONDS what="emergency drain timeout" ;;

      # Values that are compared with (( ... )) later in the script. They are
      # checked here so a typo fails before any cluster action is taken.
      # NOTE(review): the other *-timeout values are probably integers too,
      # but their use is not visible from here, so they are left unchecked.
      --min-startup-battery) target=MIN_STARTUP_BATTERY what="battery threshold" numeric=1 ;;
      --flux-ready-timeout) target=FLUX_READY_TIMEOUT_SECONDS what="flux ready timeout" numeric=1 ;;

      -h | --help)
        usage
        exit 0
        ;;
      *)
        echo "Unknown option: ${opt}" >&2
        usage
        exit 1
        ;;
    esac

    if [[ -z "${what}" ]]; then
      printf -v "${target}" '%s' 1
      shift
      continue
    fi

    # An absent or empty value is an error.
    if (($# < 2)) || [[ -z "$2" ]]; then
      echo "${opt}: missing ${what}" >&2
      exit 1
    fi
    if ((numeric)); then
      if [[ ! "$2" =~ ^[0-9]+$ ]]; then
        echo "${opt}: ${what} must be a non-negative integer, got '$2'" >&2
        exit 1
      fi
      # Force base 10 so that "08" is stored as 8 rather than tripping the
      # octal parser in later arithmetic.
      printf -v "${target}" '%s' "$((10#$2))"
    else
      printf -v "${target}" '%s' "$2"
    fi
    shift 2
  done
}

parse_cli_options "$@"
# Every option has been consumed; leave no positional parameters behind.
set --
2026-04-07 12:30:28 -03:00
# Cross-option validation that can only run once every option has been read.

# Only the two documented shutdown behaviors are accepted.
if [[ "${SHUTDOWN_MODE}" != "host-poweroff" && "${SHUTDOWN_MODE}" != "cluster-only" ]]; then
  echo "Invalid --shutdown-mode '${SHUTDOWN_MODE}'. Expected host-poweroff or cluster-only." >&2
  exit 1
fi

# Rewriting the Flux source URL needs the explicit breakglass flag as well.
if [[ -n "${FORCE_FLUX_URL}" ]] && [[ "${ALLOW_FLUX_SOURCE_MUTATION}" -ne 1 ]]; then
  echo "--force-flux-url requires --allow-flux-source-mutation (breakglass)." >&2
  exit 1
fi
2026-04-06 00:22:54 -03:00
# Abort the script unless the named command can be resolved by the shell.
# Arguments: $1 - command name to look up
# Exits:     1 with a message on stderr when the command is not found
require_cmd() {
  local tool="$1"
  command -v "${tool}" >/dev/null 2>&1 && return 0
  echo "Missing required command: ${tool}" >&2
  exit 1
}
# Tools that must be present before anything else runs; the first missing one
# aborts the script.
for required_tool in kubectl bash base64 curl; do
  require_cmd "${required_tool}"
done
unset required_tool
2026-04-06 00:22:54 -03:00
# Print an informational message to stdout with the script's tag.
log() {
  printf '%s\n' "[cluster-power] $*"
}

# Print a warning to stderr.
warn() {
  printf '%s\n' "[cluster-power][warn] $*" >&2
}

# Print an error to stderr and terminate the script with status 1.
die() {
  printf '%s\n' "[cluster-power][error] $*" >&2
  exit 1
}
2026-04-06 00:22:54 -03:00
# Run a command, or only announce it when the script is in dry-run mode.
# Globals:   EXECUTE (read) - 1 runs the command, anything else logs only
# Arguments: the command and its arguments, passed through unchanged
# Returns:   the command's exit status when executed
run() {
  if [[ "${EXECUTE}" -ne 1 ]]; then
    log "DRY-RUN: $*"
    return
  fi
  log "EXEC: $*"
  "$@"
}
# Run a shell snippet through `bash -lc`, or only announce it in dry-run mode.
# Globals:   EXECUTE (read) - 1 runs the snippet, anything else logs only
# Arguments: words that are joined with spaces into one command string
# Returns:   the snippet's exit status when executed
run_shell() {
  local snippet="$*"
  if [[ "${EXECUTE}" -ne 1 ]]; then
    log "DRY-RUN: ${snippet}"
    return
  fi
  log "EXEC: ${snippet}"
  bash -lc "${snippet}"
}
2026-04-06 04:47:05 -03:00
# Render a kustomization from the repository and apply it to the cluster, or
# only announce the pipeline when the script is in dry-run mode.
# Globals:   REPO_DIR (read), EXECUTE (read)
# Arguments: $1 - kustomization directory, relative to REPO_DIR
apply_kustomization() {
  local rel_path="$1"
  local target="${REPO_DIR}/${rel_path}"
  local description="kubectl kustomize ${target} --load-restrictor=LoadRestrictionsNone | kubectl apply -f -"

  if [[ "${EXECUTE}" -ne 1 ]]; then
    log "DRY-RUN: ${description}"
    return
  fi
  log "EXEC: ${description}"
  kubectl kustomize "${target}" --load-restrictor=LoadRestrictionsNone | kubectl apply -f -
}
2026-04-06 04:47:05 -03:00
# Lower-case a string and turn every run of characters outside [a-z0-9-] into
# a single '-'.
# Arguments: $1 - raw name
# Outputs:   the sanitized name on stdout (no trailing newline)
sanitize_name() {
  local raw_name="$1"
  printf '%s' "${raw_name}" \
    | tr '[:upper:]' '[:lower:]' \
    | tr -cs 'a-z0-9-' '-'
}
2026-04-06 04:47:05 -03:00
# Print the directory that holds the recovery state file.
# Globals: RECOVERY_STATE_FILE (read)
state_dir() {
  local state_file="${RECOVERY_STATE_FILE}"
  dirname "${state_file}"
}
2026-04-06 00:22:54 -03:00
2026-04-06 04:47:05 -03:00
# Load the persisted recovery state into the shell.
#
# The three state variables are first reset to their defaults, so a missing
# file (or a missing/invalid entry) leaves the default in place.
# Globals:
#   RECOVERY_STATE_FILE (read)             - file of key=value lines
#   RECOVERY_PENDING (written)             - from "recovery_pending"
#   STARTUP_ATTEMPTED_DURING_OUTAGE (written) - from "startup_attempted"
#   LAST_CHECKPOINT (written)              - from "last_checkpoint"
# Returns: 0 always
load_recovery_state() {
  local key value
  RECOVERY_PENDING=0
  STARTUP_ATTEMPTED_DURING_OUTAGE=0
  LAST_CHECKPOINT="none"
  [[ -f "${RECOVERY_STATE_FILE}" ]] || return 0

  # `|| [[ -n "${key}" ]]` keeps a final line that lacks its trailing newline,
  # which is what an interrupted write can leave behind.
  while IFS='=' read -r key value || [[ -n "${key}" ]]; do
    case "${key}" in
      recovery_pending)
        # Only accept whole numbers; an empty or garbled flag keeps the default.
        if [[ "${value}" =~ ^[0-9]+$ ]]; then
          RECOVERY_PENDING="${value}"
        fi
        ;;
      startup_attempted)
        if [[ "${value}" =~ ^[0-9]+$ ]]; then
          STARTUP_ATTEMPTED_DURING_OUTAGE="${value}"
        fi
        ;;
      last_checkpoint)
        if [[ -n "${value}" ]]; then
          LAST_CHECKPOINT="${value}"
        fi
        ;;
    esac
  done <"${RECOVERY_STATE_FILE}"
  return 0
}
# Persist the recovery state as key=value lines.
#
# The file is written to a temporary sibling and then renamed over the real
# path, so an interruption mid-write leaves the previous file intact rather
# than a truncated one. Does nothing in dry-run mode.
# Globals:   EXECUTE (read), RECOVERY_STATE_FILE (read)
# Arguments: $1 - recovery_pending value
#            $2 - startup_attempted value
#            $3 - last_checkpoint value
# Returns:   0 on success or in dry-run mode; non-zero if the write or the
#            rename failed
save_recovery_state() {
  [[ "${EXECUTE}" -eq 1 ]] || return 0
  local staging="${RECOVERY_STATE_FILE}.tmp.$$"
  mkdir -p "$(state_dir)"
  if ! cat >"${staging}" <<STATE
recovery_pending=${1}
startup_attempted=${2}
last_checkpoint=${3}
STATE
  then
    rm -f "${staging}"
    return 1
  fi
  if ! mv -f "${staging}" "${RECOVERY_STATE_FILE}"; then
    rm -f "${staging}"
    return 1
  fi
}
# Record the named checkpoint and persist it together with the current
# recovery flags.
# Globals:   LAST_CHECKPOINT (written), RECOVERY_PENDING (read),
#            STARTUP_ATTEMPTED_DURING_OUTAGE (read)
# Arguments: $1 - checkpoint name
mark_checkpoint() {
  local checkpoint="$1"
  LAST_CHECKPOINT="${checkpoint}"
  save_recovery_state \
    "${RECOVERY_PENDING}" \
    "${STARTUP_ATTEMPTED_DURING_OUTAGE}" \
    "${LAST_CHECKPOINT}"
}
# Remove the persisted recovery state and reset the in-memory checkpoint.
# Does nothing in dry-run mode; a failing delete is ignored.
# Globals: EXECUTE (read), RECOVERY_STATE_FILE (read), LAST_CHECKPOINT (written)
clear_recovery_state() {
  if [[ "${EXECUTE}" -ne 1 ]]; then
    return 0
  fi
  rm -f "${RECOVERY_STATE_FILE}" 2>/dev/null || true
  LAST_CHECKPOINT="none"
}
# Reduce a raw UPS reading to a whole-number percentage.
#
# Everything up to the last ':' is dropped, then all whitespace, then any
# fractional part. The result is printed in plain base 10 without leading
# zeros, so it is safe to use in (( ... )) where "08" would otherwise be
# rejected as an invalid octal number.
# Arguments: $1 - raw reading, e.g. "battery.charge: 87.5"
# Outputs:   the integer percentage on stdout (no trailing newline)
# Returns:   0 on success, 1 when no integer can be extracted
sanitize_battery_percent() {
  local raw="$1"
  raw="${raw##*:}"
  raw="${raw//[[:space:]]/}"
  raw="${raw%%.*}"
  [[ "${raw}" =~ ^[0-9]+$ ]] || return 1
  printf '%s' "$((10#${raw}))"
}
# List the UPS identifiers worth querying, one per line, without duplicates.
#
# The configured UPS_HOST comes first (when non-empty). After that, every name
# reported by `upsc -l` is offered both as "<name>@localhost" and as "<name>".
# A failing or missing listing simply contributes nothing.
# Globals: UPS_HOST (read)
candidate_ups_hosts() {
  local ups_name form
  local -A emitted=()

  if [[ -n "${UPS_HOST}" ]]; then
    emitted["${UPS_HOST}"]=1
    echo "${UPS_HOST}"
  fi

  while IFS= read -r ups_name; do
    if [[ -z "${ups_name}" ]]; then
      continue
    fi
    for form in "${ups_name}@localhost" "${ups_name}"; do
      if [[ -z "${emitted[${form}]+x}" ]]; then
        emitted["${form}"]=1
        echo "${form}"
      fi
    done
  done < <(upsc -l 2>/dev/null || true)
}
# Print the battery percentage of the first UPS candidate that yields a usable
# reading.
# Globals:   UPS_BATTERY_KEY (read)
#            UPS_HOST_IN_USE (written) - the candidate that answered
# Outputs:   the integer percentage on stdout (no trailing newline)
# Returns:   0 when a reading was found; 1 when `upsc` is unavailable or no
#            candidate produced a parsable value
read_ups_battery() {
  command -v upsc >/dev/null 2>&1 || return 1

  local candidate reading percent
  while IFS= read -r candidate; do
    reading="$(upsc "${candidate}" "${UPS_BATTERY_KEY}" 2>/dev/null || true)"
    if [[ -z "${reading}" ]]; then
      continue
    fi
    percent="$(sanitize_battery_percent "${reading}" || true)"
    if [[ -n "${percent}" ]]; then
      UPS_HOST_IN_USE="${candidate}"
      printf '%s' "${percent}"
      return 0
    fi
  done < <(candidate_ups_hosts)
  return 1
}
# Gate bootstrap on the UPS battery level.
#
# Globals:
#   MIN_STARTUP_BATTERY (read)  - minimum acceptable percentage
#   REQUIRE_UPS_BATTERY (read)  - 1 makes an unreadable battery a failure
#   UPS_HOST (read)             - shown in the log when no host was recorded
#   UPS_HOST_IN_USE (written by read_ups_battery)
# Returns: 0 when bootstrap may proceed; 1 when the battery is below the
#          threshold, when it cannot be read and REQUIRE_UPS_BATTERY is 1, or
#          when the threshold itself is not a whole number
ensure_minimum_battery_for_bootstrap() {
  local battery="" reading_file

  # read_ups_battery records the answering host in UPS_HOST_IN_USE. Calling it
  # inside $(...) would lose that assignment in the subshell, so its output is
  # captured through a temporary file and the call stays in this shell.
  if reading_file="$(mktemp)"; then
    read_ups_battery >"${reading_file}" || true
    battery="$(<"${reading_file}")"
    rm -f "${reading_file}"
  else
    # No temporary file available: fall back to a subshell capture (the host
    # shown in the log may then be the configured one).
    battery="$(read_ups_battery || true)"
  fi

  if [[ -z "${battery}" ]]; then
    if [[ "${REQUIRE_UPS_BATTERY}" -eq 1 ]]; then
      warn "Unable to read UPS battery status and --require-ups-battery is set."
      return 1
    fi
    warn "Unable to read UPS battery status; continuing without hard battery gating."
    return 0
  fi

  log "ups-battery=${battery}% host=${UPS_HOST_IN_USE:-${UPS_HOST}}"

  # A malformed operand makes (( ... )) fail with an error, which an `if`
  # would treat as "not below the threshold" and let startup through. Refuse
  # instead of guessing.
  if [[ ! "${MIN_STARTUP_BATTERY}" =~ ^[0-9]+$ || ! "${battery}" =~ ^[0-9]+$ ]]; then
    warn "Cannot compare UPS battery '${battery}' with minimum startup threshold '${MIN_STARTUP_BATTERY}': both must be whole numbers."
    return 1
  fi

  # 10# forces base 10 so values such as "08" are not parsed as octal.
  if ((10#${battery} < 10#${MIN_STARTUP_BATTERY})); then
    warn "UPS battery ${battery}% below minimum startup threshold ${MIN_STARTUP_BATTERY}%."
    return 1
  fi
  return 0
}
# Log the URL and branch of the flux-system GitRepository. A value that
# cannot be read (or is empty) is silently left out.
report_flux_source_state() {
  local source_url source_branch
  source_url="$(kubectl -n flux-system get gitrepository flux-system \
    -o jsonpath='{.spec.url}' 2>/dev/null || true)"
  source_branch="$(kubectl -n flux-system get gitrepository flux-system \
    -o jsonpath='{.spec.ref.branch}' 2>/dev/null || true)"

  if [[ -n "${source_url}" ]]; then
    log "flux-source-url=${source_url}"
  fi
  if [[ -n "${source_branch}" ]]; then
    log "flux-source-branch=${source_branch}"
  fi
}
# Test whether a comma-separated list contains an exact entry.
# Arguments: $1 - comma-separated list
#            $2 - entry to look for (whole-entry match, no trimming)
# Returns:   0 when present, 1 otherwise
csv_has_value() {
  local csv="$1"
  local value="$2"
  # Wrapping both sides in commas turns "whole entry" into a substring test.
  case ",${csv}," in
    *",${value},"*) return 0 ;;
  esac
  return 1
}
# Refuse to continue when the flux-system GitRepository points somewhere
# unexpected. Skipped entirely in dry-run mode.
# Globals: EXECUTE, EXPECTED_FLUX_URL, EXPECTED_FLUX_BRANCH, FORCE_FLUX_BRANCH
#          (all read)
# Exits (via die): when URL or branch cannot be read, when the URL differs
#   from a non-empty EXPECTED_FLUX_URL, or when the branch differs from
#   EXPECTED_FLUX_BRANCH and no --force-flux-branch override was given.
assert_flux_source_expected() {
  if [[ "${EXECUTE}" -eq 0 ]]; then
    log "DRY-RUN: skipping strict Flux source drift guard"
    return 0
  fi

  local actual_url actual_branch
  actual_url="$(kubectl -n flux-system get gitrepository flux-system \
    -o jsonpath='{.spec.url}' 2>/dev/null || true)"
  actual_branch="$(kubectl -n flux-system get gitrepository flux-system \
    -o jsonpath='{.spec.ref.branch}' 2>/dev/null || true)"

  if [[ -z "${actual_url}" ]]; then
    die "Unable to read Flux source URL from flux-system/gitrepository."
  fi
  if [[ -z "${actual_branch}" ]]; then
    die "Unable to read Flux source branch from flux-system/gitrepository."
  fi

  if [[ -n "${EXPECTED_FLUX_URL}" ]] && [[ "${actual_url}" != "${EXPECTED_FLUX_URL}" ]]; then
    die "Flux source URL drift detected: got '${actual_url}', expected '${EXPECTED_FLUX_URL}'. Refusing startup."
  fi
  # A pending --force-flux-branch will correct the branch, so drift is only
  # fatal when no override was requested.
  if [[ -z "${FORCE_FLUX_BRANCH}" ]] && [[ "${actual_branch}" != "${EXPECTED_FLUX_BRANCH}" ]]; then
    die "Flux source branch drift detected: got '${actual_branch}', expected '${EXPECTED_FLUX_BRANCH}'. Use --force-flux-branch to correct."
  fi
}
# Test whether a kustomization is listed in STARTUP_OPTIONAL_KUSTOMIZATIONS.
# Arguments: $1 - kustomization name
# Returns:   0 when listed; 1 when not listed or when the list is empty
kustomization_is_optional() {
  local name="$1"
  if [[ -z "${STARTUP_OPTIONAL_KUSTOMIZATIONS}" ]]; then
    return 1
  fi
  csv_has_value "${STARTUP_OPTIONAL_KUSTOMIZATIONS}" "${name}"
}
# Print every non-optional Flux kustomization whose Ready condition is not
# "True", one per line as "<name>|<message>".
#
# The rows come from kubectl as whitespace-separated columns (name, Ready
# status, Ready message). `read` splits off the first two columns and leaves
# the rest, with the column padding removed, as the message; stripping the
# name and status as literal prefixes does not work because of the padding
# between the columns.
# Returns: 0 always (an unreachable API or an empty listing prints nothing)
list_not_ready_kustomizations() {
  local rows name ready message
  rows="$(kubectl -n flux-system get kustomizations.kustomize.toolkit.fluxcd.io \
    -o 'custom-columns=NAME:.metadata.name,READY:.status.conditions[?(@.type=="Ready")].status,MESSAGE:.status.conditions[?(@.type=="Ready")].message' \
    --no-headers 2>/dev/null || true)"
  [[ -n "${rows}" ]] || return 0

  while read -r name ready message; do
    [[ -n "${name}" ]] || continue
    if kustomization_is_optional "${name}"; then
      continue
    fi
    # Anything other than an explicit "True" (including "<none>" when the
    # condition is absent) counts as not ready.
    if [[ "${ready}" != "True" ]]; then
      printf '%s|%s\n' "${name}" "${message}"
    fi
  done <<<"${rows}"
  return 0
}
# Ask Flux to reconcile everything now: stamp all kustomizations with a fresh
# reconcile.fluxcd.io/requestedAt annotation and, when the flux CLI is
# installed, also reconcile the flux-system git source. Both steps go through
# `run`, so they are only announced in dry-run mode.
trigger_flux_reconcile_all() {
  local requested_at
  requested_at="$(date --iso-8601=seconds)"

  run kubectl -n flux-system annotate kustomizations.kustomize.toolkit.fluxcd.io --all \
    "reconcile.fluxcd.io/requestedAt=${requested_at}" --overwrite

  if ! command -v flux >/dev/null 2>&1; then
    return 0
  fi
  run flux reconcile source git flux-system -n flux-system --timeout=3m
}
# Delete failed Jobs that carry a Flux kustomize or helm owner label so that
# Flux can recreate them (a Job's template is immutable once created).
# Deletions go through `run`, so they are only announced in dry-run mode.
# Returns: 0 when at least one Job was selected for deletion; 1 when the Job
#          listing was empty/unavailable or nothing qualified
heal_failed_flux_jobs() {
  local rows ns name failed flux_owner helm_owner healed
  healed=0
  rows="$(kubectl get jobs.batch -A \
    -o custom-columns=NS:.metadata.namespace,NAME:.metadata.name,FAILED:.status.failed,FLUX_OWNER:.metadata.labels.kustomize\\.toolkit\\.fluxcd\\.io/name,HELM_OWNER:.metadata.labels.helm\\.toolkit\\.fluxcd\\.io/name \
    --no-headers 2>/dev/null || true)"
  [[ -n "${rows}" ]] || return 1

  # One `read` splits the five columns; `_` soaks up anything beyond them.
  while read -r ns name failed flux_owner helm_owner _; do
    # Keep only rows whose failure count is a positive integer ("<none>" and
    # blank rows fall out here).
    if [[ ! "${failed}" =~ ^[0-9]+$ ]]; then
      continue
    fi
    if ((failed <= 0)); then
      continue
    fi
    # Jobs without either owner label are not managed by Flux; leave them.
    if [[ "${flux_owner}" == "<none>" && "${helm_owner}" == "<none>" ]]; then
      continue
    fi
    warn "Deleting failed Flux-managed Job ${ns}/${name} to heal immutable-template drift."
    run kubectl -n "${ns}" delete job "${name}" --ignore-not-found
    healed=1
  done <<<"${rows}"

  ((healed == 1))
}
# Poll until every non-optional Flux kustomization is Ready.
#
# While waiting, a status message that looks like an immutable-Job failure
# triggers an automated cleanup (failed Flux-managed Jobs are deleted and a
# reconcile is requested); this is attempted on at most three polls.
# Skipped entirely in dry-run mode.
# Globals: EXECUTE, FLUX_READY_TIMEOUT_SECONDS, FLUX_READY_POLL_SECONDS (read)
# Returns: 0 once nothing is reported as not ready
# Exits (via die): when FLUX_READY_TIMEOUT_SECONDS elapses first
wait_for_flux_kustomizations_ready() {
  if [[ "${EXECUTE}" -eq 0 ]]; then
    log "DRY-RUN: skipping wait for all Flux kustomizations Ready"
    return 0
  fi

  # `line` is declared here so the logging loop below does not overwrite a
  # variable of the same name in the caller's scope.
  local start now not_ready immutable_hits line
  start="$(date +%s)"
  immutable_hits=0

  while true; do
    not_ready="$(list_not_ready_kustomizations || true)"
    if [[ -z "${not_ready}" ]]; then
      log "flux-kustomizations=all-ready"
      return 0
    fi

    log "flux-kustomizations-not-ready:"
    while IFS= read -r line; do
      [[ -n "${line}" ]] || continue
      log "  ${line}"
    done <<<"${not_ready}"

    if grep -Eqi 'immutable|field is immutable|cannot patch.*Job|Job.*invalid' <<<"${not_ready}"; then
      if ((immutable_hits < 3)); then
        immutable_hits=$((immutable_hits + 1))
        warn "Detected immutable Job failure signal in Flux status. Attempting automated Job cleanup (${immutable_hits}/3)."
        # Only ask for a reconcile when something was actually deleted.
        if heal_failed_flux_jobs; then
          trigger_flux_reconcile_all
        fi
      fi
    fi

    now="$(date +%s)"
    if ((now - start >= FLUX_READY_TIMEOUT_SECONDS)); then
      die "Timed out waiting for Flux kustomizations Ready after ${FLUX_READY_TIMEOUT_SECONDS}s."
    fi
    sleep "${FLUX_READY_POLL_SECONDS}"
  done
}
# Print the built-in service checklist, one '|'-separated row per service.
# Field order, as consumed by the checklist runner in this file:
#   name|url|expected status|body must contain|body must not contain|insecure|timeout
# Trailing fields may be left empty.
default_startup_service_checklist() {
  printf '%s\n' \
    'gitea|https://scm.bstein.dev/api/healthz|200|"status":"pass"||' \
    'grafana|https://metrics.bstein.dev/api/health|200|"database":"ok"||' \
    'harbor|https://registry.bstein.dev/v2/|401|unauthorized|<html|'
}
# Print the distinct host names used by Ingress rules in all namespaces,
# sorted, one per line. Blank entries (rules without a host) are dropped.
list_ingress_hosts() {
  local host_template='{range .items[*]}{range .spec.rules[*]}{.host}{"\n"}{end}{end}'
  kubectl get ingress -A -o jsonpath="${host_template}" 2>/dev/null \
    | sed '/^[[:space:]]*$/d' \
    | sort -u
}
# Emit one checklist row per Ingress host, in the same '|'-separated format
# as the static checklist. Hosts matching STARTUP_IGNORE_INGRESS_HOSTS_REGEX
# (when that is non-empty) are left out.
# Globals: STARTUP_IGNORE_INGRESS_HOSTS_REGEX, STARTUP_INGRESS_ALLOWED_STATUSES,
#          STARTUP_INGRESS_CHECK_TIMEOUT_SECONDS (all read)
generated_ingress_service_checks() {
  local ingress_host
  while IFS= read -r ingress_host; do
    if [[ -z "${ingress_host}" ]]; then
      continue
    fi
    if [[ -n "${STARTUP_IGNORE_INGRESS_HOSTS_REGEX}" && "${ingress_host}" =~ ${STARTUP_IGNORE_INGRESS_HOSTS_REGEX} ]]; then
      continue
    fi
    printf 'ingress-%s|https://%s/|%s|||0|%s\n' \
      "${ingress_host}" \
      "${ingress_host}" \
      "${STARTUP_INGRESS_ALLOWED_STATUSES}" \
      "${STARTUP_INGRESS_CHECK_TIMEOUT_SECONDS}"
  done < <(list_ingress_hosts)
}
# Print every checklist row to evaluate at startup.
#
# The static part is STARTUP_SERVICE_CHECKLIST (rows separated by ';') when
# that is non-empty, otherwise the built-in default list; blank rows are
# removed. When STARTUP_INCLUDE_INGRESS_CHECKS is "1" or "true", one generated
# row per Ingress host follows.
startup_service_checklist_rows() {
  local static_rows
  if [[ -z "${STARTUP_SERVICE_CHECKLIST}" ]]; then
    static_rows="$(default_startup_service_checklist)"
  else
    static_rows="${STARTUP_SERVICE_CHECKLIST//;/$'\n'}"
  fi
  sed '/^[[:space:]]*$/d' <<<"${static_rows}"

  case "${STARTUP_INCLUDE_INGRESS_CHECKS}" in
    1 | true) generated_ingress_service_checks ;;
  esac
}
# Test whether an HTTP status code appears in a comma-separated allow-list.
#
# Whitespace around list entries is ignored ("200, 301" works) and empty
# entries are skipped. The split list is kept in a local array, so nothing
# leaks into the global scope.
# Arguments: $1 - comma-separated list of acceptable status codes
#            $2 - status code that was observed
# Returns:   0 when the code is listed, 1 otherwise (including an empty list)
service_status_allowed() {
  local expected_csv="$1"
  local got="$2"
  local token
  local -a statuses=()

  IFS=',' read -r -a statuses <<<"${expected_csv}"
  # Guard the expansion below: older Bash treats an empty array as unset
  # under `set -u`.
  ((${#statuses[@]} > 0)) || return 1

  for token in "${statuses[@]}"; do
    token="${token//[[:space:]]/}"
    [[ -n "${token}" ]] || continue
    if [[ "${token}" == "${got}" ]]; then
      return 0
    fi
  done
  return 1
}
2026-04-09 01:41:02 -03:00
# Run the mail safeguards once: every listed mail Service must have at least
# one endpoint address, and every listed TCP port on the mail host must accept
# a connection within the timeout.
# Globals:   STARTUP_REQUIRE_MAIL_SAFEGUARDS ("1"/"true" enables the check),
#            MAIL_STARTUP_NAMESPACE, MAIL_STARTUP_ENDPOINT_SERVICES,
#            MAIL_STARTUP_HOST (empty skips the port probes),
#            MAIL_STARTUP_TCP_PORTS, MAIL_STARTUP_TCP_TIMEOUT_SECONDS
# Arguments: $1 - "1" suppresses the per-failure warnings (default: 0)
# Returns:   0 when the check is disabled or everything passed, 1 otherwise
check_mail_safeguards_once() {
  local quiet="${1:-0}"
  local failures=0 namespace service host port ips
  local -a services=() ports=()

  case "${STARTUP_REQUIRE_MAIL_SAFEGUARDS}" in
    1 | true) ;;
    *) return 0 ;;
  esac

  # --- Service endpoints ---
  namespace="${MAIL_STARTUP_NAMESPACE}"
  as_array_from_csv "${MAIL_STARTUP_ENDPOINT_SERVICES}" services
  for service in "${services[@]}"; do
    service="${service//[[:space:]]/}"
    if [[ -z "${service}" ]]; then
      continue
    fi
    ips="$(kubectl -n "${namespace}" get endpoints "${service}" -o jsonpath='{.subsets[*].addresses[*].ip}' 2>/dev/null || true)"
    if [[ -n "${ips//[[:space:]]/}" ]]; then
      continue
    fi
    if [[ "${quiet}" != "1" ]]; then
      warn "startup-check mail-endpoints ${namespace}/${service}: no ready endpoints."
    fi
    failures=1
  done

  # --- TCP reachability of the mail host ---
  host="${MAIL_STARTUP_HOST}"
  if [[ -n "${host}" ]]; then
    as_array_from_csv "${MAIL_STARTUP_TCP_PORTS}" ports
    for port in "${ports[@]}"; do
      port="${port//[[:space:]]/}"
      # Entries that are not plain port numbers are ignored.
      if [[ ! "${port}" =~ ^[0-9]+$ ]]; then
        continue
      fi
      # Opening /dev/tcp/<host>/<port> in a child bash performs the connect;
      # `timeout` bounds how long that may take.
      if timeout "${MAIL_STARTUP_TCP_TIMEOUT_SECONDS}" bash -lc "</dev/tcp/${host}/${port}" >/dev/null 2>&1; then
        continue
      fi
      if [[ "${quiet}" != "1" ]]; then
        warn "startup-check mail-tcp ${host}:${port}: connect failed."
      fi
      failures=1
    done
  fi

  ((failures == 0))
}
2026-04-07 12:30:28 -03:00
# Probe every row of the startup service checklist once with curl, then run
# the mail safeguards.
# Row format (pipe-delimited):
#   name|url|allowed-statuses|required-body|forbidden-body|insecure|timeout
# Rows missing name, url or allowed-statuses are ignored. "insecure" defaults
# to 0 and "timeout" to STARTUP_SERVICE_CHECK_TIMEOUT_SECONDS.
# Returns: 0 when every row and the mail safeguards pass, 1 otherwise. Each
#          failing row produces exactly one warning.
check_startup_service_checklist_once() {
  local rows row name url expected body_must body_must_not insecure timeout code rc
  local body_file problem failures=0
  local -a curl_args
  rows="$(startup_service_checklist_rows)"
  while IFS= read -r row; do
    [[ -n "${row}" ]] || continue
    IFS='|' read -r name url expected body_must body_must_not insecure timeout <<< "${row}"
    [[ -n "${name}" && -n "${url}" && -n "${expected}" ]] || continue
    [[ -n "${insecure}" ]] || insecure=0
    [[ -n "${timeout}" ]] || timeout="${STARTUP_SERVICE_CHECK_TIMEOUT_SECONDS}"

    curl_args=(-sS --max-time "${timeout}")
    if [[ "${insecure}" == "1" || "${insecure}" == "true" ]]; then
      curl_args=(-k "${curl_args[@]}")
    fi

    body_file="$(mktemp)"
    rc=0
    # The `|| rc=$?` must sit OUTSIDE the command substitution: assigned
    # inside `$( … )` it would only change the subshell's copy and transport
    # failures would never be reported as such.
    code="$(curl "${curl_args[@]}" -o "${body_file}" -w '%{http_code}' "${url}")" || rc=$?

    # Only the first failing criterion of a row is reported.
    problem=""
    if (( rc != 0 )); then
      problem="request failed (rc=${rc}) url=${url}"
    elif ! service_status_allowed "${expected}" "${code}"; then
      problem="expected status ${expected}, got ${code} url=${url}"
    elif [[ -n "${body_must}" ]] && ! grep -Fq -- "${body_must}" "${body_file}"; then
      problem="missing required body fragment '${body_must}'"
    elif [[ -n "${body_must_not}" ]] && grep -Fq -- "${body_must_not}" "${body_file}"; then
      problem="forbidden body fragment '${body_must_not}' present"
    fi
    rm -f -- "${body_file}"

    if [[ -n "${problem}" ]]; then
      warn "startup-check ${name}: ${problem}"
      failures=1
    fi
  done <<< "${rows}"
  if ! check_mail_safeguards_once; then
    failures=1
  fi
  (( failures == 0 ))
}
# Poll until the external service checklist passes AND no unhealthy workloads
# are listed, or die after STARTUP_CHECKLIST_TIMEOUT_SECONDS.
# Globals: EXECUTE, STARTUP_CHECKLIST_TIMEOUT_SECONDS,
#          STARTUP_CHECKLIST_POLL_SECONDS (read)
# Returns: 0 on success or in dry-run mode; calls die on timeout.
wait_for_startup_service_checklist() {
  if [[ "${EXECUTE}" -eq 0 ]]; then
    log "DRY-RUN: skipping startup external service checklist wait"
    return 0
  fi
  local start now checklist_ok workloads_ok unhealthy
  start="$(date +%s)"
  while true; do
    checklist_ok=0
    workloads_ok=0
    if check_startup_service_checklist_once; then
      checklist_ok=1
    fi
    # Capture the listing instead of piping it into `grep -q`: under
    # `set -o pipefail` a non-zero exit (or SIGPIPE) from the producer would
    # make the pipeline "fail" and be misread as "nothing unhealthy".
    unhealthy="$(list_unhealthy_workloads || true)"
    if [[ -z "${unhealthy//[[:space:]]/}" ]]; then
      workloads_ok=1
    fi
    if (( checklist_ok == 1 && workloads_ok == 1 )); then
      log "startup-checklist=all-passed"
      return 0
    fi
    if (( workloads_ok == 0 )); then
      warn "startup-checklist: workloads are not fully ready yet."
    fi
    now="$(date +%s)"
    if (( now - start >= STARTUP_CHECKLIST_TIMEOUT_SECONDS )); then
      die "Timed out waiting for startup external checklist after ${STARTUP_CHECKLIST_TIMEOUT_SECONDS}s."
    fi
    sleep "${STARTUP_CHECKLIST_POLL_SECONDS}"
  done
}
# Print "namespace/pod|status" for every pod whose STATUS column (4th field of
# `kubectl get pods -A --no-headers`) names a crash, image-pull or
# container-creation failure. Rows matching STARTUP_IGNORE_PODS_REGEX (when
# non-empty) are removed. Output carries no trailing newline.
collect_unstable_pods() {
  local unstable
  unstable="$(
    kubectl get pods -A --no-headers 2>/dev/null \
      | awk '$4 ~ /(CrashLoopBackOff|ImagePullBackOff|ErrImagePull|CreateContainerConfigError|RunContainerError|InvalidImageName)/ {print $1 "/" $2 "|" $4}' \
      || true
  )"
  if [[ -n "${STARTUP_IGNORE_PODS_REGEX}" ]]; then
    unstable="$(grep -Ev "${STARTUP_IGNORE_PODS_REGEX}" <<< "${unstable}" || true)"
  fi
  printf '%s' "${unstable}"
}
# Require the cluster to stay healthy for STARTUP_STABILITY_WINDOW_SECONDS in
# a row. A poll counts as healthy when there are no not-ready Flux
# kustomizations, no unstable pods, the service checklist passes and no
# unhealthy workloads are listed. Any unhealthy poll restarts the window.
# Globals: EXECUTE, STARTUP_STABILITY_WINDOW_SECONDS,
#          STARTUP_STABILITY_TIMEOUT_SECONDS,
#          STARTUP_STABILITY_POLL_SECONDS (read)
# Returns: 0 once the window passes or in dry-run mode; calls die when
#          STARTUP_STABILITY_TIMEOUT_SECONDS elapse first.
wait_for_startup_stability_window() {
  if [[ "${EXECUTE}" -eq 0 ]]; then
    log "DRY-RUN: skipping startup stability window"
    return 0
  fi
  # `line` is declared here so the detail loops below do not leak it into the
  # caller's scope.
  local hard_deadline stable_since now unstable pods not_ready unhealthy_workloads line
  stable_since="$(date +%s)"
  hard_deadline=$(( stable_since + STARTUP_STABILITY_TIMEOUT_SECONDS ))
  while true; do
    unstable=0
    not_ready="$(list_not_ready_kustomizations || true)"
    if [[ -n "${not_ready}" ]]; then
      unstable=1
      warn "stability-window: Flux kustomizations not ready."
    fi
    pods="$(collect_unstable_pods || true)"
    if [[ -n "${pods}" ]]; then
      unstable=1
      warn "stability-window: unstable pods detected."
      while IFS= read -r line; do
        [[ -n "${line}" ]] || continue
        warn "${line}"
      done <<< "${pods}"
    fi
    if ! check_startup_service_checklist_once; then
      unstable=1
      warn "stability-window: external service checklist failed."
    fi
    unhealthy_workloads="$(list_unhealthy_workloads || true)"
    if [[ -n "${unhealthy_workloads}" ]]; then
      unstable=1
      warn "stability-window: workloads not fully ready."
      while IFS= read -r line; do
        [[ -n "${line}" ]] || continue
        warn "${line}"
      done <<< "${unhealthy_workloads}"
    fi
    now="$(date +%s)"
    if (( unstable == 0 )); then
      if (( now - stable_since >= STARTUP_STABILITY_WINDOW_SECONDS )); then
        log "startup-stability-window=passed (${STARTUP_STABILITY_WINDOW_SECONDS}s)"
        return 0
      fi
    else
      # Any instability restarts the healthy window from now.
      stable_since="${now}"
    fi
    if (( now >= hard_deadline )); then
      die "Timed out waiting for startup stability window (${STARTUP_STABILITY_WINDOW_SECONDS}s healthy) within ${STARTUP_STABILITY_TIMEOUT_SECONDS}s."
    fi
    sleep "${STARTUP_STABILITY_POLL_SECONDS}"
  done
}
# Wait for the Kubernetes API to answer `kubectl version`.
# The number of attempts is API_WAIT_TIMEOUT_SECONDS / 5 (at least one), with
# a 5 second pause after each failed attempt.
# Returns: 0 when the API answers or in dry-run mode, 1 when every attempt
#          fails.
wait_for_api() {
  local attempts i
  attempts=$(( API_WAIT_TIMEOUT_SECONDS / 5 ))
  (( attempts >= 1 )) || attempts=1
  if [[ "${EXECUTE}" -eq 0 ]]; then
    log "DRY-RUN: skipping live Kubernetes API wait"
    return 0
  fi
  for (( i = 1; i <= attempts; i++ )); do
    if kubectl version --request-timeout=5s >/dev/null 2>&1; then
      return 0
    fi
    sleep 5
  done
  return 1
}
2026-04-06 04:47:05 -03:00
# Set spec.suspend on every Flux Kustomization in flux-system and on every
# HelmRelease in all namespaces.
# Arguments: $1 - JSON boolean to write ("true" or "false")
# Listing failures are tolerated (treated as an empty list); each patch goes
# through `run`.
patch_flux_suspend_all() {
  local value="$1"
  # All loop variables are declared local so they do not leak to callers.
  local patch ks_list hr_list k hr ns name
  patch=$(printf '{"spec":{"suspend":%s}}' "${value}")
  ks_list="$(kubectl -n flux-system get kustomizations.kustomize.toolkit.fluxcd.io -o jsonpath='{range .items[*]}{.metadata.name}{"\n"}{end}' || true)"
  hr_list="$(kubectl get helmreleases.helm.toolkit.fluxcd.io -A -o jsonpath='{range .items[*]}{.metadata.namespace}{"/"}{.metadata.name}{"\n"}{end}' || true)"
  while IFS= read -r k; do
    [[ -z "${k}" ]] && continue
    run kubectl -n flux-system patch kustomization "${k}" --type=merge -p "${patch}"
  done <<< "${ks_list}"
  while IFS= read -r hr; do
    [[ -z "${hr}" ]] && continue
    # Entries are "namespace/name".
    ns="${hr%%/*}"
    name="${hr##*/}"
    run kubectl -n "${ns}" patch helmrelease "${name}" --type=merge -p "${patch}"
  done <<< "${hr_list}"
}
2026-06-18 18:02:32 -03:00
# Set spec.suspend on one Flux Kustomization in flux-system.
# Arguments: $1 - Kustomization name
#            $2 - JSON boolean to write ("true" or "false")
# A missing Kustomization only produces a warning.
patch_kustomization_suspend() {
  local name="$1"
  local value="$2"
  local patch
  printf -v patch '{"spec":{"suspend":%s}}' "${value}"
  if ! kubectl -n flux-system get kustomization "${name}" >/dev/null 2>&1; then
    warn "Flux Kustomization ${name} not found; skipping suspend=${value}."
    return
  fi
  run kubectl -n flux-system patch kustomization "${name}" --type=merge -p "${patch}"
}
2026-06-18 18:35:13 -03:00
# Print each non-empty entry of a comma-separated list on its own line, with
# all whitespace removed from every entry.
# Arguments: $1 - comma-separated list
# Outputs:   one entry per line on stdout
csv_each() {
  local csv="$1"
  local item
  # Local scratch array: nothing is left behind in the caller's scope.
  local -a items=()
  IFS=',' read -r -a items <<< "${csv}"
  for item in "${items[@]}"; do
    item="${item//[[:space:]]/}"
    [[ -n "${item}" ]] || continue
    printf '%s\n' "${item}"
  done
}
# Record the current spec.suspend value of each Kustomization named in
# RECOVERY_FLUX_OPTIONAL_KUSTOMIZATIONS into
# RECOVERY_FLUX_OPTIONAL_SNAPSHOT_FILE as "name<TAB>true|false" lines.
# Does nothing unless EXECUTE is 1. The snapshot file is truncated first;
# Kustomizations that do not exist are omitted and an empty spec.suspend is
# stored as "false".
save_recovery_optional_flux_snapshot() {
  if [[ "${EXECUTE}" -ne 1 ]]; then
    return 0
  fi
  local snapshot="${RECOVERY_FLUX_OPTIONAL_SNAPSHOT_FILE}"
  local name suspend
  mkdir -p "$(dirname "${snapshot}")"
  : > "${snapshot}"
  while IFS= read -r name; do
    kubectl -n flux-system get kustomization "${name}" >/dev/null 2>&1 || continue
    suspend="$(kubectl -n flux-system get kustomization "${name}" -o jsonpath='{.spec.suspend}' 2>/dev/null || true)"
    printf '%s\t%s\n' "${name}" "${suspend:-false}" >> "${snapshot}"
  done < <(csv_each "${RECOVERY_FLUX_OPTIONAL_KUSTOMIZATIONS}")
  log "recovery-flux-optional-snapshot=${snapshot}"
}
# Apply one suspend value to every Kustomization listed in
# RECOVERY_FLUX_OPTIONAL_KUSTOMIZATIONS.
# Arguments: $1 - "true" or "false"; when "true" the current state is
#                 snapshotted first via save_recovery_optional_flux_snapshot
patch_recovery_optional_flux_suspend() {
  local value="$1"
  local optional_name
  if [[ "${value}" == "true" ]]; then
    save_recovery_optional_flux_snapshot
  fi
  while IFS= read -r optional_name; do
    patch_kustomization_suspend "${optional_name}" "${value}"
  done < <(csv_each "${RECOVERY_FLUX_OPTIONAL_KUSTOMIZATIONS}")
}
# Re-apply the suspend values stored in RECOVERY_FLUX_OPTIONAL_SNAPSHOT_FILE
# ("name<TAB>value" lines). A missing snapshot file is a no-op; any stored
# value other than "true"/"false" is applied as "false".
restore_recovery_optional_flux_suspend() {
  local snapshot="${RECOVERY_FLUX_OPTIONAL_SNAPSHOT_FILE}"
  local name suspend
  if [[ ! -f "${snapshot}" ]]; then
    return 0
  fi
  while IFS=$'\t' read -r name suspend; do
    if [[ -z "${name}" ]]; then
      continue
    fi
    case "${suspend}" in
      true | false) ;;
      *) suspend="false" ;;
    esac
    patch_kustomization_suspend "${name}" "${suspend}"
  done < "${snapshot}"
}
# Stamp the reconcile.fluxcd.io/requestedAt annotation (current ISO-8601 time)
# on each existing Kustomization named in the comma-separated list.
# Arguments: $1 - comma-separated Kustomization names
# Names that do not exist in flux-system are skipped silently.
annotate_flux_kustomizations() {
  local name requested_at
  requested_at="$(date --iso-8601=seconds)"
  while IFS= read -r name; do
    kubectl -n flux-system get kustomization "${name}" >/dev/null 2>&1 || continue
    run kubectl -n flux-system annotate kustomization "${name}" \
      "reconcile.fluxcd.io/requestedAt=${requested_at}" --overwrite
  done < <(csv_each "$1")
}
# Roll the kustomize-controller deployment when
# RECOVERY_FLUX_RESTART_KUSTOMIZE_CONTROLLER is "1" or "true" and the
# deployment exists; otherwise do nothing.
restart_kustomize_controller_for_critical_thaw() {
  case "${RECOVERY_FLUX_RESTART_KUSTOMIZE_CONTROLLER}" in
    1 | true) ;;
    *) return 0 ;;
  esac
  if ! kubectl -n flux-system get deployment kustomize-controller >/dev/null 2>&1; then
    return 0
  fi
  warn "Restarting kustomize-controller after optional Flux suspension to clear any single-worker health-check backlog."
  run kubectl -n flux-system rollout restart deployment kustomize-controller
}
2026-06-18 18:02:32 -03:00
# Set spec.suspend on one HelmRelease.
# Arguments: $1 - namespace
#            $2 - HelmRelease name
#            $3 - JSON boolean to write ("true" or "false")
# A missing HelmRelease only produces a warning.
patch_helmrelease_suspend() {
  local namespace="$1"
  local name="$2"
  local value="$3"
  local patch
  printf -v patch '{"spec":{"suspend":%s}}' "${value}"
  if ! kubectl -n "${namespace}" get helmrelease "${name}" >/dev/null 2>&1; then
    warn "HelmRelease ${namespace}/${name} not found; skipping suspend=${value}."
    return
  fi
  run kubectl -n "${namespace}" patch helmrelease "${name}" --type=merge -p "${patch}"
}
# Wait, per controller, until no pods labelled app=kustomize-controller and
# then app=helm-controller remain in flux-system. Each controller gets up to
# 90 seconds (polled every 2 seconds); a timeout only warns and moves on.
# In dry-run mode the wait is logged and skipped.
wait_for_flux_reconciler_pods_stopped() {
  local app deadline remaining
  if [[ "${EXECUTE}" -eq 0 ]]; then
    log "DRY-RUN: wait for Flux reconcilers to stop"
    return 0
  fi
  for app in kustomize-controller helm-controller; do
    deadline=$(( $(date +%s) + 90 ))
    while :; do
      remaining="$(kubectl -n flux-system get pods -l "app=${app}" --no-headers 2>/dev/null || true)"
      if [[ -z "${remaining}" ]]; then
        log "flux-reconciler-stopped=${app}"
        break
      fi
      if (( $(date +%s) >= deadline )); then
        warn "Timed out waiting for ${app} pods to stop."
        break
      fi
      sleep 2
    done
  done
}
# Stop the Flux reconcilers and suspend the Flux objects that manage Longhorn:
# scale kustomize-controller and helm-controller to 0 (when present), wait for
# their pods to stop, suspend the flux-system, helm and longhorn
# Kustomizations plus the longhorn HelmRelease, then record the checkpoint.
freeze_longhorn_deadlock_automation() {
  local controller kustomization
  warn "Freezing only the automation that can fight Longhorn emergency recovery."
  for controller in kustomize-controller helm-controller; do
    if kubectl -n flux-system get deployment "${controller}" >/dev/null 2>&1; then
      run kubectl -n flux-system scale deployment "${controller}" --replicas=0
    fi
  done
  wait_for_flux_reconciler_pods_stopped
  for kustomization in flux-system helm longhorn; do
    patch_kustomization_suspend "${kustomization}" true
  done
  patch_helmrelease_suspend longhorn-system longhorn true
  mark_checkpoint longhorn_unlock_automation_frozen
}
# Switch Longhorn to a cache-first image pull policy: merge-patch the longhorn
# HelmRelease values and strategic-patch the longhorn-manager container of the
# DaemonSet to IfNotPresent. Objects that do not exist are skipped.
ensure_longhorn_cache_first_policy() {
  local helm_values_patch='{"spec":{"values":{"image":{"pullPolicy":"IfNotPresent"},"defaultSettings":{"systemManagedPodsImagePullPolicy":"if-not-present"}}}}'
  local daemonset_patch='{"spec":{"template":{"spec":{"containers":[{"name":"longhorn-manager","imagePullPolicy":"IfNotPresent"}]}}}}'
  if kubectl -n longhorn-system get helmrelease longhorn >/dev/null 2>&1; then
    run kubectl -n longhorn-system patch helmrelease longhorn \
      --type=merge -p "${helm_values_patch}"
  fi
  if kubectl -n longhorn-system get daemonset longhorn-manager >/dev/null 2>&1; then
    run kubectl -n longhorn-system patch daemonset longhorn-manager \
      --type=strategic -p "${daemonset_patch}"
  fi
}
# Remove every container named pre-pull-share-manager-image from the
# longhorn-manager DaemonSet using JSON-patch "remove" operations.
# Container indexes are processed highest first so earlier removals do not
# shift the positions still to be removed.
remove_longhorn_manager_prepull_sidecar() {
  local sidecar_indexes sidecar_index
  local names_template='{range .spec.template.spec.containers[*]}{.name}{"\n"}{end}'
  sidecar_indexes="$(
    kubectl -n longhorn-system get daemonset longhorn-manager \
      -o "jsonpath=${names_template}" 2>/dev/null \
      | nl -v 0 -w 1 -s ' ' \
      | awk '$2=="pre-pull-share-manager-image" {print $1}' \
      | sort -rn || true
  )"
  if [[ -z "${sidecar_indexes}" ]]; then
    log "longhorn-manager-prepull-sidecar=absent"
    return 0
  fi
  while IFS= read -r sidecar_index; do
    if [[ -z "${sidecar_index}" ]]; then
      continue
    fi
    run kubectl -n longhorn-system patch daemonset longhorn-manager --type=json \
      -p "[{\"op\":\"remove\",\"path\":\"/spec/template/spec/containers/${sidecar_index}\"}]"
  done <<< "${sidecar_indexes}"
}
2026-06-18 18:20:22 -03:00
# Succeed when any longhorn-manager pod has a pre-pull-share-manager-image
# container waiting on an image-pull or container-start failure reason.
# jq's output and errors are discarded; only the exit status is used.
longhorn_manager_prepull_sidecar_has_pull_failures() {
  local failing_sidecar_filter='
    [
      .items[].status.containerStatuses[]?
      | select(.name == "pre-pull-share-manager-image")
      | select(((.state.waiting.reason // "") | test("ImagePullBackOff|ErrImagePull|CreateContainerError|RunContainerError|InvalidImageName")))
    ]
    | length > 0'
  kubectl -n longhorn-system get pods -l app=longhorn-manager -o json \
    | jq -e "${failing_sidecar_filter}" >/dev/null 2>&1
}
# Remove the Longhorn manager pre-pull sidecar when Harbor is unhealthy or the
# sidecar itself is stuck in an image/runtime failure; otherwise keep it and
# log that it was retained.
remove_longhorn_manager_prepull_sidecar_if_needed() {
  if ! harbor_endpoint_is_ready 1; then
    warn "Removing Longhorn manager pre-pull sidecar because Harbor registry API is unhealthy."
  elif longhorn_manager_prepull_sidecar_has_pull_failures; then
    warn "Removing Longhorn manager pre-pull sidecar because it is in image/runtime failure."
  else
    log "longhorn-manager-prepull-sidecar=retained harbor=healthy pull_failures=false"
    return
  fi
  remove_longhorn_manager_prepull_sidecar
  return 0
}
2026-06-18 18:02:32 -03:00
# Prepare LONGHORN_UNLOCK_REPLICA_SNAPSHOT_FILE for recording replica counts.
# A non-empty existing snapshot is preserved untouched; otherwise the parent
# directory is created and an empty file is written. Dry-run only logs.
save_longhorn_unlock_optional_replica_snapshot() {
  local snapshot="${LONGHORN_UNLOCK_REPLICA_SNAPSHOT_FILE}"
  if [[ "${EXECUTE}" -eq 0 ]]; then
    log "DRY-RUN: save optional workload snapshot to ${snapshot}"
    return 0
  fi
  if [[ -s "${snapshot}" ]]; then
    log "optional-workload-snapshot=preserved path=${snapshot}"
    return 0
  fi
  mkdir -p "$(dirname "${snapshot}")"
  : > "${snapshot}"
}
# Scale one optional workload to zero replicas, remembering its previous
# replica count in LONGHORN_UNLOCK_REPLICA_SNAPSHOT_FILE.
# Arguments: $1 - namespace, $2 - resource kind, $3 - resource name
# A missing workload is a no-op. The snapshot row
# (namespace<TAB>kind<TAB>name<TAB>replicas) is appended only when EXECUTE is
# 1 and no row for the workload exists yet. An empty spec.replicas is treated
# as 1.
scale_optional_workload_for_longhorn_unlock() {
  local namespace="$1"
  local kind="$2"
  local name="$3"
  local replicas
  kubectl -n "${namespace}" get "${kind}" "${name}" >/dev/null 2>&1 || return 0
  replicas="$(kubectl -n "${namespace}" get "${kind}" "${name}" -o jsonpath='{.spec.replicas}' 2>/dev/null || true)"
  replicas="${replicas:-1}"
  if [[ "${EXECUTE}" -eq 1 ]] \
    && ! awk -F '\t' -v ns="${namespace}" -v kind="${kind}" -v name="${name}" \
      '$1==ns && $2==kind && $3==name {found=1} END {exit found ? 0 : 1}' \
      "${LONGHORN_UNLOCK_REPLICA_SNAPSHOT_FILE}" 2>/dev/null; then
    printf '%s\t%s\t%s\t%s\n' "${namespace}" "${kind}" "${name}" "${replicas}" \
      >> "${LONGHORN_UNLOCK_REPLICA_SNAPSHOT_FILE}"
  fi
  if [[ "${replicas}" == "0" ]]; then
    log "optional-workload-already-scaled-down=${namespace}/${kind}/${name}"
    return 0
  fi
  warn "Temporarily scaling optional workload ${namespace}/${kind}/${name} from ${replicas} to 0 for Longhorn recovery headroom."
  run kubectl -n "${namespace}" scale "${kind}" "${name}" --replicas=0
}
# Scale a fixed list of optional workloads to zero, after preparing the
# replica snapshot file, then record the checkpoint.
# The embedded list has one "namespace kind name" entry per line; blank lines
# and lines whose first field starts with '#' are ignored.
free_longhorn_instance_manager_headroom() {
  # Declared local so the loop variables do not leak to the caller.
  local namespace kind name
  save_longhorn_unlock_optional_replica_snapshot
  while read -r namespace kind name; do
    [[ -z "${namespace}" || "${namespace}" == \#* ]] && continue
    scale_optional_workload_for_longhorn_unlock "${namespace}" "${kind}" "${name}"
  done <<'WORKLOADS'
game-stream deployment oauth2-proxy-wolf
logging deployment oauth2-proxy-logs
longhorn-system deployment oauth2-proxy-longhorn
maintenance deployment oauth2-proxy-metis
maintenance deployment oauth2-proxy-soteria
openclaw deployment oauth2-proxy-agent
quality deployment oauth2-proxy-sonarqube
quality deployment sonarqube-exporter
sso deployment oauth2-proxy
bstein-dev-home deployment bstein-dev-home-frontend
WORKLOADS
  mark_checkpoint longhorn_unlock_optional_workloads_scaled
}
# Scale optional workloads back to the replica counts stored in
# LONGHORN_UNLOCK_REPLICA_SNAPSHOT_FILE, then record the checkpoint.
# Rows are skipped when a field is missing, the stored count is not a positive
# integer, the live replica count cannot be read, or it already matches.
# A missing snapshot file means there is nothing to restore.
restore_longhorn_unlock_optional_workloads() {
  local namespace kind name desired current
  local snapshot="${LONGHORN_UNLOCK_REPLICA_SNAPSHOT_FILE}"
  if [[ ! -f "${snapshot}" ]]; then
    log "optional-workload-restore=not-needed snapshot=absent"
    return 0
  fi
  while IFS=$'\t' read -r namespace kind name desired; do
    if [[ -z "${namespace}" || -z "${kind}" || -z "${name}" || -z "${desired}" ]]; then
      continue
    fi
    if [[ ! "${desired}" =~ ^[0-9]+$ ]]; then
      continue
    fi
    if (( desired <= 0 )); then
      continue
    fi
    current="$(kubectl -n "${namespace}" get "${kind}" "${name}" -o jsonpath='{.spec.replicas}' 2>/dev/null || true)"
    if [[ ! "${current}" =~ ^[0-9]+$ ]]; then
      continue
    fi
    if (( current == desired )); then
      continue
    fi
    warn "Restoring optional workload ${namespace}/${kind}/${name} to replicas=${desired} after Longhorn unlock."
    run kubectl -n "${namespace}" scale "${kind}" "${name}" --replicas="${desired}"
  done < "${snapshot}"
  mark_checkpoint longhorn_unlock_optional_workloads_restored
}
# Uncordon worker nodes that are cordoned but Ready again, then record the
# checkpoint. A node is left cordoned when it is not labelled
# node-role.kubernetes.io/worker=true, is listed in
# RECOVERY_UNCORDON_DENYLIST, or still carries a
# node.kubernetes.io/unreachable taint.
restore_recovered_worker_scheduling_after_deadlock() {
  # `unschedulable` is part of the read below and must be local like the rest.
  local rows node unschedulable ready worker taints
  rows="$(kubectl get nodes -o json \
    | jq -r '.items[]
      | [.metadata.name,
         (.spec.unschedulable // false),
         ([.status.conditions[]? | select(.type=="Ready") | .status][0] // "Unknown"),
         (.metadata.labels["node-role.kubernetes.io/worker"] // ""),
         ((.spec.taints // []) | map(.key + ":" + .effect) | join(","))]
      | @tsv' || true)"
  # NOTE(review): tab is an IFS whitespace character, so an empty label field
  # collapses and `worker` receives the taint list instead. That still fails
  # the == "true" test below, but keep it in mind before adding columns.
  while IFS=$'\t' read -r node unschedulable ready worker taints; do
    [[ -n "${node}" ]] || continue
    [[ "${unschedulable}" == "true" ]] || continue
    [[ "${ready}" == "True" ]] || continue
    [[ "${worker}" == "true" ]] || continue
    if csv_has_value "${RECOVERY_UNCORDON_DENYLIST}" "${node}"; then
      warn "Leaving recovered worker ${node} cordoned because it is in RECOVERY_UNCORDON_DENYLIST."
      continue
    fi
    if [[ "${taints}" == *"node.kubernetes.io/unreachable:"* ]]; then
      warn "Leaving worker ${node} cordoned because it still has an unreachable taint."
      continue
    fi
    warn "Restoring scheduling on recovered Ready worker ${node}."
    run kubectl uncordon "${node}"
  done <<< "${rows}"
  mark_checkpoint longhorn_unlock_worker_scheduling_restored
}
# Delete (without waiting) every pod in phase Failed, except pods in the
# longhorn-system, postgres, vault, gitea and harbor namespaces.
delete_failed_nonstorage_pods_for_headroom() {
  local failed_pods namespace name
  local pod_template='{range .items[*]}{.metadata.namespace}{" "}{.metadata.name}{"\n"}{end}'
  failed_pods="$(kubectl get pods -A --field-selector=status.phase=Failed \
    -o "jsonpath=${pod_template}" 2>/dev/null || true)"
  while read -r namespace name; do
    if [[ -z "${namespace}" || -z "${name}" ]]; then
      continue
    fi
    case "${namespace}" in
      longhorn-system | postgres | vault | gitea | harbor)
        # Protected namespaces are never touched here.
        continue
        ;;
    esac
    run kubectl -n "${namespace}" delete pod "${name}" --ignore-not-found --wait=false
  done <<< "${failed_pods}"
}
# Clear stale pods in the postgres, vault, gitea and harbor namespaces.
# Pass 1: delete Failed/Unknown pods that have at least one ownerReference;
#         pods without an owner are only reported.
# Pass 2: re-list, then force-delete Failed/Unknown pods that are already
#         terminating, have no finalizers, have an owner, and whose
#         containers are all terminated.
restart_stale_critical_pods_after_longhorn_unlock() {
  require_cmd jq
  local namespace name phase owners stale_rows terminating_rows
  local stale_filter terminating_filter

  stale_filter='.items[]
    | select(.metadata.namespace | test("^(postgres|vault|gitea|harbor)$"))
    | select(.status.phase == "Failed" or .status.phase == "Unknown")
    | [.metadata.namespace, .metadata.name, .status.phase, ((.metadata.ownerReferences // []) | length)] | @tsv'
  terminating_filter='.items[]
    | select(.metadata.namespace | test("^(postgres|vault|gitea|harbor)$"))
    | select(.metadata.deletionTimestamp != null)
    | select(.status.phase == "Failed" or .status.phase == "Unknown")
    | select(((.metadata.finalizers // []) | length) == 0)
    | select(((.metadata.ownerReferences // []) | length) > 0)
    | select(([(.status.containerStatuses[]? | select(.state.terminated != null))] | length) == ((.status.containerStatuses // []) | length))
    | [.metadata.namespace, .metadata.name, .status.phase] | @tsv'

  stale_rows="$(kubectl get pods -A -o json | jq -r "${stale_filter}" || true)"
  while IFS=$'\t' read -r namespace name phase owners; do
    if [[ -z "${namespace}" || -z "${name}" ]]; then
      continue
    fi
    if [[ "${owners}" == "0" ]]; then
      warn "Skipping stale critical pod without controller owner: ${namespace}/${name} phase=${phase}"
      continue
    fi
    warn "Deleting stale controller-owned critical pod ${namespace}/${name} phase=${phase} so its controller can recreate it."
    run kubectl -n "${namespace}" delete pod "${name}" --ignore-not-found --wait=false
  done <<< "${stale_rows}"

  terminating_rows="$(kubectl get pods -A -o json | jq -r "${terminating_filter}" || true)"
  while IFS=$'\t' read -r namespace name phase; do
    if [[ -z "${namespace}" || -z "${name}" ]]; then
      continue
    fi
    warn "Force-deleting stale terminating critical pod object ${namespace}/${name} phase=${phase}; containers are already terminated and no finalizers are set."
    run kubectl -n "${namespace}" delete pod "${name}" --ignore-not-found --wait=false --force --grace-period=0
  done <<< "${terminating_rows}"
}
# Wait until the postgres-service Endpoints object lists an address AND
# pg_isready succeeds inside postgres-0.
# Arguments: $1 - timeout in seconds (default 240); polled every 5 seconds
# Returns:   0 when ready or in dry-run mode, 1 on timeout
wait_for_postgres_dependency_ready() {
  local timeout_seconds="${1:-240}"
  local started_at endpoints
  if [[ "${EXECUTE}" -eq 0 ]]; then
    log "DRY-RUN: wait for postgres/postgres-service endpoints and pg_isready"
    return 0
  fi
  started_at="$(date +%s)"
  while :; do
    endpoints="$(kubectl -n postgres get endpoints postgres-service -o jsonpath='{.subsets[*].addresses[*].ip}' 2>/dev/null || true)"
    if [[ -n "${endpoints//[[:space:]]/}" ]]; then
      if kubectl -n postgres exec postgres-0 -c postgres -- \
        sh -ceu 'pg_isready -h 127.0.0.1 -p 5432 >/dev/null' >/dev/null 2>&1; then
        log "postgres-dependency=ready endpoints=${endpoints}"
        return 0
      fi
    fi
    if (( $(date +%s) - started_at >= timeout_seconds )); then
      warn "Timed out waiting for Postgres to become ready for Harbor."
      return 1
    fi
    sleep 5
  done
}
# When the Harbor registry API is unhealthy, wait for Postgres and then delete
# the owner-managed harbor-core / harbor-jobservice pods whose core or
# jobservice container is not ready, is waiting on a crash/image-pull reason,
# or last terminated with reason "Error".
# Returns: 0 when no action is needed or recovery succeeded; 1 when Postgres
#          does not become ready, no pod qualifies for a restart, or Harbor is
#          still unhealthy after the rollouts (execute mode only).
restart_harbor_after_postgres_recovery() {
  require_cmd jq
  local unhealthy_pods pod_name unhealthy_filter
  if harbor_endpoint_is_ready 1; then
    log "harbor-postgres-recovery=not-needed"
    return 0
  fi
  wait_for_postgres_dependency_ready 240 || return 1
  unhealthy_filter='.items[]
    | select(.metadata.name | test("^harbor-(core|jobservice)-"))
    | select(((.metadata.ownerReferences // []) | length) > 0)
    | select(([
        .status.containerStatuses[]?
        | select(.name == "core" or .name == "jobservice")
        | select((.ready != true)
            or (((.state.waiting.reason // "") | test("CrashLoopBackOff|ImagePullBackOff|ErrImagePull")))
            or ((.lastState.terminated.reason // "") == "Error"))
      ] | length) > 0)
    | .metadata.name'
  unhealthy_pods="$(kubectl -n harbor get pods -o json \
    | jq -r "${unhealthy_filter}" \
    | sort -u || true)"
  if [[ -z "${unhealthy_pods}" ]]; then
    warn "Harbor registry API is unhealthy, but no controller-owned core/jobservice pod needs restart."
    return 1
  fi
  while IFS= read -r pod_name; do
    if [[ -z "${pod_name}" ]]; then
      continue
    fi
    warn "Restarting controller-owned Harbor pod ${pod_name} after Postgres recovery."
    run kubectl -n harbor delete pod "${pod_name}" --ignore-not-found --wait=false
  done <<< "${unhealthy_pods}"
  if [[ "${EXECUTE}" -eq 1 ]]; then
    kubectl -n harbor rollout status deployment/harbor-core --timeout=6m \
      || warn "harbor-core did not become Ready after Postgres recovery restart."
    kubectl -n harbor rollout status deployment/harbor-jobservice --timeout=6m \
      || warn "harbor-jobservice did not become Ready after Postgres recovery restart."
    harbor_endpoint_is_ready 0 || return 1
  fi
  mark_checkpoint longhorn_unlock_harbor_postgres_recovered
}
# Force-delete ReplicaSet-owned pods outside longhorn-system that have been
# terminating for at least STALE_TERMINATING_POD_SECONDS, have no finalizers
# and have no running or ready containers; then record the checkpoint.
# Rows whose deletion timestamp cannot be parsed by `date -d` are skipped.
delete_safe_stale_terminating_replicaset_pods_after_deadlock() {
  require_cmd jq
  local candidates namespace name deleted_at deleted_epoch reference_epoch stale_filter
  reference_epoch="$(date +%s)"
  stale_filter='.items[]
    | select(.metadata.namespace != "longhorn-system")
    | select(.metadata.deletionTimestamp != null)
    | select(((.metadata.finalizers // []) | length) == 0)
    | select(((.metadata.ownerReferences // []) | map(select(.kind=="ReplicaSet")) | length) > 0)
    | ([(.status.initContainerStatuses[]?, .status.containerStatuses[]?) | select(.state.running != null)] | length) as $running
    | ([(.status.initContainerStatuses[]?, .status.containerStatuses[]?) | select(.ready == true)] | length) as $ready
    | select($running == 0 and $ready == 0)
    | [.metadata.namespace, .metadata.name, .metadata.deletionTimestamp] | @tsv'
  candidates="$(kubectl get pods -A -o json | jq -r "${stale_filter}" || true)"
  while IFS=$'\t' read -r namespace name deleted_at; do
    if [[ -z "${namespace}" || -z "${name}" || -z "${deleted_at}" ]]; then
      continue
    fi
    deleted_epoch="$(date -d "${deleted_at}" +%s 2>/dev/null || true)"
    if [[ ! "${deleted_epoch}" =~ ^[0-9]+$ ]]; then
      continue
    fi
    # Only pods that have been terminating long enough are considered stale.
    if (( reference_epoch - deleted_epoch < STALE_TERMINATING_POD_SECONDS )); then
      continue
    fi
    warn "Force-deleting stale terminating ReplicaSet pod ${namespace}/${name}; no containers are running and no finalizers are set."
    run kubectl -n "${namespace}" delete pod "${name}" --ignore-not-found --wait=false --force --grace-period=0
  done <<< "${candidates}"
  mark_checkpoint longhorn_unlock_stale_replicaset_pods_cleared
}
# Once Harbor is healthy, delete ReplicaSet-owned pods outside longhorn-system
# that have a container or init container waiting on an image-pull or
# container-start failure reason, then record the checkpoint.
# Returns: 1 when Harbor is still unhealthy; 0 when nothing needs a restart.
restart_image_pull_backoff_pods_after_harbor_recovery() {
  require_cmd jq
  local stuck_pods namespace name backoff_filter
  if ! harbor_endpoint_is_ready 1; then
    warn "Skipping image-pull recovery sweep because Harbor registry API is still unhealthy."
    return 1
  fi
  backoff_filter='.items[]
    | select(.metadata.namespace != "longhorn-system")
    | select(((.metadata.ownerReferences // []) | map(select(.kind=="ReplicaSet")) | length) > 0)
    | select(([
        (.status.containerStatuses[]?, .status.initContainerStatuses[]?)
        | select(((.state.waiting.reason // "") | test("ImagePullBackOff|ErrImagePull|CreateContainerError|RunContainerError|InvalidImageName")))
      ] | length) > 0)
    | [.metadata.namespace, .metadata.name] | @tsv'
  stuck_pods="$(kubectl get pods -A -o json \
    | jq -r "${backoff_filter}" \
    | sort -u || true)"
  if [[ -z "${stuck_pods}" ]]; then
    log "image-pull-recovery=not-needed"
    return 0
  fi
  while IFS=$'\t' read -r namespace name; do
    if [[ -z "${namespace}" || -z "${name}" ]]; then
      continue
    fi
    warn "Restarting controller-owned pod ${namespace}/${name} after Harbor recovery to clear image-pull backoff."
    run kubectl -n "${namespace}" delete pod "${name}" --ignore-not-found --wait=false
  done <<< "${stuck_pods}"
  mark_checkpoint longhorn_unlock_image_pull_backoff_restarted
}
# Resume Flux after Harbor and Gitea are back: unsuspend all Flux objects,
# re-suspend the optional Kustomizations, scale the kustomize and helm
# controllers to 1, optionally restart kustomize-controller, trigger a source
# reconcile (when the flux CLI is installed), annotate the critical
# Kustomizations and record the checkpoint.
# Returns: 1 without changing anything when Harbor is unhealthy or the gitea
#          Endpoints object lists no address.
resume_deadlock_automation_after_core_recovery() {
  local gitea_endpoints controller
  if ! harbor_endpoint_is_ready 1; then
    warn "Keeping Flux reconcilers stopped because Harbor registry API is not healthy."
    return 1
  fi
  gitea_endpoints="$(kubectl -n gitea get endpoints gitea -o jsonpath='{.subsets[*].addresses[*].ip}' 2>/dev/null || true)"
  if [[ -z "${gitea_endpoints//[[:space:]]/}" ]]; then
    warn "Keeping Flux reconcilers stopped because Gitea has no ready endpoints."
    return 1
  fi
  patch_flux_suspend_all false
  patch_recovery_optional_flux_suspend true
  for controller in kustomize-controller helm-controller; do
    if kubectl -n flux-system get deployment "${controller}" >/dev/null 2>&1; then
      run kubectl -n flux-system scale deployment "${controller}" --replicas=1
    fi
  done
  restart_kustomize_controller_for_critical_thaw
  if command -v flux >/dev/null 2>&1; then
    # Best effort: a failed source reconcile must not abort the resume.
    run flux reconcile source git flux-system -n flux-system --timeout=3m || true
  fi
  annotate_flux_kustomizations "${RECOVERY_FLUX_CRITICAL_KUSTOMIZATIONS}" || true
  mark_checkpoint longhorn_unlock_automation_resumed
}
# Delete longhorn-manager, longhorn-driver-deployer and longhorn-ui pods in
# longhorn-system that have a container waiting with ImagePullBackOff or
# ErrImagePull.
restart_longhorn_image_pull_backoff_pods() {
  require_cmd jq
  local stuck_pods namespace name backoff_filter
  backoff_filter='.items[]
    | select(([.status.containerStatuses[]?.state.waiting.reason] | map(select(. == "ImagePullBackOff" or . == "ErrImagePull")) | length) > 0)
    | select(.metadata.name | test("^(longhorn-manager-|longhorn-driver-deployer-|longhorn-ui-)"))
    | [.metadata.namespace, .metadata.name] | @tsv'
  stuck_pods="$(kubectl -n longhorn-system get pods -o json | jq -r "${backoff_filter}" || true)"
  while IFS=$'\t' read -r namespace name; do
    if [[ -z "${namespace}" || -z "${name}" ]]; then
      continue
    fi
    run kubectl -n "${namespace}" delete pod "${name}" --ignore-not-found --wait=false
  done <<< "${stuck_pods}"
}
# List pods on one node that have been terminating for at least
# STALE_TERMINATING_POD_SECONDS, have no finalizers, and still report running
# containers.
# Arguments: $1 - node name
# Outputs:   "namespace<TAB>pod<TAB>running-container-count" lines
# Failures of kubectl/jq are swallowed and yield empty output.
terminating_running_pods_for_node() {
  local node="$1"
  local reference_epoch stuck_filter
  reference_epoch="$(date +%s)"
  stuck_filter='
    .items[]
    | select(.spec.nodeName == $node)
    | select(.metadata.deletionTimestamp != null)
    | select(((.metadata.finalizers // []) | length) == 0)
    | (.metadata.deletionTimestamp | fromdateiso8601) as $deleted
    | select(($now - $deleted) >= $min_age)
    | ([(.status.initContainerStatuses[]?, .status.containerStatuses[]?) | select(.state.running != null)] | length) as $running
    | select($running > 0)
    | [.metadata.namespace, .metadata.name, ($running | tostring)] | @tsv'
  kubectl get pods -A -o json \
    | jq -r --arg node "${node}" --argjson now "${reference_epoch}" \
      --argjson min_age "${STALE_TERMINATING_POD_SECONDS}" \
      "${stuck_filter}" 2>/dev/null || true
}
# Print the distinct, sorted names of nodes hosting at least one pod that has
# been terminating for STALE_TERMINATING_POD_SECONDS or longer, has no
# finalizers, and still reports running containers.
stuck_terminating_runtime_cleanup_nodes() {
  local reference_epoch stuck_filter
  reference_epoch="$(date +%s)"
  stuck_filter='
    .items[]
    | select(.spec.nodeName != null)
    | select(.metadata.deletionTimestamp != null)
    | select(((.metadata.finalizers // []) | length) == 0)
    | (.metadata.deletionTimestamp | fromdateiso8601) as $deleted
    | select(($now - $deleted) >= $min_age)
    | select(([(.status.initContainerStatuses[]?, .status.containerStatuses[]?) | select(.state.running != null)] | length) > 0)
    | .spec.nodeName'
  kubectl get pods -A -o json \
    | jq -r --argjson now "${reference_epoch}" \
      --argjson min_age "${STALE_TERMINATING_POD_SECONDS}" \
      "${stuck_filter}" 2>/dev/null \
    | sort -u
}
# Wait for a node's Ready condition to report "True".
# Arguments: $1 - node name
#            $2 - timeout in seconds (polled every 5 seconds)
# Returns:   0 when Ready or in dry-run mode, 1 on timeout
wait_for_node_ready() {
  local node="$1"
  local timeout_seconds="$2"
  local started_at ready_status
  local ready_template='{range .status.conditions[?(@.type=="Ready")]}{.status}{end}'
  if [[ "${EXECUTE}" -eq 0 ]]; then
    log "DRY-RUN: wait for node ${node} Ready"
    return 0
  fi
  started_at="$(date +%s)"
  while :; do
    ready_status="$(kubectl get node "${node}" -o "jsonpath=${ready_template}" 2>/dev/null || true)"
    if [[ "${ready_status}" == "True" ]]; then
      log "node-ready=${node}"
      return 0
    fi
    if (( $(date +%s) - started_at >= timeout_seconds )); then
      warn "Timed out waiting for node ${node} to return Ready after runtime restart."
      return 1
    fi
    sleep 5
  done
}
# Wait until terminating_running_pods_for_node reports nothing for a node.
# Arguments: $1 - node name
#            $2 - timeout in seconds (polled every 5 seconds)
# Returns:   0 when cleared or in dry-run mode; 1 on timeout, after warning
#            with the list of pods that remain.
wait_for_terminating_running_pods_to_clear() {
  local node="$1"
  local timeout_seconds="$2"
  # `line` is declared here so the reporting loop does not leak it.
  local start now pods line
  if [[ "${EXECUTE}" -eq 0 ]]; then
    log "DRY-RUN: wait for stuck terminating running pods to clear on ${node}"
    return 0
  fi
  start="$(date +%s)"
  while true; do
    pods="$(terminating_running_pods_for_node "${node}")"
    if [[ -z "${pods}" ]]; then
      log "stuck-terminating-runtime-pods-cleared=${node}"
      return 0
    fi
    now="$(date +%s)"
    if (( now - start >= timeout_seconds )); then
      warn "Stuck terminating pods with running containers remain on ${node}:"
      while IFS= read -r line; do
        [[ -n "${line}" ]] || continue
        warn "${line}"
      done <<< "${pods}"
      return 1
    fi
    sleep 5
  done
}
2026-06-18 18:04:11 -03:00
# Run a command in the host namespaces of a node by exec'ing into the Running
# pod labelled app=k3s-agent-restart on that node and entering PID 1's
# namespaces with nsenter.
#
# Arguments: $1 - node name
#            $2 - command line to run on the host (sent base64-encoded)
# Globals:   NODE_HELPER_NAMESPACE, EXECUTE (read)
# Returns:   1 when no matching pod is found; 0 in dry-run; otherwise the
#            status of the `run kubectl exec` call
run_host_command_via_agent_restart_pod() {
  local node="$1"
  local host_command="$2"
  local agent_pod payload remote_script
  agent_pod="$(
    kubectl -n "${NODE_HELPER_NAMESPACE}" get pods \
      -l app=k3s-agent-restart \
      --field-selector "spec.nodeName=${node},status.phase=Running" \
      -o jsonpath='{.items[0].metadata.name}' 2>/dev/null || true
  )"
  [[ -n "${agent_pod}" ]] || return 1
  # Base64 keeps the command intact through the nested shell quoting below.
  payload="$(printf '%s' "${host_command}" | base64 -w0)"
  if [[ "${EXECUTE}" -eq 0 ]]; then
    log "DRY-RUN: k3s-agent-restart exec via ${agent_pod} on ${node}"
    return 0
  fi
  remote_script="HOST_COMMAND=\$(printf '%s' '${payload}' | base64 -d); nsenter --target 1 --mount --uts --ipc --net --pid /bin/sh -ceu \"\${HOST_COMMAND}\""
  run kubectl -n "${NODE_HELPER_NAMESPACE}" exec "${agent_pod}" -- /bin/sh -ceu "${remote_script}"
}
2026-06-18 18:02:32 -03:00
# Schedule a delayed `systemctl restart` of a host service through a transient
# systemd-run unit named with the current epoch second.
#
# Transports are tried in order and the first success wins: the
# k3s-agent-restart pod, the prewarm pod, then a dedicated helper pod.
#
# Arguments: $1 - node name
#            $2 - systemd service name
#            $3 - delay in seconds before the restart fires
# Returns:   0 when a transport accepted the command, otherwise the status of
#            the helper-pod attempt
schedule_host_service_restart_via_helper() {
  local node="$1"
  local service_name="$2"
  local delay_seconds="$3"
  local unit_name host_command
  unit_name="ananke-restart-${service_name}-$(date +%s)"
  host_command="/usr/bin/systemd-run --unit ${unit_name} --on-active=${delay_seconds}s /bin/sh -lc '/usr/bin/systemctl restart ${service_name} || /bin/systemctl restart ${service_name}'"
  run_host_command_via_agent_restart_pod "${node}" "${host_command}" \
    || run_host_command_via_prewarm_pod "${node}" "${host_command}" \
    || run_host_command_via_helper "${node}" "restart-${node}-${service_name}" 120 "${host_command}"
}
# Cordon worker nodes that host stale terminating pods with running
# containers, schedule a k3s-agent restart on each, then wait for the node and
# its pods to recover.
#
# A node is acted on only when it is not denylisted, is Ready, carries the
# worker role label and has no control-plane label. At most
# RECOVERY_NODE_RUNTIME_RESTART_MAX_NODES nodes are attempted per call (1 when
# that value is not a non-negative integer).
#
# Only nodes whose restart was actually scheduled are waited on; a failed
# scheduling attempt still counts toward the limit but no longer triggers the
# waits. The settle sleep is skipped in dry-run, where both waits return
# immediately anyway.
#
# Globals: RECOVERY_NODE_RUNTIME_RESTART_ENABLED, _MAX_NODES, _DENYLIST,
#          _WAIT_SECONDS, EXECUTE (read)
# Returns: 0 (wait failures are tolerated)
recover_stuck_terminating_node_runtime_pods_after_deadlock() {
  require_cmd jq
  if [[ "${RECOVERY_NODE_RUNTIME_RESTART_ENABLED}" != "1" && "${RECOVERY_NODE_RUNTIME_RESTART_ENABLED}" != "true" ]]; then
    warn "Skipping node runtime cleanup because RECOVERY_NODE_RUNTIME_RESTART_ENABLED=${RECOVERY_NODE_RUNTIME_RESTART_ENABLED}."
    return 0
  fi
  local nodes node ready worker control_plane restarted max_nodes
  local -a restarted_nodes=()
  nodes="$(stuck_terminating_runtime_cleanup_nodes || true)"
  if [[ -z "${nodes}" ]]; then
    log "node-runtime-cleanup=not-needed"
    return 0
  fi
  max_nodes="${RECOVERY_NODE_RUNTIME_RESTART_MAX_NODES}"
  [[ "${max_nodes}" =~ ^[0-9]+$ ]] || max_nodes=1
  restarted=0
  while IFS= read -r node; do
    [[ -n "${node}" ]] || continue
    if (( restarted >= max_nodes )); then
      warn "Node runtime cleanup limit reached (${max_nodes}); leaving remaining stuck nodes for a later Ananke pass."
      break
    fi
    if csv_has_value "${RECOVERY_NODE_RUNTIME_RESTART_DENYLIST}" "${node}"; then
      warn "Skipping node runtime cleanup on denylisted node ${node}."
      continue
    fi
    ready="$(kubectl get node "${node}" -o jsonpath='{range .status.conditions[?(@.type=="Ready")]}{.status}{end}' 2>/dev/null || true)"
    worker="$(kubectl get node "${node}" -o jsonpath='{.metadata.labels.node-role\.kubernetes\.io/worker}' 2>/dev/null || true)"
    control_plane="$(kubectl get node "${node}" -o jsonpath='{.metadata.labels.node-role\.kubernetes\.io/control-plane}' 2>/dev/null || true)"
    if [[ "${ready}" != "True" || "${worker}" != "true" || -n "${control_plane}" ]]; then
      warn "Skipping node runtime cleanup on ${node}; ready=${ready:-unknown} worker=${worker:-false} control_plane=${control_plane:-false}."
      continue
    fi
    warn "Cordoning ${node} and restarting only k3s-agent to clear stale terminating pods. Longhorn data-plane objects are not modified."
    run kubectl cordon "${node}"
    # Every attempt counts toward the limit, successful or not.
    restarted=$((restarted + 1))
    if schedule_host_service_restart_via_helper "${node}" k3s-agent 5; then
      restarted_nodes+=("${node}")
    else
      warn "Failed to schedule k3s-agent restart on ${node}."
    fi
  done <<< "${nodes}"
  if (( restarted == 0 )); then
    log "node-runtime-cleanup=no-eligible-nodes"
    return 0
  fi
  if (( ${#restarted_nodes[@]} > 0 )); then
    # Let the scheduled restarts fire (5s delay) before polling.
    if [[ "${EXECUTE}" -ne 0 ]]; then
      sleep 15
    fi
    for node in "${restarted_nodes[@]}"; do
      wait_for_node_ready "${node}" "${RECOVERY_NODE_RUNTIME_RESTART_WAIT_SECONDS}" || true
      wait_for_terminating_running_pods_to_clear "${node}" "${RECOVERY_NODE_RUNTIME_RESTART_WAIT_SECONDS}" || true
    done
  fi
  mark_checkpoint longhorn_unlock_node_runtime_cleanup
}
# Poll every 5 seconds until an Endpoints object in longhorn-system lists at
# least one address.
#
# Arguments: $1 - endpoints object name
#            $2 - timeout in seconds
# Globals:   EXECUTE (read); when 0 only a DRY-RUN line is logged
# Returns:   0 once an address is present (or in dry-run), 1 on timeout
wait_for_longhorn_endpoint() {
  local endpoint="$1"
  local timeout_seconds="$2"
  local started_at
  if [[ "${EXECUTE}" -eq 0 ]]; then
    log "DRY-RUN: wait for Longhorn endpoint ${endpoint}"
    return 0
  fi
  started_at="$(date +%s)"
  until [[ -n "$(kubectl -n longhorn-system get endpoints "${endpoint}" -o jsonpath='{.subsets[*].addresses[*].ip}' 2>/dev/null || true)" ]]; do
    if (( $(date +%s) - started_at >= timeout_seconds )); then
      warn "Timed out waiting for Longhorn endpoint ${endpoint}."
      return 1
    fi
    sleep 5
  done
  log "longhorn-endpoint-${endpoint}=ready"
  return 0
}
# Wait up to 180 seconds each for the four Longhorn control endpoints.
# Every endpoint is checked even after an earlier one has failed.
#
# Returns: 0 when all became ready, 1 when any wait failed
wait_for_longhorn_control_endpoints() {
  local status=0
  local endpoint
  for endpoint in \
    longhorn-admission-webhook \
    longhorn-conversion-webhook \
    longhorn-backend \
    longhorn-recovery-backend; do
    if ! wait_for_longhorn_endpoint "${endpoint}" 180; then
      status=1
    fi
  done
  return "${status}"
}
# Print a human-readable Longhorn status report: the manager DaemonSet, the
# manager pods, the instance managers, and a count of volumes grouped by
# state/robustness. Each query is best-effort; failures are ignored.
report_longhorn_unlock_status() {
  local ds_columns pod_columns im_columns volume_summary_filter
  ds_columns='custom-columns=NAME:.metadata.name,DESIRED:.status.desiredNumberScheduled,CURRENT:.status.currentNumberScheduled,READY:.status.numberReady,UPDATED:.status.updatedNumberScheduled,AVAILABLE:.status.numberAvailable'
  pod_columns='custom-columns=NAME:.metadata.name,READY:.status.containerStatuses[*].ready,STATUS:.status.phase,WAIT:.status.containerStatuses[*].state.waiting.reason,NODE:.spec.nodeName'
  im_columns='custom-columns=NAME:.metadata.name,STATE:.status.currentState,NODE:.spec.nodeID,IMAGE:.spec.image,TYPE:.spec.type'
  volume_summary_filter='.items | group_by(.status.state + "/" + (.status.robustness // "none"))[] | [(.[0].status.state + "/" + (.[0].status.robustness // "none")), length] | @tsv'

  log "Longhorn manager DaemonSet:"
  kubectl -n longhorn-system get daemonset longhorn-manager -o "${ds_columns}" || true

  log "Longhorn manager pods:"
  kubectl -n longhorn-system get pods -l app=longhorn-manager \
    -o "${pod_columns}" --sort-by=.spec.nodeName || true

  log "Longhorn instance managers:"
  kubectl -n longhorn-system get instancemanagers.longhorn.io \
    -o "${im_columns}" --sort-by=.spec.nodeID || true

  log "Longhorn volume summary:"
  kubectl -n longhorn-system get volumes.longhorn.io -o json \
    | jq -r "${volume_summary_filter}" 2>/dev/null \
    | sort || true
}
2026-04-07 12:30:28 -03:00
# Report whether a namespace is excluded from shutdown handling.
#
# An empty or unset pattern excludes nothing. Without the guard, bash treats
# an empty regex as matching every string, which would exclude all namespaces.
#
# Arguments: $1 - namespace name
# Globals:   SHUTDOWN_NAMESPACE_EXCLUDES_REGEX (read)
# Returns:   0 when the namespace matches the exclude regex, 1 otherwise
shutdown_namespace_excluded() {
  local ns="$1"
  local pattern="${SHUTDOWN_NAMESPACE_EXCLUDES_REGEX:-}"
  [[ -n "${pattern}" ]] || return 1
  [[ "${ns}" =~ ${pattern} ]]
}
# Report whether a namespace is excluded from startup workload checks.
#
# An empty or unset pattern excludes nothing. Without the guard, bash treats
# an empty regex as matching every string, which would exclude all namespaces.
#
# Arguments: $1 - namespace name
# Globals:   STARTUP_WORKLOAD_NAMESPACE_EXCLUDES_REGEX (read)
# Returns:   0 when the namespace matches the exclude regex, 1 otherwise
startup_workload_namespace_excluded() {
  local ns="$1"
  local pattern="${STARTUP_WORKLOAD_NAMESPACE_EXCLUDES_REGEX:-}"
  [[ -n "${pattern}" ]] || return 1
  [[ "${ns}" =~ ${pattern} ]]
}
2026-04-06 00:22:54 -03:00
# Scale every Deployment and StatefulSet to zero replicas in each namespace
# that shutdown_namespace_excluded does not exclude. The scale commands go
# through run_shell with `|| true`, so a failed scale does not abort the pass.
best_effort_scale_down_apps() {
  local namespaces namespace kind
  namespaces="$(kubectl get ns -o jsonpath='{range .items[*]}{.metadata.name}{"\n"}{end}')"
  while IFS= read -r namespace; do
    if [[ -z "${namespace}" ]] || shutdown_namespace_excluded "${namespace}"; then
      continue
    fi
    for kind in deployment statefulset; do
      run_shell "kubectl -n ${namespace} scale ${kind} --all --replicas=0 || true"
    done
  done <<< "${namespaces}"
}
2026-04-07 12:30:28 -03:00
# Record the replica count of every Deployment and StatefulSet with more than
# zero replicas, outside shutdown-excluded namespaces, in
# REPLICA_SNAPSHOT_FILE as tab-separated
# `namespace<TAB>kind<TAB>name<TAB>replicas` rows. The file is truncated
# before writing.
#
# Globals: EXECUTE, REPLICA_SNAPSHOT_FILE (read)
save_workload_replica_snapshot() {
  local rows ns kind name replicas
  if [[ "${EXECUTE}" -eq 0 ]]; then
    log "DRY-RUN: save workload replica snapshot to ${REPLICA_SNAPSHOT_FILE}"
    return 0
  fi
  # A failed listing contributes no rows instead of aborting the snapshot.
  rows="$(
    {
      kubectl get deployment -A -o jsonpath='{range .items[*]}{.metadata.namespace}{"\tdeployment\t"}{.metadata.name}{"\t"}{.spec.replicas}{"\n"}{end}' 2>/dev/null || true
      kubectl get statefulset -A -o jsonpath='{range .items[*]}{.metadata.namespace}{"\tstatefulset\t"}{.metadata.name}{"\t"}{.spec.replicas}{"\n"}{end}' 2>/dev/null || true
    } | sed '/^[[:space:]]*$/d'
  )"
  mkdir -p "$(dirname "${REPLICA_SNAPSHOT_FILE}")"
  : > "${REPLICA_SNAPSHOT_FILE}"
  while IFS=$'\t' read -r ns kind name replicas; do
    if [[ -z "${ns}" || -z "${kind}" || -z "${name}" || -z "${replicas}" ]]; then
      continue
    fi
    if shutdown_namespace_excluded "${ns}"; then
      continue
    fi
    if [[ ! "${replicas}" =~ ^[0-9]+$ ]] || ! (( replicas > 0 )); then
      continue
    fi
    printf '%s\t%s\t%s\t%s\n' "${ns}" "${kind}" "${name}" "${replicas}" >> "${REPLICA_SNAPSHOT_FILE}"
  done <<< "${rows}"
  log "replica-snapshot-file=${REPLICA_SNAPSHOT_FILE}"
  log "replica-snapshot-count=$(replica_snapshot_count)"
}
# Print the number of lines in REPLICA_SNAPSHOT_FILE, or 0 when the file does
# not exist.
#
# Globals: REPLICA_SNAPSHOT_FILE (read)
replica_snapshot_count() {
  if [[ ! -f "${REPLICA_SNAPSHOT_FILE}" ]]; then
    printf '0'
    return
  fi
  # Some wc implementations pad the count with spaces; strip them.
  wc -l < "${REPLICA_SNAPSHOT_FILE}" | tr -d ' '
}
# Scale workloads back to the replica counts recorded in
# REPLICA_SNAPSHOT_FILE.
#
# Runs only when RECOVERY_PENDING is 1 and the snapshot file exists. Rows with
# missing fields, non-numeric or zero replica counts, workloads that cannot be
# read, and workloads already at the desired count are skipped.
#
# Globals: RECOVERY_PENDING, REPLICA_SNAPSHOT_FILE (read)
restore_workload_replica_snapshot() {
  local ns kind name desired current
  if [[ "${RECOVERY_PENDING}" -ne 1 ]]; then
    log "Skipping replica restore because recovery_pending=0."
    return 0
  fi
  if [[ ! -f "${REPLICA_SNAPSHOT_FILE}" ]]; then
    warn "Replica snapshot file not found at ${REPLICA_SNAPSHOT_FILE}; skipping replica restore."
    return 0
  fi
  while IFS=$'\t' read -r ns kind name desired; do
    if [[ -z "${ns}" || -z "${kind}" || -z "${name}" || -z "${desired}" ]]; then
      continue
    fi
    if [[ ! "${desired}" =~ ^[0-9]+$ ]]; then
      continue
    fi
    (( desired > 0 )) || continue
    current="$(kubectl -n "${ns}" get "${kind}" "${name}" -o jsonpath='{.spec.replicas}' 2>/dev/null || true)"
    # An empty answer means the workload could not be read; leave it alone.
    [[ -n "${current}" ]] || continue
    [[ "${current}" =~ ^[0-9]+$ ]] || current=0
    (( current == desired )) && continue
    run kubectl -n "${ns}" scale "${kind}" "${name}" --replicas="${desired}"
  done < "${REPLICA_SNAPSHOT_FILE}"
  mark_checkpoint startup_replicas_restored
}
2026-04-09 01:41:02 -03:00
# Scale to one replica every Deployment and StatefulSet that has zero replicas
# and carries a meta.helm.sh/release-name annotation.
#
# Workloads in startup-excluded namespaces, or whose `namespace/name` matches
# a non-empty STARTUP_IGNORE_WORKLOADS_REGEX, are skipped.
#
# Globals: STARTUP_IGNORE_WORKLOADS_REGEX (read)
restore_zero_scaled_helm_workloads() {
  local rows ns kind name
  local restored=0
  local columns='custom-columns=NS:.metadata.namespace,NAME:.metadata.name,REPLICAS:.spec.replicas,HELM:.metadata.annotations.meta\.helm\.sh/release-name'
  rows="$(
    for kind in deployment statefulset; do
      kubectl get "${kind}" -A -o "${columns}" --no-headers 2>/dev/null \
        | awk -v kind="${kind}" '$3 ~ /^[0-9]+$/ && $3 == 0 && $4 != "<none>" {printf "%s\t%s\t%s\n", $1, kind, $2}'
    done | sed '/^[[:space:]]*$/d'
  )"
  while IFS=$'\t' read -r ns kind name; do
    if [[ -z "${ns}" || -z "${kind}" || -z "${name}" ]]; then
      continue
    fi
    if startup_workload_namespace_excluded "${ns}"; then
      continue
    fi
    if [[ -n "${STARTUP_IGNORE_WORKLOADS_REGEX}" && "${ns}/${name}" =~ ${STARTUP_IGNORE_WORKLOADS_REGEX} ]]; then
      continue
    fi
    warn "Auto-heal: restoring zero-scaled Helm workload ${ns}/${kind}/${name} to replicas=1."
    run kubectl -n "${ns}" scale "${kind}" "${name}" --replicas=1
    restored=$((restored + 1))
  done <<< "${rows}"
  if (( restored == 0 )); then
    log "Auto-heal: no zero-scaled Helm workloads detected."
    return
  fi
  log "Auto-heal: restored ${restored} zero-scaled Helm workloads."
  mark_checkpoint startup_zero_scaled_helm_restored
}
2026-04-07 12:30:28 -03:00
# Print one line per Deployment or StatefulSet that wants replicas but has
# fewer ready (or, for Deployments, fewer available) than desired.
#
# Workloads in startup-excluded namespaces, or whose `namespace/name` matches
# a non-empty STARTUP_IGNORE_WORKLOADS_REGEX, are skipped. Non-numeric counts
# (such as `<none>`) are treated as 0.
#
# Globals: STARTUP_IGNORE_WORKLOADS_REGEX (read)
# Outputs: `ns/deployment/name|ready=R available=A desired=D` or
#          `ns/statefulset/name|ready=R desired=D`
list_unhealthy_workloads() {
  local rows ns name desired ready available _
  rows="$(kubectl get deployment -A -o custom-columns=NS:.metadata.namespace,NAME:.metadata.name,DESIRED:.spec.replicas,READY:.status.readyReplicas,AVAILABLE:.status.availableReplicas --no-headers 2>/dev/null || true)"
  # `read` splits each row on whitespace; `_` absorbs any surplus columns.
  while IFS=$' \t\n' read -r ns name desired ready available _; do
    [[ -n "${ns}" ]] || continue
    startup_workload_namespace_excluded "${ns}" && continue
    [[ -n "${STARTUP_IGNORE_WORKLOADS_REGEX}" && "${ns}/${name}" =~ ${STARTUP_IGNORE_WORKLOADS_REGEX} ]] && continue
    [[ "${desired}" =~ ^[0-9]+$ ]] || desired=0
    [[ "${ready}" =~ ^[0-9]+$ ]] || ready=0
    [[ "${available}" =~ ^[0-9]+$ ]] || available=0
    (( desired > 0 )) || continue
    if (( ready < desired || available < desired )); then
      printf '%s/deployment/%s|ready=%s available=%s desired=%s\n' "${ns}" "${name}" "${ready}" "${available}" "${desired}"
    fi
  done <<< "${rows}"
  rows="$(kubectl get statefulset -A -o custom-columns=NS:.metadata.namespace,NAME:.metadata.name,DESIRED:.spec.replicas,READY:.status.readyReplicas --no-headers 2>/dev/null || true)"
  while IFS=$' \t\n' read -r ns name desired ready _; do
    [[ -n "${ns}" ]] || continue
    startup_workload_namespace_excluded "${ns}" && continue
    [[ -n "${STARTUP_IGNORE_WORKLOADS_REGEX}" && "${ns}/${name}" =~ ${STARTUP_IGNORE_WORKLOADS_REGEX} ]] && continue
    [[ "${desired}" =~ ^[0-9]+$ ]] || desired=0
    [[ "${ready}" =~ ^[0-9]+$ ]] || ready=0
    (( desired > 0 )) || continue
    if (( ready < desired )); then
      printf '%s/statefulset/%s|ready=%s desired=%s\n' "${ns}" "${name}" "${ready}" "${desired}"
    fi
  done <<< "${rows}"
}
# Poll list_unhealthy_workloads until it prints nothing, warning with the
# current list on every pass.
#
# Globals: EXECUTE, STARTUP_WORKLOAD_TIMEOUT_SECONDS,
#          STARTUP_WORKLOAD_POLL_SECONDS (read)
# Returns: 0 when all workloads are ready (or in dry-run); calls die once the
#          timeout has elapsed
wait_for_startup_workloads_ready() {
  if [[ "${EXECUTE}" -eq 0 ]]; then
    log "DRY-RUN: skipping startup workload readiness checks"
    return 0
  fi
  # `line` is local so the report loop cannot clobber a same-named variable
  # belonging to a caller.
  local start now unhealthy line
  start="$(date +%s)"
  while true; do
    unhealthy="$(list_unhealthy_workloads || true)"
    if [[ -z "${unhealthy}" ]]; then
      log "startup-workloads=all-ready"
      return 0
    fi
    warn "startup-workloads-not-ready:"
    while IFS= read -r line; do
      [[ -n "${line}" ]] || continue
      warn "  ${line}"
    done <<< "${unhealthy}"
    now="$(date +%s)"
    if (( now - start >= STARTUP_WORKLOAD_TIMEOUT_SECONDS )); then
      die "Timed out waiting for startup workloads Ready after ${STARTUP_WORKLOAD_TIMEOUT_SECONDS}s."
    fi
    sleep "${STARTUP_WORKLOAD_POLL_SECONDS}"
  done
}
2026-04-06 04:47:05 -03:00
# Print a comma-separated list of Ready nodes that carry neither the
# control-plane nor the master role label.
discover_workers_csv() {
  local columns='custom-columns=NAME:.metadata.name,CP:.metadata.labels.node-role\.kubernetes\.io/control-plane,MASTER:.metadata.labels.node-role\.kubernetes\.io/master,READY:.status.conditions[?(@.type=="Ready")].status'
  kubectl get nodes -o "${columns}" --no-headers \
    | awk '$2=="<none>" && $3=="<none>" && $4=="True" {print $1}' \
    | paste -sd, -
}
2026-04-06 21:27:23 -03:00
# Report whether a node's Ready condition is "True".
#
# Arguments: $1 - node name (an empty name is never ready)
# Returns:   0 when Ready; 1 otherwise, including when the lookup fails
node_is_ready() {
  local node="$1"
  local ready_status
  if [[ -z "${node}" ]]; then
    return 1
  fi
  ready_status="$(kubectl get node "${node}" -o jsonpath='{range .status.conditions[?(@.type=="Ready")]}{.status}{end}' 2>/dev/null || true)"
  [[ "${ready_status}" == "True" ]]
}
# Print the name of a Ready arm64 node that carries the worker role label.
# Nodes labelled hardware=rpi5 are preferred, then hardware=rpi4, then any
# hardware. Within a tier the first row returned by kubectl wins.
#
# Outputs: node name on stdout, without a trailing newline
# Returns: 0 when a node was found, 1 otherwise
select_ready_arm64_worker() {
  local rows hardware candidate
  rows="$(kubectl get nodes -o 'custom-columns=NAME:.metadata.name,ARCH:.metadata.labels.kubernetes\.io/arch,WORKER:.metadata.labels.node-role\.kubernetes\.io/worker,HARDWARE:.metadata.labels.hardware,READY:.status.conditions[?(@.type=="Ready")].status' --no-headers 2>/dev/null || true)"
  [[ -n "${rows}" ]] || return 1
  # An empty preference means "any hardware".
  for hardware in rpi5 rpi4 ''; do
    candidate="$(awk -v hw="${hardware}" '$2=="arm64" && $3=="true" && (hw=="" || $4==hw) && $5=="True" {print $1; exit}' <<< "${rows}")"
    if [[ -n "${candidate}" ]]; then
      printf '%s' "${candidate}"
      return 0
    fi
  done
  return 1
}
2026-06-18 18:02:32 -03:00
# Print the first kubernetes.io/hostname value, in sort order, found in any
# nodeSelector under the harbor HelmRelease's spec.values. Prints nothing
# when no such pin exists.
discover_harbor_pinned_node() {
  local hostname_pins_path='{range .spec.values..nodeSelector}{.kubernetes\.io/hostname}{"\n"}{end}'
  kubectl -n harbor get helmrelease harbor -o jsonpath="${hostname_pins_path}" 2>/dev/null \
    | sed '/^[[:space:]]*$/d' \
    | sort -u \
    | head -n 1
}
2026-04-06 21:27:23 -03:00
# Make sure HARBOR_TARGET_NODE names a Ready node.
#
# The configured node is kept when Ready. Otherwise the node pinned in the
# live Harbor HelmRelease is used if Ready, then any Ready arm64 worker. Dies
# when none of these is available.
#
# Globals: HARBOR_TARGET_NODE (read, possibly rewritten)
ensure_harbor_target_node() {
  local fallback pinned
  node_is_ready "${HARBOR_TARGET_NODE}" && return 0

  pinned="$(discover_harbor_pinned_node || true)"
  if node_is_ready "${pinned}"; then
    if [[ -z "${HARBOR_TARGET_NODE}" ]]; then
      log "harbor-target-node discovered from live HelmRelease: ${pinned}"
    else
      warn "Configured harbor target node '${HARBOR_TARGET_NODE}' is not Ready; using live Harbor pin '${pinned}' instead."
    fi
    HARBOR_TARGET_NODE="${pinned}"
    return 0
  fi

  fallback="$(select_ready_arm64_worker || true)"
  if [[ -z "${fallback}" ]]; then
    die "No Ready arm64 worker available for Harbor bootstrap target."
  fi
  if [[ -z "${HARBOR_TARGET_NODE}" ]]; then
    log "harbor-target-node auto-selected: ${fallback}"
  else
    warn "Configured harbor target node '${HARBOR_TARGET_NODE}' is not Ready; using '${fallback}' instead."
  fi
  HARBOR_TARGET_NODE="${fallback}"
}
2026-04-06 21:32:43 -03:00
# Make HARBOR_TARGET_NODE the only node labelled `${HARBOR_HOST_LABEL_KEY}=true`:
# the label is removed from every other node that has it, then set (with
# --overwrite) on the target. Dies when no target node is set.
#
# Globals: HARBOR_TARGET_NODE, HARBOR_HOST_LABEL_KEY (read)
ensure_harbor_host_label() {
  local labeled_nodes other
  if [[ -z "${HARBOR_TARGET_NODE}" ]]; then
    die "Harbor target node is not set."
  fi
  labeled_nodes="$(kubectl get nodes -l "${HARBOR_HOST_LABEL_KEY}=true" -o jsonpath='{range .items[*]}{.metadata.name}{"\n"}{end}' 2>/dev/null || true)"
  while IFS= read -r other; do
    if [[ -z "${other}" || "${other}" == "${HARBOR_TARGET_NODE}" ]]; then
      continue
    fi
    run kubectl label node "${other}" "${HARBOR_HOST_LABEL_KEY}-"
  done <<< "${labeled_nodes}"
  run kubectl label node "${HARBOR_TARGET_NODE}" "${HARBOR_HOST_LABEL_KEY}=true" --overwrite
}
2026-04-06 04:47:05 -03:00
# Split a comma-separated string into the array variable named by the caller.
#
# The scratch array is local (no global `_tmp` is left behind), the output
# name must be a valid shell identifier before it reaches eval, and an empty
# input yields an empty array even under `set -u` on older bash.
#
# Arguments: $1 - comma-separated values (only the first line is read)
#            $2 - name of the array variable to assign
# Returns:   0 on success, 1 when $2 is not a valid variable name
as_array_from_csv() {
  local csv="$1"
  local out_var="$2"
  local -a __csv_items=()
  if [[ ! "${out_var}" =~ ^[A-Za-z_][A-Za-z0-9_]*$ ]]; then
    printf 'as_array_from_csv: invalid output variable name: %s\n' "${out_var}" >&2
    return 1
  fi
  # The IFS prefix applies to this `read` only; the global IFS is untouched.
  IFS=',' read -r -a __csv_items <<< "${csv}"
  eval "${out_var}"'=( ${__csv_items[@]+"${__csv_items[@]}"} )'
}
2026-04-06 00:22:54 -03:00
# Cordon and drain each given worker, escalating on failure: a plain drain,
# then --force, then --force --disable-eviction. The last attempt is run with
# `|| true` so its failure is ignored.
#
# Arguments: $1  - per-attempt drain timeout in seconds
#            $2+ - node names (empty entries are skipped)
best_effort_drain_workers() {
  local timeout_seconds="$1"
  shift || true
  local workers=("$@")
  local node drain_cmd
  for node in "${workers[@]}"; do
    [[ -z "${node}" ]] && continue
    run kubectl cordon "${node}"
    drain_cmd="kubectl drain ${node} --ignore-daemonsets --delete-emptydir-data --grace-period=30 --timeout=${timeout_seconds}s"
    run_shell "${drain_cmd}" && continue
    warn "Gentle drain timed out for ${node}; retrying with --force."
    run_shell "${drain_cmd} --force" && continue
    warn "Force drain timed out for ${node}; final attempt with --disable-eviction."
    run_shell "${drain_cmd} --force --disable-eviction || true"
  done
}
2026-04-06 04:47:05 -03:00
# Block on `kubectl rollout status` for one workload.
#
# Arguments: $1 - namespace
#            $2 - resource kind (e.g. deployment)
#            $3 - resource name
#            $4 - timeout in kubectl duration syntax (e.g. 5m)
# Globals:   EXECUTE (read); when 0 the command is only logged
# Returns:   0 in dry-run, otherwise kubectl's exit status
wait_for_rollout() {
  local namespace="$1"
  local kind="$2"
  local name="$3"
  local timeout="$4"
  local target="${kind}/${name}"
  if [[ "${EXECUTE}" -eq 0 ]]; then
    log "DRY-RUN: kubectl -n ${namespace} rollout status ${target} --timeout=${timeout}"
    return 0
  fi
  kubectl -n "${namespace}" rollout status "${target}" --timeout="${timeout}"
}
2026-04-06 04:47:05 -03:00
# Check the ingress layer: the `traefik` IngressClass must exist and the
# traefik Deployment in namespace traefik must finish rolling out within 5m.
check_ingress_stack() {
  # Only the exit status matters; the listing itself is discarded.
  kubectl get ingressclass traefik 1>/dev/null
  wait_for_rollout traefik deployment traefik 5m
}
# Wait up to 10m each for the longhorn-manager DaemonSet and the longhorn-ui
# Deployment in longhorn-system to finish rolling out.
check_longhorn_stack() {
  local -r namespace=longhorn-system
  wait_for_rollout "${namespace}" daemonset longhorn-manager 10m
  wait_for_rollout "${namespace}" deployment longhorn-ui 10m
}
# Wait up to 10m for the vault StatefulSet rollout, then, when executing for
# real, run `vault status` inside vault-0 against its local listener.
#
# Globals: EXECUTE (read)
check_vault_stack() {
  wait_for_rollout vault statefulset vault 10m
  [[ "${EXECUTE}" -eq 1 ]] || return 0
  kubectl -n vault exec vault-0 -- sh -ceu 'VAULT_ADDR=http://127.0.0.1:8200 vault status >/dev/null'
}
# Wait up to 10m for the postgres StatefulSet rollout, then, when executing
# for real, run pg_isready inside the postgres container of postgres-0.
#
# Globals: EXECUTE (read)
check_postgres_stack() {
  wait_for_rollout postgres statefulset postgres 10m
  [[ "${EXECUTE}" -eq 1 ]] || return 0
  kubectl -n postgres exec postgres-0 -c postgres -- sh -ceu 'pg_isready -h 127.0.0.1 -p 5432 >/dev/null'
}
# Wait up to 10m for the gitea Deployment in namespace gitea to roll out.
check_gitea_stack() {
  local -r workload=gitea
  wait_for_rollout "${workload}" deployment "${workload}" 10m
}
# Wait up to 10m each for Harbor's workloads to roll out: the harbor-redis
# StatefulSet first, then the core, jobservice, portal and registry
# Deployments.
check_harbor_stack() {
  local component
  wait_for_rollout harbor statefulset harbor-redis 10m
  for component in core jobservice portal registry; do
    wait_for_rollout harbor deployment "harbor-${component}" 10m
  done
}
2026-06-18 18:02:32 -03:00
# Decide whether a captured HTTP response looks like a registry API answer.
#
# Accepted: status 200 or 401 whose Content-Type is not text/html and which
# either carries a Docker-Distribution-Api-Version header or, for 401, has a
# body mentioning "unauthorized" or "authentication required".
#
# The Content-Type header is matched case-insensitively with tolower(), not
# the gawk-only IGNORECASE variable, so `Content-Type: text/html` is also
# caught when awk is mawk or busybox awk.
#
# Arguments: $1 - HTTP status code
#            $2 - file holding the response headers
#            $3 - file holding the response body
# Returns:   0 when the response looks like the registry API, 1 otherwise
harbor_registry_response_valid() {
  local code="$1"
  local headers_file="$2"
  local body_file="$3"
  local content_type
  case "${code}" in
    200 | 401) ;;
    *) return 1 ;;
  esac
  content_type="$(awk '{ line = tolower($0) } line ~ /^content-type:/ { print line; exit }' "${headers_file}" 2>/dev/null || true)"
  if [[ "${content_type}" == *"text/html"* ]]; then
    return 1
  fi
  if grep -Eiq '^docker-distribution-api-version:' "${headers_file}" 2>/dev/null; then
    return 0
  fi
  if [[ "${code}" == "401" ]] && grep -Eiq 'unauthorized|authentication required' "${body_file}" 2>/dev/null; then
    return 0
  fi
  return 1
}
# Probe https://registry.bstein.dev/v2/ and decide whether it answers like a
# registry API, using harbor_registry_response_valid.
#
# curl's exit status is captured outside the command substitution. Assigning
# `rc` inside `$(...)` only sets it in the subshell, which left the parent's
# `rc` at 0 and let transport failures go unnoticed.
#
# Arguments: $1 - "1" to suppress log/warn output (default 0)
# Globals:   EXECUTE, STARTUP_SERVICE_CHECK_TIMEOUT_SECONDS (read)
# Returns:   0 when the endpoint serves the registry API (or in dry-run),
#            1 otherwise
harbor_endpoint_is_ready() {
  local quiet="${1:-0}"
  if [[ "${EXECUTE}" -eq 0 ]]; then
    log "DRY-RUN: curl -ksS -o /dev/null -w '%{http_code}' https://registry.bstein.dev/v2/"
    return 0
  fi
  local headers_file body_file code rc content_type
  headers_file="$(mktemp)"
  body_file="$(mktemp)"
  rc=0
  code="$(curl -ksS --max-time "${STARTUP_SERVICE_CHECK_TIMEOUT_SECONDS}" -D "${headers_file}" -o "${body_file}" -w '%{http_code}' https://registry.bstein.dev/v2/)" || rc=$?
  # tolower() instead of the gawk-only IGNORECASE keeps the match
  # case-insensitive under any awk. The value is only used in the warning.
  content_type="$(awk '{ line = tolower($0) } line ~ /^content-type:/ { print line; exit }' "${headers_file}" 2>/dev/null || true)"
  if (( rc == 0 )) && harbor_registry_response_valid "${code}" "${headers_file}" "${body_file}"; then
    [[ "${quiet}" == "1" ]] || log "harbor-endpoint=http-${code} registry-api=true"
    rm -f "${headers_file}" "${body_file}"
    return 0
  fi
  [[ "${quiet}" == "1" ]] || warn "Harbor registry API check failed: http=${code:-unknown} content-type=${content_type:-unknown} rc=${rc}"
  rm -f "${headers_file}" "${body_file}"
  return 1
}
# Die unless the Harbor endpoint currently serves the registry API. The probe
# runs in non-quiet mode, so its own log/warn output is shown as well.
check_harbor_endpoint() {
  harbor_endpoint_is_ready 0 || die "Harbor endpoint is not serving the registry API."
}
# Poll a pod every 2 seconds until it reaches the expected phase.
#
# Arguments: $1 - namespace
#            $2 - pod name
#            $3 - phase to wait for (e.g. Succeeded)
#            $4 - timeout in seconds
# Returns:   0 when the expected phase is observed; 1 when the pod reports
#            Failed (and Failed was not the expected phase) or on timeout
wait_for_pod_phase() {
  local namespace="$1"
  local pod="$2"
  local expected_phase="$3"
  local timeout_seconds="$4"
  local started_at phase
  started_at="$(date +%s)"
  while true; do
    phase="$(kubectl -n "${namespace}" get pod "${pod}" -o jsonpath='{.status.phase}' 2>/dev/null || true)"
    # The expected phase is tested first, so waiting for "Failed" succeeds.
    case "${phase}" in
      "${expected_phase}") return 0 ;;
      Failed) return 1 ;;
    esac
    if (( $(date +%s) - started_at >= timeout_seconds )); then
      return 1
    fi
    sleep 2
  done
}
2026-04-06 04:47:05 -03:00
# Report whether Harbor looks usable: its four Deployments (core, jobservice,
# portal, registry) must exist and the registry endpoint must pass the quiet
# readiness probe.
harbor_is_ready() {
  if ! kubectl -n harbor get deploy harbor-core harbor-jobservice harbor-portal harbor-registry >/dev/null 2>&1; then
    return 1
  fi
  harbor_endpoint_is_ready 1
}
# Prove that a node can pull from Harbor by running a one-shot pod that uses
# HARBOR_CANARY_IMAGE with imagePullPolicy Always.
#
# When HARBOR_CANARY_NODE is not Ready, the Harbor target node is resolved and
# used instead, and HARBOR_CANARY_NODE is updated. Any previous canary pod is
# deleted first; the pod is deleted again afterwards whatever the outcome.
#
# Globals: HARBOR_CANARY_NODE (read, possibly rewritten), HARBOR_TARGET_NODE,
#          HARBOR_CANARY_IMAGE, NODE_HELPER_NAMESPACE, REGISTRY_PULL_SECRET,
#          EXECUTE (read)
# Returns: 0 when the pod reached Succeeded within 180s (or in dry-run),
#          1 otherwise
run_harbor_pull_canary() {
  local pod="ananke-harbor-canary"
  local canary_node="${HARBOR_CANARY_NODE}"
  local rc=0
  if ! node_is_ready "${canary_node}"; then
    ensure_harbor_target_node
    canary_node="${HARBOR_TARGET_NODE}"
    [[ -z "${HARBOR_CANARY_NODE}" ]] \
      || warn "Configured harbor canary node '${HARBOR_CANARY_NODE}' is not Ready; using '${canary_node}'."
    HARBOR_CANARY_NODE="${canary_node}"
  fi
  if [[ "${EXECUTE}" -eq 0 ]]; then
    log "DRY-RUN: create Harbor pull canary pod with ${HARBOR_CANARY_IMAGE} on ${canary_node}"
    return 0
  fi
  timeout 20 kubectl -n "${NODE_HELPER_NAMESPACE}" delete pod "${pod}" --ignore-not-found --wait=false >/dev/null 2>&1 || true
  kubectl apply -f - <<CANARY
apiVersion: v1
kind: Pod
metadata:
  name: ${pod}
  namespace: ${NODE_HELPER_NAMESPACE}
spec:
  nodeName: ${canary_node}
  restartPolicy: Never
  imagePullSecrets:
    - name: ${REGISTRY_PULL_SECRET}
  tolerations:
    - operator: Exists
  containers:
    - name: canary
      image: ${HARBOR_CANARY_IMAGE}
      imagePullPolicy: Always
      command: ["sh", "-ceu", "echo harbor-canary-ok"]
CANARY
  wait_for_pod_phase "${NODE_HELPER_NAMESPACE}" "${pod}" Succeeded 180 || rc=1
  if (( rc != 0 )); then
    # On failure, diagnostics go to stderr.
    kubectl -n "${NODE_HELPER_NAMESPACE}" describe pod "${pod}" >&2 || true
    timeout 20 kubectl -n "${NODE_HELPER_NAMESPACE}" logs "${pod}" >&2 || true
  else
    timeout 20 kubectl -n "${NODE_HELPER_NAMESPACE}" logs "${pod}" || true
  fi
  timeout 20 kubectl -n "${NODE_HELPER_NAMESPACE}" delete pod "${pod}" --ignore-not-found --wait=false >/dev/null 2>&1 || true
  return "${rc}"
}
# Run a script in a one-shot privileged pod (hostNetwork, hostPID) pinned to a
# node and wait for it to succeed. The script is embedded base64-encoded and
# decoded to /tmp/ananke-step.sh inside the container.
#
# Arguments: $1 - node name
#            $2 - purpose; sanitized into the pod name and used in logs
#            $3 - timeout in seconds for the pod to reach Succeeded
#            $4 - script content
# Globals:   EXECUTE, NODE_HELPER_NAMESPACE, NODE_HELPER_SERVICE_ACCOUNT,
#            NODE_HELPER_IMAGE, REGISTRY_PULL_SECRET (read)
# Returns:   0 on success (or in dry-run), 1 when the pod did not succeed
run_helper_pod() {
  local node="$1"
  local purpose="$2"
  local timeout_seconds="$3"
  local script_content="$4"
  local pod="ananke-$(sanitize_name "${purpose}")-$(date +%H%M%S)"
  local encoded_script
  local rc=0
  encoded_script="$(printf '%s' "${script_content}" | base64 -w0)"
  if [[ "${EXECUTE}" -eq 0 ]]; then
    log "DRY-RUN: helper pod ${pod} on ${node} for ${purpose}"
    return 0
  fi
  kubectl apply -f - <<POD
apiVersion: v1
kind: Pod
metadata:
  name: ${pod}
  namespace: ${NODE_HELPER_NAMESPACE}
spec:
  nodeName: ${node}
  restartPolicy: Never
  serviceAccountName: ${NODE_HELPER_SERVICE_ACCOUNT}
  imagePullSecrets:
    - name: ${REGISTRY_PULL_SECRET}
  hostNetwork: true
  hostPID: true
  tolerations:
    - operator: Exists
  containers:
    - name: helper
      image: ${NODE_HELPER_IMAGE}
      imagePullPolicy: IfNotPresent
      securityContext:
        privileged: true
      command: ["/bin/bash", "-ceu"]
      args:
        - |
          printf '%s' '${encoded_script}' | base64 -d >/tmp/ananke-step.sh
          chmod +x /tmp/ananke-step.sh
          /tmp/ananke-step.sh
POD
  wait_for_pod_phase "${NODE_HELPER_NAMESPACE}" "${pod}" Succeeded "${timeout_seconds}" || rc=1
  if (( rc != 0 )); then
    # On failure, diagnostics go to stderr.
    kubectl -n "${NODE_HELPER_NAMESPACE}" describe pod "${pod}" >&2 || true
    timeout 20 kubectl -n "${NODE_HELPER_NAMESPACE}" logs "${pod}" >&2 || true
  else
    timeout 20 kubectl -n "${NODE_HELPER_NAMESPACE}" logs "${pod}" || true
  fi
  timeout 20 kubectl -n "${NODE_HELPER_NAMESPACE}" delete pod "${pod}" --ignore-not-found --wait=false >/dev/null 2>&1 || true
  return "${rc}"
}
2026-06-18 18:02:32 -03:00
# Print the name of the first Running pod labelled app=node-image-sweeper on
# the given node, or nothing when there is none. Always returns 0.
#
# Arguments: $1 - node name
# Globals:   NODE_HELPER_NAMESPACE (read)
hostroot_pod_for_node() {
  local node="$1"
  local selector="spec.nodeName=${node},status.phase=Running"
  kubectl -n "${NODE_HELPER_NAMESPACE}" get pods -l app=node-image-sweeper \
    --field-selector "${selector}" -o jsonpath='{.items[0].metadata.name}' 2>/dev/null \
    || true
}
# Run a script on a node by piping it into `chroot /host /bin/sh -seu` inside
# that node's hostroot (node-image-sweeper) pod.
#
# Arguments: $1 - node name
#            $2 - purpose (used in the dry-run log only)
#            $3 - timeout in seconds for the exec
#            $4 - script content (sent base64-encoded)
# Globals:   EXECUTE, NODE_HELPER_NAMESPACE (read)
# Returns:   1 when the node has no hostroot pod; 0 in dry-run; otherwise the
#            status of the timed kubectl exec
run_hostroot_pod_script() {
  local node="$1"
  local purpose="$2"
  local timeout_seconds="$3"
  local script_content="$4"
  local pod encoded_script remote_cmd
  pod="$(hostroot_pod_for_node "${node}")"
  if [[ -z "${pod}" ]]; then
    return 1
  fi
  encoded_script="$(printf '%s' "${script_content}" | base64 -w0)"
  if [[ "${EXECUTE}" -eq 0 ]]; then
    log "DRY-RUN: hostroot pod ${pod} on ${node} for ${purpose}"
    return 0
  fi
  remote_cmd="printf '%s' '${encoded_script}' | base64 -d | chroot /host /bin/sh -seu"
  timeout "${timeout_seconds}" kubectl -n "${NODE_HELPER_NAMESPACE}" exec "${pod}" -- /bin/sh -ceu "${remote_cmd}"
}
# Import the zstd-compressed image bundle HARBOR_BUNDLE_FILE into a node's k3s
# containerd by streaming it through that node's hostroot pod, then verify
# that every expected image reference is listed.
#
# When REFRESH_BOOTSTRAP_IMAGE_ALIASES is "1", the listed images are removed
# from containerd first (removal failures are ignored).
#
# Arguments: $1 - node name
#            $2 - timeout in seconds for the import stream
#            $3 - newline-separated image references
# Globals:   EXECUTE, NODE_HELPER_NAMESPACE, HARBOR_BUNDLE_FILE,
#            REFRESH_BOOTSTRAP_IMAGE_ALIASES (read)
# Returns:   1 when the node has no hostroot pod; 0 in dry-run; otherwise the
#            status of the verification exec
run_hostroot_pod_bundle_import() {
  local node="$1"
  local timeout_seconds="$2"
  local images_text="$3"
  local pod refresh_script verify_script encoded_script
  pod="$(hostroot_pod_for_node "${node}")"
  if [[ -z "${pod}" ]]; then
    return 1
  fi
  if [[ "${EXECUTE}" -eq 0 ]]; then
    log "DRY-RUN: stream ${HARBOR_BUNDLE_FILE} through hostroot pod ${pod} on ${node}"
    return 0
  fi

  if [[ "${REFRESH_BOOTSTRAP_IMAGE_ALIASES}" == "1" ]]; then
    refresh_script=$(cat <<SCRIPT
set -eu
while IFS= read -r image; do
  [ -z "\${image}" ] && continue
  /usr/local/bin/k3s ctr images rm "\${image}" >/dev/null 2>&1 || true
done <<'IMAGES'
${images_text}
IMAGES
SCRIPT
)
    encoded_script="$(printf '%s' "${refresh_script}" | base64 -w0)"
    timeout 120 kubectl -n "${NODE_HELPER_NAMESPACE}" exec "${pod}" -- \
      /bin/sh -ceu "printf '%s' '${encoded_script}' | base64 -d | chroot /host /bin/sh -seu"
  fi

  # The bundle is fed on stdin (-i) and decompressed on the host side.
  timeout "${timeout_seconds}" kubectl -n "${NODE_HELPER_NAMESPACE}" exec -i "${pod}" -- \
    chroot /host /bin/sh -ceu '/usr/bin/zstd -dc | /usr/local/bin/k3s ctr images import -' < "${HARBOR_BUNDLE_FILE}"

  verify_script=$(cat <<SCRIPT
set -eu
while IFS= read -r image; do
  [ -z "\${image}" ] && continue
  /usr/local/bin/k3s ctr images ls | awk '{print \$1}' | grep -Fx "\${image}" >/dev/null
done <<'IMAGES'
${images_text}
IMAGES
SCRIPT
)
  encoded_script="$(printf '%s' "${verify_script}" | base64 -w0)"
  timeout 120 kubectl -n "${NODE_HELPER_NAMESPACE}" exec "${pod}" -- \
    /bin/sh -ceu "printf '%s' '${encoded_script}' | base64 -d | chroot /host /bin/sh -seu"
}
2026-04-06 04:47:05 -03:00
# Run a command in a node's host namespaces through a dedicated helper pod.
# The command is base64-encoded into a small wrapper script that decodes it
# and runs it under nsenter against PID 1.
#
# Arguments: $1 - node name
#            $2 - purpose, passed through to run_helper_pod
#            $3 - timeout in seconds, passed through to run_helper_pod
#            $4 - command line to run on the host
# Returns:   status of run_helper_pod
run_host_command_via_helper() {
  local node="$1"
  local purpose="$2"
  local timeout_seconds="$3"
  local host_command="$4"
  local encoded_command script_content
  encoded_command="$(printf '%s' "${host_command}" | base64 -w0)"
  # Three-line wrapper script, with no trailing newline.
  printf -v script_content '%s\n%s\n%s' \
    'set -euo pipefail' \
    "HOST_COMMAND=\"\$(printf '%s' '${encoded_command}' | base64 -d)\"" \
    'nsenter --target 1 --mount --uts --ipc --net --pid /bin/sh -ceu "${HOST_COMMAND}"'
  run_helper_pod "${node}" "${purpose}" "${timeout_seconds}" "${script_content}"
}
2026-04-06 21:27:23 -03:00
# Run a command in a node's host namespaces by exec'ing into the prewarm
# DaemonSet pod on that node and entering PID 1's namespaces with nsenter.
#
# Arguments: $1 - node name
#            $2 - command line to run on the host (sent base64-encoded)
# Globals:   NODE_HELPER_NAMESPACE, NODE_HELPER_PREWARM_DS, EXECUTE (read)
# Returns:   1 when no prewarm pod is found on the node; 0 in dry-run;
#            otherwise the status of the `run kubectl exec` call
run_host_command_via_prewarm_pod() {
  local node="$1"
  local host_command="$2"
  local prewarm_pod payload remote_script
  prewarm_pod="$(
    kubectl -n "${NODE_HELPER_NAMESPACE}" get pods \
      -l app="${NODE_HELPER_PREWARM_DS}" \
      --field-selector "spec.nodeName=${node}" \
      -o jsonpath='{.items[0].metadata.name}' 2>/dev/null || true
  )"
  [[ -n "${prewarm_pod}" ]] || return 1
  payload="$(printf '%s' "${host_command}" | base64 -w0)"
  if [[ "${EXECUTE}" -eq 0 ]]; then
    log "DRY-RUN: helper exec via ${prewarm_pod} on ${node}"
    return 0
  fi
  remote_script="HOST_COMMAND=\$(printf '%s' '${payload}' | base64 -d); nsenter --target 1 --mount --uts --ipc --net --pid /bin/sh -ceu \"\${HOST_COMMAND}\""
  run kubectl -n "${NODE_HELPER_NAMESPACE}" exec "${prewarm_pod}" -- /bin/bash -ceu "${remote_script}"
}
2026-04-06 04:47:05 -03:00
# Schedule a delayed host shutdown through a transient systemd-run unit that
# stops the given service and then powers the machine off.
#
# The prewarm pod is tried first; a dedicated helper pod is the fallback.
#
# Arguments: $1 - node name
#            $2 - systemd service to stop before poweroff
#            $3 - delay in seconds before the unit fires
# Returns:   0 when the prewarm pod accepted the command, otherwise the
#            status of the helper-pod attempt
schedule_host_shutdown_via_helper() {
  local node="$1"
  local service_name="$2"
  local delay_seconds="$3"
  local host_command
  host_command="/usr/bin/systemd-run --unit ananke-shutdown-${service_name} --on-active=${delay_seconds}s /bin/sh -lc '/usr/bin/systemctl stop ${service_name} || true; /usr/bin/systemctl poweroff || true'"
  run_host_command_via_prewarm_pod "${node}" "${host_command}" \
    || run_host_command_via_helper "${node}" "shutdown-${node}-${service_name}" 120 "${host_command}"
}
2026-04-07 12:30:28 -03:00
# Schedule a delayed `systemctl stop` of a host service through a transient
# systemd-run unit, without powering the machine off.
#
# The prewarm pod is tried first; a dedicated helper pod is the fallback.
#
# Arguments: $1 - node name
#            $2 - systemd service to stop
#            $3 - delay in seconds before the unit fires
# Returns:   0 when the prewarm pod accepted the command, otherwise the
#            status of the helper-pod attempt
schedule_host_service_stop_via_helper() {
  local node="$1"
  local service_name="$2"
  local delay_seconds="$3"
  local host_command
  host_command="/usr/bin/systemd-run --unit ananke-stop-${service_name} --on-active=${delay_seconds}s /bin/sh -lc '/usr/bin/systemctl stop ${service_name} || true'"
  run_host_command_via_prewarm_pod "${node}" "${host_command}" \
    || run_host_command_via_helper "${node}" "stop-${node}-${service_name}" 120 "${host_command}"
}
2026-04-06 04:47:05 -03:00
# Pull NODE_HELPER_IMAGE onto the cluster's Ready nodes by applying a
# short-lived DaemonSet, then wait until every scheduled pod is ready.
#
# When Ready nodes can be listed, the DaemonSet is restricted to them with a
# hostname node-affinity; otherwise it is applied without affinity. Readiness
# is polled every 2 seconds for up to 90 attempts. On success the DaemonSet is
# deleted unless KEEP_PREWARM_DAEMONSET is non-zero; on timeout it is always
# deleted and the function dies.
#
# Globals: EXECUTE, NODE_HELPER_PREWARM_DS, NODE_HELPER_NAMESPACE,
#          NODE_HELPER_IMAGE, REGISTRY_PULL_SECRET,
#          KEEP_PREWARM_DAEMONSET (read)
prewarm_node_helper_image() {
  local name="${NODE_HELPER_PREWARM_DS}"
  local ready_nodes node
  local node_affinity_block=""
  local attempt desired ready
  if [[ "${EXECUTE}" -eq 0 ]]; then
    log "DRY-RUN: prewarm ${NODE_HELPER_IMAGE} via temporary DaemonSet"
    return 0
  fi
  ready_nodes="$(kubectl get nodes -o 'custom-columns=NAME:.metadata.name,READY:.status.conditions[?(@.type=="Ready")].status' --no-headers 2>/dev/null | awk '$2=="True" {print $1}' || true)"
  if [[ -z "${ready_nodes}" ]]; then
    warn "Unable to detect Ready nodes for prewarm targeting; continuing without node affinity."
  else
    # Indented to sit under spec.template.spec in the manifest below.
    node_affinity_block=$'      affinity:\n        nodeAffinity:\n          requiredDuringSchedulingIgnoredDuringExecution:\n            nodeSelectorTerms:\n              - matchExpressions:\n                  - key: kubernetes.io/hostname\n                    operator: In\n                    values:'
    while IFS= read -r node; do
      [[ -z "${node}" ]] && continue
      node_affinity_block+=$'\n'"                      - ${node}"
    done <<< "${ready_nodes}"
    log "node-helper-prewarm-targets=$(printf '%s' "${ready_nodes}" | paste -sd, -)"
  fi
  kubectl apply -f - <<DS
apiVersion: apps/v1
kind: DaemonSet
metadata:
  name: ${name}
  namespace: ${NODE_HELPER_NAMESPACE}
spec:
  selector:
    matchLabels:
      app: ${name}
  template:
    metadata:
      labels:
        app: ${name}
    spec:
      imagePullSecrets:
        - name: ${REGISTRY_PULL_SECRET}
${node_affinity_block}
      tolerations:
        - operator: Exists
      containers:
        - name: helper
          image: ${NODE_HELPER_IMAGE}
          imagePullPolicy: IfNotPresent
          command: ["/bin/sh", "-ceu", "sleep 300"]
DS
  for ((attempt = 1; attempt <= 90; attempt++)); do
    desired="$(kubectl -n "${NODE_HELPER_NAMESPACE}" get ds "${name}" -o jsonpath='{.status.desiredNumberScheduled}' 2>/dev/null || echo 0)"
    ready="$(kubectl -n "${NODE_HELPER_NAMESPACE}" get ds "${name}" -o jsonpath='{.status.numberReady}' 2>/dev/null || echo 0)"
    desired="${desired:-0}"
    ready="${ready:-0}"
    if [[ "${desired}" != "0" && "${desired}" == "${ready}" ]]; then
      log "node-helper-prewarm=${ready}/${desired}"
      if [[ "${KEEP_PREWARM_DAEMONSET}" -eq 0 ]]; then
        kubectl -n "${NODE_HELPER_NAMESPACE}" delete ds "${name}" --ignore-not-found >/dev/null 2>&1 || true
      else
        log "Keeping ${name} DaemonSet running for shutdown helper exec path."
      fi
      return 0
    fi
    sleep 2
  done
  # Timed out: dump diagnostics to stderr, clean up, then abort.
  kubectl -n "${NODE_HELPER_NAMESPACE}" describe ds "${name}" >&2 || true
  kubectl -n "${NODE_HELPER_NAMESPACE}" get pods -l app="${name}" >&2 || true
  kubectl -n "${NODE_HELPER_NAMESPACE}" delete ds "${name}" --ignore-not-found >/dev/null 2>&1 || true
  die "Timed out prewarming node helper image ${NODE_HELPER_IMAGE}"
}
2026-04-06 21:27:23 -03:00
# Delete the prewarm DaemonSet (best effort; deletion errors are ignored).
# Globals read: EXECUTE, NODE_HELPER_PREWARM_DS, NODE_HELPER_NAMESPACE
# In dry-run mode (EXECUTE=0) only logs the intent.
cleanup_prewarm_daemonset() {
  if [[ "${EXECUTE}" -eq 0 ]]; then
    log "DRY-RUN: cleanup ${NODE_HELPER_PREWARM_DS} DaemonSet"
    return 0
  fi
  kubectl -n "${NODE_HELPER_NAMESPACE}" delete ds "${NODE_HELPER_PREWARM_DS}" --ignore-not-found >/dev/null 2>&1 || true
}
2026-04-06 04:47:05 -03:00
# Serve the directory containing the bootstrap bundle over HTTP in the
# background and wait (up to 20 probes, 1s apart) until the bundle can be
# fetched from 127.0.0.1.
# Globals read: HARBOR_BUNDLE_FILE, BUNDLE_HTTP_PORT, EXECUTE
# Globals written: BUNDLE_SERVER_PID (PID of the background python3 server)
# Dies when the bundle file is missing or the server never answers.
# In dry-run mode (EXECUTE=0) only logs the intent.
start_bundle_server() {
  [[ -f "${HARBOR_BUNDLE_FILE}" ]] || die "Bootstrap bundle not found at ${HARBOR_BUNDLE_FILE}"
  require_cmd python3
  local bundle_dir bundle_name
  bundle_dir="$(dirname "${HARBOR_BUNDLE_FILE}")"
  bundle_name="$(basename "${HARBOR_BUNDLE_FILE}")"
  if [[ "${EXECUTE}" -eq 0 ]]; then
    log "DRY-RUN: serve ${bundle_name} from ${bundle_dir} on port ${BUNDLE_HTTP_PORT}"
    return 0
  fi
  # stdin is detached so the server never competes for the terminal.
  python3 -m http.server "${BUNDLE_HTTP_PORT}" --bind 0.0.0.0 --directory "${bundle_dir}" </dev/null >/tmp/ananke-bundle-server.log 2>&1 &
  BUNDLE_SERVER_PID=$!
  for _ in {1..20}; do
    if curl -fsS "http://127.0.0.1:${BUNDLE_HTTP_PORT}/${bundle_name}" >/dev/null 2>&1; then
      return 0
    fi
    sleep 1
  done
  die "Temporary bundle server did not become ready; see /tmp/ananke-bundle-server.log"
}
2026-04-06 04:47:05 -03:00
# Stop the background bundle HTTP server if one was started.
# Globals read/written: BUNDLE_SERVER_PID (cleared after the stop)
# Sends TERM, then waits up to 10s for the process to disappear.
# Safe to call when no server is running, and when BUNDLE_SERVER_PID has
# never been assigned: the EXIT trap below can fire on any exit path.
stop_bundle_server() {
  if [[ -n "${BUNDLE_SERVER_PID:-}" ]]; then
    kill "${BUNDLE_SERVER_PID}" >/dev/null 2>&1 || true
    for _ in {1..10}; do
      kill -0 "${BUNDLE_SERVER_PID}" >/dev/null 2>&1 || break
      sleep 1
    done
    BUNDLE_SERVER_PID=""
  fi
}
# Make sure the temporary server never outlives the script.
trap stop_bundle_server EXIT
# Print the control host's IP address on stdout.
# Tries the first address reported by `hostname -I`, then the source address
# of the IPv4 route towards 1.1.1.1. Dies when neither yields a value.
control_host_ip() {
  local ip_addr
  if command -v hostname >/dev/null 2>&1; then
    # `|| true`: a hostname without -I support must not abort the script
    # under errexit/pipefail before the `ip` fallback is tried.
    ip_addr="$(hostname -I 2>/dev/null | awk '{print $1}' || true)"
    if [[ -n "${ip_addr}" ]]; then
      printf '%s\n' "${ip_addr}"
      return 0
    fi
  fi
  if command -v ip >/dev/null 2>&1; then
    ip_addr="$(ip -4 route get 1.1.1.1 2>/dev/null | awk '{for (i=1; i<=NF; i++) if ($i=="src") {print $(i+1); exit}}' || true)"
    if [[ -n "${ip_addr}" ]]; then
      printf '%s\n' "${ip_addr}"
      return 0
    fi
  fi
  die "Unable to determine control host IP; install hostname or iproute2."
}
2026-06-18 18:02:32 -03:00
# Print the bootstrap image list with comment lines and blank lines removed.
# Globals read: BOOTSTRAP_IMAGES_FILE
# Dies when the list file does not exist.
bootstrap_images_text() {
  [[ -f "${BOOTSTRAP_IMAGES_FILE}" ]] || die "Bootstrap image list not found at ${BOOTSTRAP_IMAGES_FILE}"
  sed '/^[[:space:]]*#/d;/^[[:space:]]*$/d' "${BOOTSTRAP_IMAGES_FILE}"
}
# Print the Longhorn unlock image list with comment lines and blank lines
# removed.
# Globals read: LONGHORN_UNLOCK_IMAGES_FILE
# Dies when the list file does not exist.
longhorn_unlock_images_text() {
  [[ -f "${LONGHORN_UNLOCK_IMAGES_FILE}" ]] || die "Longhorn unlock image list not found at ${LONGHORN_UNLOCK_IMAGES_FILE}"
  sed '/^[[:space:]]*#/d;/^[[:space:]]*$/d' "${LONGHORN_UNLOCK_IMAGES_FILE}"
}
# Map a Kubernetes node name to the hostname used for SSH.
# Arguments: $1 - node name
# Outputs: the SSH host on stdout. titan-23 maps to "oceanus"; every other
# node maps to its own name.
ssh_host_for_node() {
  local node="$1"
  case "${node}" in
    titan-23) printf '%s\n' "oceanus" ;;
    *) printf '%s\n' "${node}" ;;
  esac
}
# Print the ssh/scp options used for recovery connections, one word per
# line, so callers can load them into an array with `mapfile -t`.
# Globals read: LONGHORN_UNLOCK_SSH_KNOWN_HOSTS
ssh_recovery_opts() {
  printf '%s\n' \
    -o BatchMode=yes \
    -o ConnectTimeout=10 \
    -o StrictHostKeyChecking=accept-new \
    -o UserKnownHostsFile="${LONGHORN_UNLOCK_SSH_KNOWN_HOSTS}"
}
# Copy an image bundle to a node over scp and import it into the node's k3s
# containerd through a hostroot pod script.
# Arguments:
#   $1 - node name
#   $2 - local path of the zstd-compressed bundle
#   $3 - newline-separated list of image references in the bundle
# Globals read: EXECUTE, BOOTSTRAP_BUNDLE_ARCH
# Returns: 0 on success (or dry-run); 1 when staging or the import fails.
# Dies when the local bundle file is missing.
run_ssh_longhorn_bundle_import() {
  local node="$1"
  local bundle_file="$2"
  local images_text="$3"
  local host remote_bundle host_script
  local -a ssh_opts
  [[ -f "${bundle_file}" ]] || die "Longhorn unlock bundle not found at ${bundle_file}"
  host="$(ssh_host_for_node "${node}")"
  remote_bundle="/tmp/$(basename "${bundle_file}")"
  if [[ "${EXECUTE}" -eq 0 ]]; then
    log "DRY-RUN: scp ${bundle_file} to ${host}:${remote_bundle} and import into k3s containerd"
    return 0
  fi
  mapfile -t ssh_opts < <(ssh_recovery_opts)
  log "ssh-image-seed-node=${node} host=${host} bundle=$(basename "${bundle_file}")"
  # Callers invoke this function in a `||` list, where errexit is suppressed;
  # check scp explicitly so a failed copy cannot fall through to importing a
  # stale bundle left on the host by an earlier run.
  if ! scp "${ssh_opts[@]}" "${bundle_file}" "${host}:${remote_bundle}"; then
    warn "SSH staging of $(basename "${bundle_file}") to ${host} failed for ${node}."
    return 1
  fi
  # The script below runs on the host. Unescaped ${...} is expanded here;
  # \${...} and \$(...) are evaluated on the host.
  host_script=$(cat <<SCRIPT
set -eu
bundle='${remote_bundle}'
if [ ! -s "\${bundle}" ]; then
  echo "bundle missing or empty: \${bundle}" >&2
  exit 1
fi
while IFS= read -r image; do
  [ -z "\${image}" ] && continue
  /usr/bin/timeout 60 /usr/local/bin/k3s crictl rmi "\${image}" >/dev/null 2>&1 || true
  /usr/bin/timeout 60 /usr/local/bin/k3s ctr -n k8s.io images rm "\${image}" >/dev/null 2>&1 || true
done <<'IMAGES'
${images_text}
IMAGES
/usr/bin/zstd -dc "\${bundle}" | /usr/bin/timeout 1800 /usr/local/bin/k3s ctr -n k8s.io images import --platform linux/${BOOTSTRAP_BUNDLE_ARCH} -
while IFS= read -r image; do
  [ -z "\${image}" ] && continue
  repo="\${image%:*}"
  digest_ref="\$(/usr/bin/timeout 60 /usr/local/bin/k3s ctr -n k8s.io images ls -q | grep -E "^\${repo}@sha256:" | head -n 1 || true)"
  if [ -n "\${digest_ref}" ]; then
    /usr/bin/timeout 60 /usr/local/bin/k3s ctr -n k8s.io images tag --force "\${digest_ref}" "\${image}" >/dev/null 2>&1 || true
  fi
  /usr/bin/timeout 60 /usr/local/bin/k3s ctr -n k8s.io images ls -q | grep -Fx "\${image}" >/dev/null
done <<'IMAGES'
${images_text}
IMAGES
SCRIPT
)
  if ! run_hostroot_pod_script "${node}" "longhorn-unlock-import-${node}" 1800 "${host_script}"; then
    warn "Hostroot import failed on ${node}; SSH staging succeeded but no sudo-capable remote import was attempted."
    return 1
  fi
}
# Print the unique, sorted names of nodes hosting a longhorn-manager pod that
# uses LONGHORN_MANAGER_IMAGE and has a container waiting with reason
# ImagePullBackOff or ErrImagePull.
# Globals read: LONGHORN_MANAGER_IMAGE
longhorn_manager_image_pull_nodes() {
  kubectl -n longhorn-system get pods -l app=longhorn-manager -o json \
    | jq -r --arg image "${LONGHORN_MANAGER_IMAGE}" '.items[]
        | select(.spec.nodeName != null)
        | select([.status.containerStatuses[]?.state.waiting.reason]
            | map(select(. == "ImagePullBackOff" or . == "ErrImagePull")) | length > 0)
        | select([.spec.containers[]?.image] | index($image))
        | .spec.nodeName' 2>/dev/null \
    | sort -u
}
# Re-import the Longhorn manager image into one node's k3s containerd from
# the manager cache bundle.
# Arguments: $1 - node name
# Globals read: LONGHORN_MANAGER_CACHE_BUNDLE_FILE, LONGHORN_MANAGER_IMAGE,
#   BOOTSTRAP_BUNDLE_ARCH, EXECUTE
# Returns: 0 on success (or dry-run); non-zero when staging or the hostroot
# script fails. Dies when the local bundle file is missing.
repair_longhorn_manager_cache_node() {
  local node="$1"
  local host remote_bundle host_script
  local -a ssh_opts
  [[ -f "${LONGHORN_MANAGER_CACHE_BUNDLE_FILE}" ]] || die "Longhorn manager cache bundle missing at ${LONGHORN_MANAGER_CACHE_BUNDLE_FILE}."
  host="$(ssh_host_for_node "${node}")"
  remote_bundle="/tmp/$(basename "${LONGHORN_MANAGER_CACHE_BUNDLE_FILE}")"
  if [[ "${EXECUTE}" -eq 0 ]]; then
    log "DRY-RUN: repair ${LONGHORN_MANAGER_IMAGE} cache on ${node} using ${LONGHORN_MANAGER_CACHE_BUNDLE_FILE}"
    return 0
  fi
  mapfile -t ssh_opts < <(ssh_recovery_opts)
  log "longhorn-manager-cache-repair-node=${node} host=${host}"
  # The caller runs this function in a `||` list, where errexit is
  # suppressed; check scp explicitly so a failed copy cannot fall through to
  # importing a stale bundle left on the host.
  if ! scp "${ssh_opts[@]}" "${LONGHORN_MANAGER_CACHE_BUNDLE_FILE}" "${host}:${remote_bundle}"; then
    warn "SSH staging of Longhorn manager cache bundle to ${host} failed for ${node}."
    return 1
  fi
  # The script below runs on the host. Unescaped ${...} is expanded here;
  # \${...} and \$(...) are evaluated on the host.
  host_script=$(cat <<SCRIPT
set -eu
image='${LONGHORN_MANAGER_IMAGE}'
bundle='${remote_bundle}'
if [ ! -s "\${bundle}" ]; then
  echo "manager cache bundle missing or empty: \${bundle}" >&2
  exit 1
fi
/usr/bin/timeout 60 /usr/local/bin/k3s crictl rmi "\${image}" >/dev/null 2>&1 || true
/usr/bin/timeout 60 /usr/local/bin/k3s ctr -n k8s.io images rm "\${image}" >/dev/null 2>&1 || true
/usr/bin/timeout 600 /usr/local/bin/k3s ctr -n k8s.io images import --platform linux/${BOOTSTRAP_BUNDLE_ARCH} "\${bundle}"
repo="\${image%:*}"
digest_ref="\$(/usr/bin/timeout 60 /usr/local/bin/k3s ctr -n k8s.io images ls -q | grep -E "^\${repo}@sha256:" | head -n 1 || true)"
if [ -n "\${digest_ref}" ]; then
  /usr/bin/timeout 60 /usr/local/bin/k3s ctr -n k8s.io images tag --force "\${digest_ref}" "\${image}" >/dev/null 2>&1 || true
fi
if ! /usr/bin/timeout 60 /usr/local/bin/k3s crictl inspecti "\${image}" >/dev/null 2>&1; then
  echo "warning: CRI inspect did not see \${image}; kubelet will be verified by pod state" >&2
fi
/usr/bin/timeout 60 /usr/local/bin/k3s ctr -n k8s.io images ls -q | grep -Fx "\${image}" >/dev/null
SCRIPT
)
  run_hostroot_pod_script "${node}" "longhorn-manager-cache-repair-${node}" 900 "${host_script}"
}
# Repair the Longhorn manager image cache on every node reported by
# longhorn_manager_image_pull_nodes.
# Globals read: LONGHORN_MANAGER_CACHE_BUNDLE_FILE
# Returns: 0 when nothing needs repair or every node was repaired; 1 when
# the cache bundle is missing; otherwise the status of the last failing node.
repair_longhorn_manager_cache_deadlock() {
  local nodes node rc=0
  nodes="$(longhorn_manager_image_pull_nodes || true)"
  if [[ -z "${nodes}" ]]; then
    log "longhorn-manager-cache-repair=not-needed"
    return 0
  fi
  if [[ ! -f "${LONGHORN_MANAGER_CACHE_BUNDLE_FILE}" ]]; then
    warn "Longhorn manager cache bundle not found at ${LONGHORN_MANAGER_CACHE_BUNDLE_FILE}; skipping surgical manager cache repair."
    return 1
  fi
  # Keep going after a failure so every affected node gets an attempt.
  while IFS= read -r node; do
    [[ -z "${node}" ]] && continue
    repair_longhorn_manager_cache_node "${node}" || rc=$?
  done <<< "${nodes}"
  return "${rc}"
}
# Import the Longhorn unlock bundle on every Ready Longhorn seed node,
# stopping at the first node that fails.
# Globals read: LONGHORN_UNLOCK_BUNDLE_FILE, LONGHORN_UNLOCK_IMAGES_FILE,
#   BOOTSTRAP_BUNDLE_ARCH
# Returns: 0 when every node succeeded, else the status of the failing node.
# Dies when the bundle is missing, the image list is empty, or no node
# qualifies.
seed_longhorn_unlock_images_ssh() {
  local images_text nodes node rc=0
  [[ -f "${LONGHORN_UNLOCK_BUNDLE_FILE}" ]] || die "Longhorn unlock bundle missing at ${LONGHORN_UNLOCK_BUNDLE_FILE}."
  images_text="$(longhorn_unlock_images_text)"
  [[ -n "${images_text}" ]] || die "No Longhorn unlock images listed in ${LONGHORN_UNLOCK_IMAGES_FILE}"
  nodes="$(list_ready_longhorn_seed_nodes)"
  [[ -n "${nodes}" ]] || die "No Ready Longhorn nodes match architecture ${BOOTSTRAP_BUNDLE_ARCH}."
  while IFS= read -r node; do
    [[ -z "${node}" ]] && continue
    run_ssh_longhorn_bundle_import "${node}" "${LONGHORN_UNLOCK_BUNDLE_FILE}" "${images_text}" || rc=$?
    if [[ "${rc}" -ne 0 ]]; then
      warn "SSH image import failed on ${node}."
      break
    fi
  done <<< "${nodes}"
  return "${rc}"
}
# Print the names of nodes labelled longhorn-host=true that are Ready and
# whose kubernetes.io/arch label equals BOOTSTRAP_BUNDLE_ARCH.
# Globals read: BOOTSTRAP_BUNDLE_ARCH
list_ready_longhorn_seed_nodes() {
  kubectl get nodes -l longhorn-host=true \
    -o 'custom-columns=NAME:.metadata.name,READY:.status.conditions[?(@.type=="Ready")].status,ARCH:.metadata.labels.kubernetes\.io/arch' \
    --no-headers 2>/dev/null \
    | awk -v arch="${BOOTSTRAP_BUNDLE_ARCH}" '$2=="True" && $3==arch {print $1}'
}
# Print the unique, sorted set of nodes to seed with bootstrap images: the
# Ready Longhorn seed nodes plus HARBOR_TARGET_NODE when it is set and Ready.
# Globals read: HARBOR_TARGET_NODE
list_bootstrap_seed_nodes() {
  local nodes
  nodes="$(list_ready_longhorn_seed_nodes || true)"
  if [[ -n "${HARBOR_TARGET_NODE}" ]] && node_is_ready "${HARBOR_TARGET_NODE}"; then
    nodes="$(printf '%s\n%s\n' "${nodes}" "${HARBOR_TARGET_NODE}")"
  fi
  # Drop the blank line left behind when the Longhorn list was empty.
  printf '%s\n' "${nodes}" | sed '/^[[:space:]]*$/d' | sort -u
}
# Import the bootstrap image bundle into containerd on every bootstrap seed
# node. Each node is first tried through the hostroot bundle import; if that
# fails, a dedicated helper pod downloads the bundle from the temporary HTTP
# server on the control host and imports it.
# Globals read: HARBOR_BUNDLE_FILE, BOOTSTRAP_IMAGES_FILE, BUNDLE_HTTP_PORT,
#   REFRESH_BOOTSTRAP_IMAGE_ALIASES
# Globals written: BOOTSTRAP_IMAGES_SEEDED (set to 1 on success)
# Returns: 0 on success, else the status of the failing helper pod.
# Dies when the bundle, the image list, or eligible nodes are missing.
seed_bootstrap_images() {
  local images_text control_ip bundle_name helper_script_content seed_rc=0 node nodes
  [[ -f "${HARBOR_BUNDLE_FILE}" ]] || die "Bootstrap bundle not found at ${HARBOR_BUNDLE_FILE}"
  ensure_harbor_target_node
  ensure_harbor_host_label
  images_text="$(bootstrap_images_text)"
  [[ -n "${images_text}" ]] || die "No bootstrap images listed in ${BOOTSTRAP_IMAGES_FILE}"
  nodes="$(list_bootstrap_seed_nodes)"
  [[ -n "${nodes}" ]] || die "No Ready Longhorn or Harbor bootstrap nodes available for image seed."
  bundle_name="$(basename "${HARBOR_BUNDLE_FILE}")"
  start_bundle_server
  control_ip="$(control_host_ip)"
  # The script below runs inside the helper pod. Unescaped ${...} is expanded
  # here (including the REFRESH_BOOTSTRAP_IMAGE_ALIASES test); \${...} and \$1
  # are evaluated in the pod.
  helper_script_content=$(cat <<SCRIPT
set -euo pipefail
if [[ "${REFRESH_BOOTSTRAP_IMAGE_ALIASES}" == "1" ]]; then
  while IFS= read -r image; do
    [[ -z "\${image}" ]] && continue
    nsenter --target 1 --mount --uts --ipc --net --pid /usr/local/bin/k3s ctr images rm "\${image}" >/dev/null 2>&1 || true
  done <<'IMAGES'
${images_text}
IMAGES
fi
curl -fsSL "http://${control_ip}:${BUNDLE_HTTP_PORT}/${bundle_name}" \
  | zstd -dc \
  | nsenter --target 1 --mount --uts --ipc --net --pid /usr/local/bin/k3s ctr images import -
while IFS= read -r image; do
  [[ -z "\${image}" ]] && continue
  nsenter --target 1 --mount --uts --ipc --net --pid /usr/local/bin/k3s ctr images ls | awk '{print \$1}' | grep -Fx "\${image}" >/dev/null
done <<'IMAGES'
${images_text}
IMAGES
SCRIPT
)
  while IFS= read -r node; do
    [[ -n "${node}" ]] || continue
    log "bootstrap-image-seed-node=${node}"
    if run_hostroot_pod_bundle_import "${node}" 1800 "${images_text}"; then
      continue
    fi
    warn "Hostroot seed pod unavailable or failed on ${node}; falling back to dedicated helper pod."
    run_helper_pod "${node}" "bootstrap-seed-${node}" 1800 "${helper_script_content}" || seed_rc=$?
    if [[ "${seed_rc}" -ne 0 ]]; then
      break
    fi
  done <<< "${nodes}"
  stop_bundle_server
  [[ "${seed_rc}" -eq 0 ]] || return "${seed_rc}"
  BOOTSTRAP_IMAGES_SEEDED=1
  mark_checkpoint startup_bootstrap_images_seeded
}
# Seed bootstrap images unless that is unnecessary or disabled.
# Skips when images were already seeded in this run, when Harbor is healthy,
# or when SKIP_HARBOR_SEED is non-zero. Otherwise optionally prewarms the
# helper image (SKIP_HELPER_PREWARM=0) and runs seed_bootstrap_images.
# Globals read: BOOTSTRAP_IMAGES_SEEDED, SKIP_HARBOR_SEED, SKIP_HELPER_PREWARM
seed_bootstrap_images_if_needed() {
  if [[ "${BOOTSTRAP_IMAGES_SEEDED}" -eq 1 ]]; then
    log "Bootstrap images already seeded during this run."
    return 0
  fi
  if harbor_is_ready; then
    log "Harbor registry API is healthy; skipping bootstrap image seed."
    return 0
  fi
  if [[ "${SKIP_HARBOR_SEED}" -ne 0 ]]; then
    warn "Skipping bootstrap image seed/import by request."
    return 0
  fi
  if [[ "${SKIP_HELPER_PREWARM}" -eq 0 ]]; then
    prewarm_node_helper_image
  fi
  seed_bootstrap_images
}
# Alias for seed_bootstrap_images; returns its status.
seed_harbor_images() {
  seed_bootstrap_images
}
# Apply the local kustomizations for the minimal platform, in order:
# core, helm sources, Longhorn, MetalLB, Traefik, Vault CSI and injector,
# Vault, Postgres, Gitea.
bootstrap_local_minimal() {
  apply_kustomization infrastructure/core
  apply_kustomization infrastructure/sources/helm
  apply_kustomization infrastructure/longhorn/core
  apply_kustomization infrastructure/metallb
  apply_kustomization infrastructure/traefik
  apply_kustomization infrastructure/vault-csi
  apply_kustomization infrastructure/vault-injector
  apply_kustomization services/vault
  apply_kustomization infrastructure/postgres
  apply_kustomization services/gitea
}
# Apply the local Harbor kustomization.
bootstrap_local_harbor() {
  apply_kustomization services/harbor
}
2026-04-07 12:30:28 -03:00
# Reconcile one Flux kustomization, retrying once after a self-heal when the
# first failure looks like an immutable-Job error.
# Arguments: $1 - kustomization name
# Globals read: EXECUTE
# Outputs: flux output on stdout when it succeeds, on stderr when it fails.
# Returns: 0 on success (or dry-run), otherwise the exit status of flux.
reconcile_kustomization_with_self_heal() {
  local item="$1"
  if [[ "${EXECUTE}" -eq 0 ]]; then
    run flux reconcile kustomization "${item}" -n flux-system --with-source --timeout=15m
    return 0
  fi
  local attempt output rc
  for attempt in 1 2; do
    # Capture the status through `||` rather than toggling `set -e`, so the
    # caller's errexit setting is left exactly as it was.
    rc=0
    output="$(flux reconcile kustomization "${item}" -n flux-system --with-source --timeout=15m 2>&1)" || rc=$?
    if (( rc == 0 )); then
      if [[ -n "${output}" ]]; then
        printf '%s\n' "${output}"
      fi
      return 0
    fi
    if [[ -n "${output}" ]]; then
      printf '%s\n' "${output}" >&2
    fi
    if (( attempt == 1 )) && grep -Eqi 'immutable|field is immutable|cannot patch.*Job|Job.*invalid' <<< "${output}"; then
      warn "Flux reconcile for '${item}' failed due immutable Job/template signal. Attempting self-heal."
      heal_failed_flux_jobs || true
      trigger_flux_reconcile_all || true
      sleep 5
      continue
    fi
    return "${rc}"
  done
}
2026-04-06 04:47:05 -03:00
# Reconcile a named stage of Flux kustomizations.
# Arguments:
#   $1   - stage name (used for the checkpoint "reconciled_<stage>")
#   $2.. - kustomization names to reconcile in order
# Without the flux CLI, falls back to annotating every Kustomization with a
# fresh reconcile.fluxcd.io/requestedAt timestamp and returns without
# recording a checkpoint.
reconcile_stage() {
  local stage_name="$1"
  shift
  if ! command -v flux >/dev/null 2>&1; then
    local now
    now="$(date --iso-8601=seconds)"
    run kubectl -n flux-system annotate kustomizations.kustomize.toolkit.fluxcd.io --all reconcile.fluxcd.io/requestedAt="${now}" --overwrite
    return 0
  fi
  local item
  for item in "$@"; do
    reconcile_kustomization_with_self_heal "${item}"
  done
  mark_checkpoint "reconciled_${stage_name}"
}
2026-04-06 04:47:05 -03:00
# Un-suspend Flux and reconcile the platform in three stages (core,
# stateful, registry), running the matching health checks after each stage
# and finishing with the Harbor pull canary.
resume_flux_and_reconcile() {
  patch_flux_suspend_all false
  if command -v flux >/dev/null 2>&1; then
    run flux reconcile source git flux-system -n flux-system --timeout=3m
  fi
  reconcile_stage core core helm longhorn metallb traefik vault-csi vault-injector
  check_ingress_stack
  check_longhorn_stack
  reconcile_stage stateful vault postgres gitea
  check_vault_stack
  check_postgres_stack
  check_gitea_stack
  reconcile_stage registry harbor
  check_harbor_stack
  check_harbor_endpoint
  run_harbor_pull_canary
}
2026-04-06 00:22:54 -03:00
2026-04-06 04:47:05 -03:00
# Print a key=value status report on stdout: configuration, recovery state,
# UPS battery, Flux source drift, mail safeguards, the Harbor HTTP status,
# and the presence of core workloads.
# Every probe is best effort; a failing probe yields an empty, "unknown" or
# "false" value instead of aborting the report.
status_report() {
  local battery flux_ready flux_url flux_branch flux_url_drift flux_branch_drift harbor_code workers ingress_hosts_count
  local mail_safeguards_ok
  local effective_target effective_canary
  local labeled_nodes
  battery="$(read_ups_battery || true)"
  flux_ready="$(kubectl -n flux-system get gitrepository flux-system -o jsonpath='{.status.conditions[?(@.type=="Ready")].status}' 2>/dev/null || true)"
  flux_url="$(kubectl -n flux-system get gitrepository flux-system -o jsonpath='{.spec.url}' 2>/dev/null || true)"
  flux_branch="$(kubectl -n flux-system get gitrepository flux-system -o jsonpath='{.spec.ref.branch}' 2>/dev/null || true)"
  # Drift is only reported when both the expected and the actual value are
  # known and they differ.
  flux_url_drift=false
  flux_branch_drift=false
  if [[ -n "${EXPECTED_FLUX_URL}" && -n "${flux_url}" && "${flux_url}" != "${EXPECTED_FLUX_URL}" ]]; then
    flux_url_drift=true
  fi
  if [[ -n "${EXPECTED_FLUX_BRANCH}" && -n "${flux_branch}" && "${flux_branch}" != "${EXPECTED_FLUX_BRANCH}" ]]; then
    flux_branch_drift=true
  fi
  ingress_hosts_count="$(list_ingress_hosts | sed '/^[[:space:]]*$/d' | wc -l | tr -d ' ')"
  harbor_code="$(curl -ksS -o /dev/null -w '%{http_code}' https://registry.bstein.dev/v2/ || true)"
  workers="$(discover_workers_csv 2>/dev/null || true)"
  # Fall back to a Ready arm64 worker when the configured target is not
  # Ready; the canary falls back to the effective target.
  effective_target="${HARBOR_TARGET_NODE}"
  if ! node_is_ready "${effective_target}"; then
    effective_target="$(select_ready_arm64_worker || true)"
  fi
  effective_canary="${HARBOR_CANARY_NODE}"
  if ! node_is_ready "${effective_canary}"; then
    effective_canary="${effective_target}"
  fi
  echo "mode=status"
  echo "shutdown_mode=${SHUTDOWN_MODE}"
  echo "bundle_file=${HARBOR_BUNDLE_FILE}"
  echo "bundle_present=$([[ -f "${HARBOR_BUNDLE_FILE}" ]] && echo true || echo false)"
  echo "bootstrap_images_file=${BOOTSTRAP_IMAGES_FILE}"
  echo "bootstrap_images_file_present=$([[ -f "${BOOTSTRAP_IMAGES_FILE}" ]] && echo true || echo false)"
  echo "bootstrap_bundle_arch=${BOOTSTRAP_BUNDLE_ARCH}"
  echo "replica_snapshot_file=${REPLICA_SNAPSHOT_FILE}"
  echo "replica_snapshot_present=$([[ -f "${REPLICA_SNAPSHOT_FILE}" ]] && echo true || echo false)"
  echo "node_helper_image=${NODE_HELPER_IMAGE}"
  echo "harbor_target_node=${effective_target:-unknown}"
  echo "harbor_canary_node=${effective_canary:-unknown}"
  labeled_nodes="$(kubectl get nodes -l "${HARBOR_HOST_LABEL_KEY}=true" -o jsonpath='{range .items[*]}{.metadata.name}{","}{end}' 2>/dev/null || true)"
  # Strip the trailing comma left by the jsonpath range template.
  labeled_nodes="${labeled_nodes%,}"
  echo "harbor_host_label_key=${HARBOR_HOST_LABEL_KEY}"
  echo "harbor_host_label_nodes=${labeled_nodes:-none}"
  echo "workers=${workers}"
  echo "recovery_pending=${RECOVERY_PENDING}"
  echo "startup_attempted=${STARTUP_ATTEMPTED_DURING_OUTAGE}"
  echo "last_checkpoint=${LAST_CHECKPOINT}"
  echo "ups_host=${UPS_HOST_IN_USE:-${UPS_HOST}}"
  echo "ups_battery=${battery:-unknown}"
  echo "flux_source_expected_url=${EXPECTED_FLUX_URL}"
  echo "flux_source_expected_branch=${EXPECTED_FLUX_BRANCH}"
  echo "flux_source_actual_url=${flux_url:-unknown}"
  echo "flux_source_actual_branch=${flux_branch:-unknown}"
  echo "flux_source_url_drift=${flux_url_drift}"
  echo "flux_source_branch_drift=${flux_branch_drift}"
  echo "flux_source_ready=${flux_ready:-unknown}"
  echo "ingress_hosts_count=${ingress_hosts_count}"
  if check_mail_safeguards_once 1; then
    mail_safeguards_ok=true
  else
    mail_safeguards_ok=false
  fi
  echo "mail_startup_safeguards_required=${STARTUP_REQUIRE_MAIL_SAFEGUARDS}"
  echo "mail_startup_safeguards_ok=${mail_safeguards_ok}"
  echo "mail_startup_host=${MAIL_STARTUP_HOST}"
  echo "mail_startup_ports=${MAIL_STARTUP_TCP_PORTS}"
  echo "harbor_http=${harbor_code:-unknown}"
  kubectl get ingressclass traefik >/dev/null 2>&1 && echo "traefik_ingressclass=true" || echo "traefik_ingressclass=false"
  kubectl -n traefik get deploy traefik >/dev/null 2>&1 && echo "traefik_deploy=true" || echo "traefik_deploy=false"
  kubectl -n longhorn-system get ds longhorn-manager >/dev/null 2>&1 && echo "longhorn_manager=true" || echo "longhorn_manager=false"
  kubectl -n vault get sts vault >/dev/null 2>&1 && echo "vault_statefulset=true" || echo "vault_statefulset=false"
  kubectl -n postgres get sts postgres >/dev/null 2>&1 && echo "postgres_statefulset=true" || echo "postgres_statefulset=false"
  kubectl -n gitea get deploy gitea >/dev/null 2>&1 && echo "gitea_deploy=true" || echo "gitea_deploy=false"
  kubectl -n harbor get deploy harbor-core >/dev/null 2>&1 && echo "harbor_deploy=true" || echo "harbor_deploy=false"
}
# Run the planned cluster shutdown sequence:
#   1. record recovery state and (optionally) prewarm the helper image,
#      keeping its DaemonSet for the host exec path
#   2. (optionally) take an etcd snapshot on the first control-plane node
#   3. snapshot workload replicas, suspend Flux, scale apps down
#   4. (optionally) drain workers
#   5. schedule the k3s-agent stop on workers (20s) and the k3s stop on
#      control planes (45s), with or without host poweroff per SHUTDOWN_MODE
# Globals read: SKIP_HELPER_PREWARM, SKIP_ETCD_SNAPSHOT, SKIP_DRAIN,
#   REQUIRE_NONEMPTY_REPLICA_SNAPSHOT, REPLICA_SNAPSHOT_FILE,
#   DRAIN_TIMEOUT_SECONDS, SHUTDOWN_MODE
# Globals written: WORKER_NODES, CONTROL_PLANE_NODES, RECOVERY_PENDING,
#   STARTUP_ATTEMPTED_DURING_OUTAGE, KEEP_PREWARM_DAEMONSET
# Dies when a non-empty replica snapshot is required but the snapshot is
# empty.
planned_shutdown() {
  local workers_csv
  workers_csv="$(discover_workers_csv 2>/dev/null || true)"
  as_array_from_csv "${workers_csv}" WORKER_NODES
  as_array_from_csv "titan-0a,titan-0b,titan-0c" CONTROL_PLANE_NODES
  RECOVERY_PENDING=1
  STARTUP_ATTEMPTED_DURING_OUTAGE=0
  save_recovery_state 1 0 shutdown_started
  if [[ "${SKIP_HELPER_PREWARM}" -eq 0 ]]; then
    KEEP_PREWARM_DAEMONSET=1
    prewarm_node_helper_image
    mark_checkpoint shutdown_helper_prewarmed
  fi
  if [[ "${SKIP_ETCD_SNAPSHOT}" -eq 0 ]]; then
    local ts
    ts="$(date +%Y%m%d-%H%M%S)"
    run_host_command_via_helper "${CONTROL_PLANE_NODES[0]}" "etcd-snapshot" 300 "/usr/local/bin/k3s etcd-snapshot save --name pre-shutdown-${ts}"
    mark_checkpoint shutdown_snapshot_complete
  else
    warn "Skipping etcd snapshot by request."
  fi
  save_workload_replica_snapshot
  if [[ "${REQUIRE_NONEMPTY_REPLICA_SNAPSHOT}" == "1" || "${REQUIRE_NONEMPTY_REPLICA_SNAPSHOT}" == "true" ]]; then
    local replica_count
    replica_count="$(replica_snapshot_count)"
    # Treat any non-numeric count as an empty snapshot.
    if [[ ! "${replica_count}" =~ ^[0-9]+$ ]]; then
      replica_count=0
    fi
    if (( replica_count == 0 )); then
      die "Replica snapshot is empty at ${REPLICA_SNAPSHOT_FILE}; refusing shutdown to avoid startup restore deadlock."
    fi
  fi
  mark_checkpoint shutdown_replicas_snapshot
  patch_flux_suspend_all true
  best_effort_scale_down_apps
  mark_checkpoint shutdown_apps_scaled_down
  if [[ "${SKIP_DRAIN}" -eq 0 ]]; then
    best_effort_drain_workers "${DRAIN_TIMEOUT_SECONDS}" "${WORKER_NODES[@]}"
    mark_checkpoint shutdown_workers_drained
  else
    warn "Skipping worker drain by request."
  fi
  local node
  if [[ "${SHUTDOWN_MODE}" == "cluster-only" ]]; then
    warn "shutdown-mode=cluster-only: stopping k3s services only; host poweroff is disabled."
  else
    log "shutdown-mode=host-poweroff: scheduling host poweroff after service stop."
  fi
  for node in "${WORKER_NODES[@]}"; do
    [[ -z "${node}" ]] && continue
    if [[ "${SHUTDOWN_MODE}" == "cluster-only" ]]; then
      schedule_host_service_stop_via_helper "${node}" k3s-agent 20
    else
      schedule_host_shutdown_via_helper "${node}" k3s-agent 20
    fi
  done
  mark_checkpoint shutdown_workers_scheduled

  for node in "${CONTROL_PLANE_NODES[@]}"; do
    [[ -z "${node}" ]] && continue
    if [[ "${SHUTDOWN_MODE}" == "cluster-only" ]]; then
      schedule_host_service_stop_via_helper "${node}" k3s 45
    else
      schedule_host_shutdown_via_helper "${node}" k3s 45
    fi
  done
  if [[ "${SKIP_HELPER_PREWARM}" -eq 0 ]]; then
    cleanup_prewarm_daemonset
  fi
  mark_checkpoint shutdown_control_planes_scheduled
  if [[ "${SHUTDOWN_MODE}" == "cluster-only" ]]; then
    log "Cluster-only shutdown actions scheduled (hosts remain powered on)."
  else
    log "Shutdown + host poweroff actions scheduled on hosts."
  fi
}
2026-04-06 00:22:54 -03:00
2026-04-06 04:47:05 -03:00
# Emergency shutdown used when there is not enough battery budget to start.
# Suspends Flux, scales apps down and drains workers on a best-effort basis
# (with EMERGENCY_DRAIN_TIMEOUT_SECONDS), then runs the regular
# planned_shutdown sequence.
# Globals written: WORKER_NODES
emergency_shutdown_after_outage() {
  warn "Entering outage-aware emergency shutdown path due insufficient startup budget."
  patch_flux_suspend_all true || true
  best_effort_scale_down_apps || true
  local workers_csv
  workers_csv="$(discover_workers_csv 2>/dev/null || true)"
  as_array_from_csv "${workers_csv}" WORKER_NODES
  best_effort_drain_workers "${EMERGENCY_DRAIN_TIMEOUT_SECONDS}" "${WORKER_NODES[@]}" || true
  planned_shutdown
}
# Full cluster startup sequence after a shutdown or outage:
#   1. when recovery is pending, require enough battery; on a second
#      low-battery attempt fall back to an emergency shutdown
#   2. wait for the Kubernetes API and label the Harbor host
#   3. apply any forced Flux source URL/branch and verify the Flux source
#   4. seed bootstrap images if needed
#   5. when the Flux source is not Ready, apply the local bootstrap fallback
#      (and Harbor, unless skipped)
#   6. resume Flux, restore workloads and wait for readiness/stability
#   7. optionally prewarm the helper image, then clear recovery state
# Exits 1 when startup is deferred or aborted for low battery.
startup_flow() {
  if [[ "${RECOVERY_PENDING}" -eq 1 ]]; then
    if ! ensure_minimum_battery_for_bootstrap; then
      if [[ "${STARTUP_ATTEMPTED_DURING_OUTAGE}" -eq 1 ]]; then
        emergency_shutdown_after_outage
        exit 1
      fi
      warn "Startup deferred due low battery after recent outage; marking for second-outage fallback."
      save_recovery_state 1 1 deferred_low_battery
      exit 1
    fi
    STARTUP_ATTEMPTED_DURING_OUTAGE=1
    save_recovery_state 1 1 waiting_for_api
  fi
  if ! wait_for_api; then
    die "Kubernetes API did not become reachable in time."
  fi
  mark_checkpoint startup_api_ready

  ensure_harbor_target_node
  ensure_harbor_host_label
  mark_checkpoint startup_harbor_host_labeled
  if [[ -n "${FORCE_FLUX_URL}" ]]; then
    warn "Breakglass: forcing Flux source URL to '${FORCE_FLUX_URL}'."
    run kubectl -n flux-system patch gitrepository flux-system --type=merge -p "{\"spec\":{\"url\":\"${FORCE_FLUX_URL}\"}}"
    mark_checkpoint startup_flux_url_forced
  fi
  if [[ -n "${FORCE_FLUX_BRANCH}" ]]; then
    run kubectl -n flux-system patch gitrepository flux-system --type=merge -p "{\"spec\":{\"ref\":{\"branch\":\"${FORCE_FLUX_BRANCH}\"}}}"
    mark_checkpoint startup_flux_branch_forced
  fi

  assert_flux_source_expected
  seed_bootstrap_images_if_needed

  if [[ "${SKIP_LOCAL_BOOTSTRAP}" -eq 0 ]]; then
    if ! kubectl -n flux-system get gitrepository flux-system -o jsonpath='{.status.conditions[?(@.type=="Ready")].status}' 2>/dev/null | grep -q True; then
      warn "Flux source not Ready; executing local bootstrap fallback path."
      bootstrap_local_minimal
      mark_checkpoint startup_local_bootstrap_complete
      check_ingress_stack
      check_longhorn_stack
      check_vault_stack
      check_postgres_stack
      check_gitea_stack

      if [[ "${SKIP_HARBOR_BOOTSTRAP}" -eq 0 ]]; then
        if harbor_is_ready; then
          log "Harbor already healthy; skipping Harbor seed/bootstrap."
        else
          seed_bootstrap_images_if_needed
          bootstrap_local_harbor
          mark_checkpoint startup_local_harbor_applied
          check_harbor_stack
          check_harbor_endpoint
        fi
      else
        warn "Skipping Harbor bootstrap fallback by request."
      fi
    fi
  else
    warn "Skipping local bootstrap fallback by request."
  fi
  resume_flux_and_reconcile
  wait_for_flux_kustomizations_ready
  restore_workload_replica_snapshot
  restore_zero_scaled_helm_workloads
  wait_for_startup_workloads_ready
  wait_for_startup_service_checklist
  wait_for_startup_stability_window
  if [[ "${SKIP_HELPER_PREWARM}" -eq 0 ]]; then
    prewarm_node_helper_image
    mark_checkpoint startup_helper_prewarmed
  fi
  clear_recovery_state
  log "Startup flow complete."
}
# Prepare the cluster for a later recovery: verify the bootstrap bundle and
# image list exist, label the Harbor host, and (unless skipped) prewarm the
# node helper image.
# Globals read: HARBOR_BUNDLE_FILE, BOOTSTRAP_IMAGES_FILE, SKIP_HELPER_PREWARM
prepare_flow() {
  [[ -f "${HARBOR_BUNDLE_FILE}" ]] || die "Bootstrap bundle missing at ${HARBOR_BUNDLE_FILE}. Build and copy it to the canonical control host first."
  [[ -f "${BOOTSTRAP_IMAGES_FILE}" ]] || die "Bootstrap image list missing at ${BOOTSTRAP_IMAGES_FILE}."
  ensure_harbor_target_node
  ensure_harbor_host_label
  mark_checkpoint prepare_harbor_host_labeled
  if [[ "${SKIP_HELPER_PREWARM}" -eq 0 ]]; then
    prewarm_node_helper_image
    mark_checkpoint prepare_helper_prewarmed
  fi
  log "Prepare flow complete."
}
# Seed the bootstrap images unconditionally, then verify the Harbor endpoint
# and run the pull canary.
# Globals read: HARBOR_BUNDLE_FILE, BOOTSTRAP_IMAGES_FILE, SKIP_HELPER_PREWARM
# Dies when the bundle or the image list is missing.
harbor_seed_flow() {
  [[ -f "${HARBOR_BUNDLE_FILE}" ]] || die "Bootstrap bundle missing at ${HARBOR_BUNDLE_FILE}. Build and copy it to the canonical control host first."
  [[ -f "${BOOTSTRAP_IMAGES_FILE}" ]] || die "Bootstrap image list missing at ${BOOTSTRAP_IMAGES_FILE}."
  if [[ "${SKIP_HELPER_PREWARM}" -eq 0 ]]; then
    prewarm_node_helper_image
    mark_checkpoint harbor_seed_helper_prewarmed
  fi
  seed_bootstrap_images
  check_harbor_endpoint
  run_harbor_pull_canary
  log "Bootstrap seed flow complete."
}
# Break a Longhorn/Harbor image-pull deadlock by reseeding node image caches
# and restarting the affected pods.
# Most recovery steps are best effort: a failure is reported with a warning
# and the flow continues. Post-recovery restoration steps only run once the
# Harbor endpoint answers.
# Globals read: HARBOR_BUNDLE_FILE, BOOTSTRAP_IMAGES_FILE,
#   SKIP_LONGHORN_UNLOCK_BUNDLE_SEED, EXECUTE
# Globals written: REFRESH_BOOTSTRAP_IMAGE_ALIASES (set to 1)
longhorn_unlock_flow() {
  require_cmd jq
  [[ -f "${HARBOR_BUNDLE_FILE}" ]] || die "Bootstrap bundle missing at ${HARBOR_BUNDLE_FILE}. Build and copy it to the canonical control host first."
  [[ -f "${BOOTSTRAP_IMAGES_FILE}" ]] || die "Bootstrap image list missing at ${BOOTSTRAP_IMAGES_FILE}."
  if ! wait_for_api; then
    die "Kubernetes API did not become reachable in time."
  fi
  warn "Longhorn unlock mode will not mutate Longhorn volumes, replicas, engines, disks, PVs, or PVCs."
  if ! harbor_endpoint_is_ready 1; then
    warn "Harbor registry API is unhealthy; using local bootstrap image cache path."
  fi
  REFRESH_BOOTSTRAP_IMAGE_ALIASES=1
  freeze_longhorn_deadlock_automation
  ensure_longhorn_cache_first_policy
  remove_longhorn_manager_prepull_sidecar_if_needed
  free_longhorn_instance_manager_headroom
  delete_failed_nonstorage_pods_for_headroom
  repair_longhorn_manager_cache_deadlock || warn "Surgical Longhorn manager cache repair did not complete on every affected node."
  if [[ "${SKIP_LONGHORN_UNLOCK_BUNDLE_SEED}" -eq 0 ]]; then
    seed_longhorn_unlock_images_ssh
  else
    warn "Skipping full Longhorn unlock bundle seed by operator request."
  fi
  restart_longhorn_image_pull_backoff_pods
  recover_stuck_terminating_node_runtime_pods_after_deadlock
  if [[ "${EXECUTE}" -eq 1 ]]; then
    kubectl -n longhorn-system rollout status daemonset/longhorn-manager --timeout=5m || warn "longhorn-manager DaemonSet did not fully roll out yet."
    sleep 30
  fi
  wait_for_longhorn_control_endpoints || true
  restart_stale_critical_pods_after_longhorn_unlock
  restart_harbor_after_postgres_recovery || warn "Harbor did not fully recover after Postgres became ready."
  if harbor_endpoint_is_ready 1; then
    run_harbor_pull_canary || warn "Harbor pull canary failed after registry recovery."
    restore_recovered_worker_scheduling_after_deadlock
    restore_longhorn_unlock_optional_workloads
    delete_safe_stale_terminating_replicaset_pods_after_deadlock
    restart_image_pull_backoff_pods_after_harbor_recovery || true
    resume_deadlock_automation_after_core_recovery || true
  fi
  report_longhorn_unlock_status
  mark_checkpoint longhorn_unlock_complete
  log "Longhorn unlock flow complete."
}
# Script entry point: call load_recovery_state, log the effective
# configuration, report the Flux source state, then dispatch on MODE.
load_recovery_state

# One log line per setting so a run can be audited from its output alone.
log "mode=${MODE} execute=${EXECUTE}"
log "shutdown-mode=${SHUTDOWN_MODE}"
log "recovery-state-file=${RECOVERY_STATE_FILE}"
log "bundle-file=${HARBOR_BUNDLE_FILE}"
log "bootstrap-images-file=${BOOTSTRAP_IMAGES_FILE}"
log "bootstrap-bundle-arch=${BOOTSTRAP_BUNDLE_ARCH}"
log "node-helper-image=${NODE_HELPER_IMAGE}"
# Unset or empty node overrides are logged as "auto".
log "harbor-target-node-config=${HARBOR_TARGET_NODE:-auto}"
log "harbor-canary-node-config=${HARBOR_CANARY_NODE:-auto}"
log "harbor-host-label-key=${HARBOR_HOST_LABEL_KEY}"
log "expected-flux-url=${EXPECTED_FLUX_URL}"
log "expected-flux-branch=${EXPECTED_FLUX_BRANCH}"
log "startup-optional-kustomizations=${STARTUP_OPTIONAL_KUSTOMIZATIONS:-none}"

# Runs for every mode, before the mode-specific flow.
report_flux_source_state

case "${MODE}" in
  status)
    status_report
    ;;
  prepare)
    prepare_flow
    ;;
  # The three seed aliases share a single flow.
  bootstrap-seed | harbor-seed | longhorn-seed)
    harbor_seed_flow
    ;;
  longhorn-unlock)
    longhorn_unlock_flow
    ;;
  shutdown)
    planned_shutdown
    ;;
  startup)
    startup_flow
    ;;
  *)
    # Defensive default: argument parsing (not visible in this section) is
    # expected to reject unknown modes, but if one slips through, fail loudly
    # instead of exiting 0 after doing nothing.
    die "Unsupported mode: ${MODE}"
    ;;
esac