#!/usr/bin/env bash
set -euo pipefail

# Resolve the script's own directory and the repo root (overridable via env).
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
REPO_DIR="${ANANKE_REPO_DIR:-$(cd "${SCRIPT_DIR}/.." && pwd)}"

BOOTSTRAP_DIR="${SCRIPT_DIR}/bootstrap"
CONFIG_FILE="${BOOTSTRAP_DIR}/recovery-config.env"
# Optional site-local overrides for the defaults defined below.
if [[ -f "${CONFIG_FILE}" ]]; then
  # shellcheck disable=SC1090
  source "${CONFIG_FILE}"
fi
# Fall back to a kubeconfig shipped next to the script when none is configured.
if [[ -z "${KUBECONFIG:-}" && -f "${SCRIPT_DIR}/kubeconfig" ]]; then
  export KUBECONFIG="${SCRIPT_DIR}/kubeconfig"
fi
# Print CLI help. Defaults shown reflect current env overrides where set.
usage() {
  cat <<USAGE
Usage:
  scripts/cluster_power_recovery.sh <prepare|status|harbor-seed|shutdown|startup> [options]

Options:
  --execute                     Actually run commands (default is dry-run)
  --shutdown-mode <mode>        Shutdown behavior: host-poweroff or cluster-only (default: ${SHUTDOWN_MODE:-host-poweroff})
  --expected-flux-branch <name> Expected Flux source branch during startup checks (default: ${DEFAULT_FLUX_BRANCH:-main})
  --expected-flux-url <url>     Expected Flux source URL during startup checks
  --allow-flux-source-mutation  Required to allow --force-flux-url during startup
  --force-flux-url <url>        Startup: patch flux-system GitRepository URL to this value
  --force-flux-branch <name>    Startup: patch flux-system GitRepository branch to this value
  --skip-etcd-snapshot          Shutdown: skip etcd snapshot before shutdown
  --skip-drain                  Shutdown: skip worker drain during shutdown
  --skip-local-bootstrap        Startup: skip local bootstrap fallback applies
  --skip-harbor-bootstrap       Startup: skip Harbor recovery bootstrap stage
  --skip-harbor-seed            Startup: skip Harbor image seed/import stage
  --skip-helper-prewarm         Prepare/Shutdown/Startup: skip node-helper prewarm
  --min-startup-battery <pct>   Minimum UPS percent required before bootstrap (default: 35)
  --ups-host <name>             UPS identifier for upsc (default: ups@localhost)
  --ups-battery-key <key>       UPS battery key for upsc (default: battery.charge)
  --recovery-state-file <path>  Recovery state file for outage-aware restart logic
  --replica-snapshot-file <path>
                                File used to persist workload replica snapshot across shutdown/startup
  --harbor-bundle-file <path>   Harbor bootstrap bundle on the control host
  --harbor-target-node <name>   Node that should host Harbor during bootstrap (default: auto)
  --harbor-canary-node <name>   Node used for Harbor pull canary (default: auto)
  --harbor-host-label-key <key> Node label key used to pin Harbor bootstrap workloads (default: ${HARBOR_HOST_LABEL_KEY:-ananke.bstein.dev/harbor-bootstrap})
  --harbor-canary-image <image> Harbor-backed image used for pull canary (default: ${HARBOR_CANARY_IMAGE:-registry.bstein.dev/bstein/kubectl:1.35.0})
  --node-helper-image <image>   Privileged helper image used for host operations (default: ${NODE_HELPER_IMAGE:-registry.bstein.dev/bstein/ananke-node-helper:0.1.0})
  --bundle-http-port <port>     Temporary HTTP port used to serve bootstrap bundles (default: ${BUNDLE_HTTP_PORT:-8877})
  --api-wait-timeout <seconds>  Startup: Kubernetes API wait timeout (default: 600)
  --drain-timeout <seconds>     Worker drain timeout for normal shutdown (default: 180)
  --emergency-drain-timeout <seconds>
                                Worker drain timeout for emergency fallback (default: 45)
  --flux-ready-timeout <seconds>
                                Startup: max time to wait for Flux kustomizations Ready (default: 1200)
  --startup-checklist-timeout <seconds>
                                Startup: max time to wait for external service checklist (default: 900)
  --startup-workload-timeout <seconds>
                                Startup: max time to wait for workload readiness checks (default: 900)
  --startup-stability-window <seconds>
                                Startup: continuous healthy window required before success (default: 180)
  --startup-stability-timeout <seconds>
                                Startup: max time allowed to achieve the healthy window (default: 900)
  --require-ups-battery         Hard-fail startup if UPS battery cannot be read
  -h, --help                    Show help

Examples:
  scripts/cluster_power_recovery.sh prepare --execute
  scripts/cluster_power_recovery.sh harbor-seed --execute
  scripts/cluster_power_recovery.sh status
  scripts/cluster_power_recovery.sh shutdown --execute
  scripts/cluster_power_recovery.sh startup --execute --force-flux-branch main
USAGE
}
# First positional argument selects the operating mode; remaining args are options.
MODE="${1:-}"
if [[ -z "${MODE}" || "${MODE}" == "-h" || "${MODE}" == "--help" ]]; then
  usage
  exit 0
fi
shift || true

case "${MODE}" in
  prepare|status|harbor-seed|shutdown|startup) ;;
  *)
    echo "Unknown mode: ${MODE}" >&2
    usage
    exit 1
    ;;
esac
# ---------------------------------------------------------------------------
# Defaults (each env-overridable value honors an existing environment setting,
# e.g. from recovery-config.env sourced above).
# ---------------------------------------------------------------------------
EXECUTE=0
SHUTDOWN_MODE="${SHUTDOWN_MODE:-host-poweroff}"
EXPECTED_FLUX_BRANCH="${DEFAULT_FLUX_BRANCH:-main}"
EXPECTED_FLUX_URL="${EXPECTED_FLUX_URL:-ssh://git@scm.bstein.dev:2242/bstein/titan-iac.git}"
ALLOW_FLUX_SOURCE_MUTATION=0
FORCE_FLUX_URL=""
FORCE_FLUX_BRANCH=""
SKIP_ETCD_SNAPSHOT=0
SKIP_DRAIN=0
SKIP_LOCAL_BOOTSTRAP=0
SKIP_HARBOR_BOOTSTRAP=0
SKIP_HARBOR_SEED=0
SKIP_HELPER_PREWARM=0
UPS_HOST="${UPS_HOST:-ups@localhost}"
UPS_BATTERY_KEY="${UPS_BATTERY_KEY:-battery.charge}"
MIN_STARTUP_BATTERY="${MIN_STARTUP_BATTERY:-35}"
REQUIRE_UPS_BATTERY="${REQUIRE_UPS_BATTERY:-0}"
DRAIN_TIMEOUT_SECONDS=180
EMERGENCY_DRAIN_TIMEOUT_SECONDS=45
API_WAIT_TIMEOUT_SECONDS=600
FLUX_READY_TIMEOUT_SECONDS="${FLUX_READY_TIMEOUT_SECONDS:-1200}"
FLUX_READY_POLL_SECONDS="${FLUX_READY_POLL_SECONDS:-10}"
STARTUP_CHECKLIST_TIMEOUT_SECONDS="${STARTUP_CHECKLIST_TIMEOUT_SECONDS:-900}"
STARTUP_CHECKLIST_POLL_SECONDS="${STARTUP_CHECKLIST_POLL_SECONDS:-10}"
STARTUP_WORKLOAD_TIMEOUT_SECONDS="${STARTUP_WORKLOAD_TIMEOUT_SECONDS:-900}"
STARTUP_WORKLOAD_POLL_SECONDS="${STARTUP_WORKLOAD_POLL_SECONDS:-10}"
STARTUP_STABILITY_WINDOW_SECONDS="${STARTUP_STABILITY_WINDOW_SECONDS:-180}"
STARTUP_STABILITY_TIMEOUT_SECONDS="${STARTUP_STABILITY_TIMEOUT_SECONDS:-900}"
STARTUP_STABILITY_POLL_SECONDS="${STARTUP_STABILITY_POLL_SECONDS:-10}"
STARTUP_IGNORE_PODS_REGEX="${STARTUP_IGNORE_PODS_REGEX:-}"
STARTUP_IGNORE_WORKLOADS_REGEX="${STARTUP_IGNORE_WORKLOADS_REGEX:-}"
STARTUP_WORKLOAD_NAMESPACE_EXCLUDES_REGEX="${STARTUP_WORKLOAD_NAMESPACE_EXCLUDES_REGEX:-^(kube-system|kube-public|kube-node-lease|flux-system)$}"
STARTUP_OPTIONAL_KUSTOMIZATIONS="${STARTUP_OPTIONAL_KUSTOMIZATIONS:-}"
STARTUP_SERVICE_CHECK_TIMEOUT_SECONDS="${STARTUP_SERVICE_CHECK_TIMEOUT_SECONDS:-10}"
STARTUP_SERVICE_CHECKLIST="${STARTUP_SERVICE_CHECKLIST:-}"
STARTUP_INCLUDE_INGRESS_CHECKS="${STARTUP_INCLUDE_INGRESS_CHECKS:-1}"
STARTUP_INGRESS_ALLOWED_STATUSES="${STARTUP_INGRESS_ALLOWED_STATUSES:-200,301,302,307,308,401,403,404}"
STARTUP_IGNORE_INGRESS_HOSTS_REGEX="${STARTUP_IGNORE_INGRESS_HOSTS_REGEX:-}"
STARTUP_INGRESS_CHECK_TIMEOUT_SECONDS="${STARTUP_INGRESS_CHECK_TIMEOUT_SECONDS:-10}"
# Namespaces that must never be scaled down on shutdown (cluster plumbing).
SHUTDOWN_NAMESPACE_EXCLUDES_REGEX="${SHUTDOWN_NAMESPACE_EXCLUDES_REGEX:-^(kube-system|kube-public|kube-node-lease|flux-system|traefik|metallb-system|cert-manager|longhorn-system|vault|postgres|maintenance)$}"
BUNDLE_HTTP_PORT="${BUNDLE_HTTP_PORT:-8877}"
# On-disk state used to survive outages across shutdown/startup invocations.
STATE_ROOT="${HOME}/${STATE_SUBDIR:-.local/share/ananke}"
RECOVERY_STATE_FILE="${STATE_ROOT}/cluster_power_recovery.state"
REPLICA_SNAPSHOT_FILE="${STATE_ROOT}/desired_workload_replicas.tsv"
HARBOR_BUNDLE_FILE="${STATE_ROOT}/bundles/${HARBOR_BUNDLE_BASENAME:-harbor-bootstrap-v2.14.1-arm64.tar.zst}"
HARBOR_TARGET_NODE="${HARBOR_TARGET_NODE:-}"
HARBOR_CANARY_NODE="${HARBOR_CANARY_NODE:-}"
HARBOR_HOST_LABEL_KEY="${HARBOR_HOST_LABEL_KEY:-ananke.bstein.dev/harbor-bootstrap}"
HARBOR_CANARY_IMAGE="${HARBOR_CANARY_IMAGE:-registry.bstein.dev/bstein/kubectl:1.35.0}"
NODE_HELPER_IMAGE="${NODE_HELPER_IMAGE:-registry.bstein.dev/bstein/ananke-node-helper:0.1.0}"
NODE_HELPER_NAMESPACE="${NODE_HELPER_NAMESPACE:-maintenance}"
NODE_HELPER_SERVICE_ACCOUNT="${NODE_HELPER_SERVICE_ACCOUNT:-default}"
NODE_HELPER_PREWARM_DS="${NODE_HELPER_PREWARM_DS:-ananke-node-helper-prewarm}"
REGISTRY_PULL_SECRET="${REGISTRY_PULL_SECRET:-harbor-regcred}"
KEEP_PREWARM_DAEMONSET=0
# Runtime bookkeeping (mutated during execution, not user-configurable).
RECOVERY_PENDING=0
STARTUP_ATTEMPTED_DURING_OUTAGE=0
LAST_CHECKPOINT="none"
BUNDLE_SERVER_PID=""
UPS_HOST_IN_USE=""
# ---------------------------------------------------------------------------
# Option parsing. Options requiring a value use ${2:?...} so a missing value
# aborts with a clear message under `set -u`.
# ---------------------------------------------------------------------------
while [[ $# -gt 0 ]]; do
  case "$1" in
    --execute)
      EXECUTE=1
      shift
      ;;
    --shutdown-mode)
      SHUTDOWN_MODE="${2:?missing shutdown mode}"
      shift 2
      ;;
    --expected-flux-branch)
      EXPECTED_FLUX_BRANCH="${2:?missing branch}"
      shift 2
      ;;
    --expected-flux-url)
      EXPECTED_FLUX_URL="${2:?missing flux url}"
      shift 2
      ;;
    --allow-flux-source-mutation)
      ALLOW_FLUX_SOURCE_MUTATION=1
      shift
      ;;
    --force-flux-url)
      FORCE_FLUX_URL="${2:?missing flux url}"
      shift 2
      ;;
    --force-flux-branch)
      FORCE_FLUX_BRANCH="${2:?missing branch}"
      shift 2
      ;;
    --skip-etcd-snapshot)
      SKIP_ETCD_SNAPSHOT=1
      shift
      ;;
    --skip-drain)
      SKIP_DRAIN=1
      shift
      ;;
    --skip-local-bootstrap)
      SKIP_LOCAL_BOOTSTRAP=1
      shift
      ;;
    --skip-harbor-bootstrap)
      SKIP_HARBOR_BOOTSTRAP=1
      shift
      ;;
    --skip-harbor-seed)
      SKIP_HARBOR_SEED=1
      shift
      ;;
    --skip-helper-prewarm)
      SKIP_HELPER_PREWARM=1
      shift
      ;;
    --ups-host)
      UPS_HOST="${2:?missing ups host}"
      shift 2
      ;;
    --ups-battery-key)
      UPS_BATTERY_KEY="${2:?missing ups key}"
      shift 2
      ;;
    --min-startup-battery)
      MIN_STARTUP_BATTERY="${2:?missing battery threshold}"
      shift 2
      ;;
    --require-ups-battery)
      REQUIRE_UPS_BATTERY=1
      shift
      ;;
    --recovery-state-file)
      RECOVERY_STATE_FILE="${2:?missing state file path}"
      shift 2
      ;;
    --replica-snapshot-file)
      REPLICA_SNAPSHOT_FILE="${2:?missing replica snapshot file path}"
      shift 2
      ;;
    --harbor-bundle-file)
      HARBOR_BUNDLE_FILE="${2:?missing bundle file path}"
      shift 2
      ;;
    --harbor-target-node)
      HARBOR_TARGET_NODE="${2:?missing harbor target node}"
      shift 2
      ;;
    --harbor-canary-node)
      HARBOR_CANARY_NODE="${2:?missing harbor canary node}"
      shift 2
      ;;
    --harbor-host-label-key)
      HARBOR_HOST_LABEL_KEY="${2:?missing harbor host label key}"
      shift 2
      ;;
    --harbor-canary-image)
      HARBOR_CANARY_IMAGE="${2:?missing canary image}"
      shift 2
      ;;
    --node-helper-image)
      NODE_HELPER_IMAGE="${2:?missing node helper image}"
      shift 2
      ;;
    --bundle-http-port)
      BUNDLE_HTTP_PORT="${2:?missing bundle http port}"
      shift 2
      ;;
    --api-wait-timeout)
      API_WAIT_TIMEOUT_SECONDS="${2:?missing api wait timeout}"
      shift 2
      ;;
    --flux-ready-timeout)
      FLUX_READY_TIMEOUT_SECONDS="${2:?missing flux ready timeout}"
      shift 2
      ;;
    --startup-checklist-timeout)
      STARTUP_CHECKLIST_TIMEOUT_SECONDS="${2:?missing startup checklist timeout}"
      shift 2
      ;;
    --startup-workload-timeout)
      STARTUP_WORKLOAD_TIMEOUT_SECONDS="${2:?missing startup workload timeout}"
      shift 2
      ;;
    --startup-stability-window)
      STARTUP_STABILITY_WINDOW_SECONDS="${2:?missing startup stability window}"
      shift 2
      ;;
    --startup-stability-timeout)
      STARTUP_STABILITY_TIMEOUT_SECONDS="${2:?missing startup stability timeout}"
      shift 2
      ;;
    --drain-timeout)
      DRAIN_TIMEOUT_SECONDS="${2:?missing drain timeout}"
      shift 2
      ;;
    --emergency-drain-timeout)
      EMERGENCY_DRAIN_TIMEOUT_SECONDS="${2:?missing emergency drain timeout}"
      shift 2
      ;;
    -h|--help)
      usage
      exit 0
      ;;
    *)
      echo "Unknown option: $1" >&2
      usage
      exit 1
      ;;
  esac
done
# Validate option combinations before doing anything.
case "${SHUTDOWN_MODE}" in
  host-poweroff|cluster-only) ;;
  *)
    echo "Invalid --shutdown-mode '${SHUTDOWN_MODE}'. Expected host-poweroff or cluster-only." >&2
    exit 1
    ;;
esac
# Mutating the Flux source URL is breakglass-only; require explicit opt-in.
if [[ -n "${FORCE_FLUX_URL}" && "${ALLOW_FLUX_SOURCE_MUTATION}" -ne 1 ]]; then
  echo "--force-flux-url requires --allow-flux-source-mutation (breakglass)." >&2
  exit 1
fi
# Abort with a clear message when a required external command is missing.
require_cmd() {
  local cmd="$1"
  if ! command -v "${cmd}" >/dev/null 2>&1; then
    echo "Missing required command: ${cmd}" >&2
    exit 1
  fi
}
# Fail fast if the tools every mode depends on are absent.
require_cmd kubectl
require_cmd bash
require_cmd base64
require_cmd curl

# Logging helpers: log -> stdout, warn/die -> stderr; die terminates.
log() { echo "[cluster-power] $*"; }
warn() { echo "[cluster-power][warn] $*" >&2; }
die() { echo "[cluster-power][error] $*" >&2; exit 1; }
# Execute a command directly, or just log it when in dry-run mode.
run() {
  if [[ "${EXECUTE}" -eq 1 ]]; then
    log "EXEC: $*"
    "$@"
  else
    log "DRY-RUN: $*"
  fi
}

# Like run(), but the argument is a shell snippet (allows pipes / `|| true`).
run_shell() {
  if [[ "${EXECUTE}" -eq 1 ]]; then
    log "EXEC: $*"
    bash -lc "$*"
  else
    log "DRY-RUN: $*"
  fi
}
# Render a kustomization under REPO_DIR and apply it to the cluster.
# Honors dry-run mode; LoadRestrictionsNone permits cross-directory bases.
apply_kustomization() {
  local path="$1"
  local full_path="${REPO_DIR}/${path}"
  if [[ "${EXECUTE}" -eq 1 ]]; then
    log "EXEC: kubectl kustomize ${full_path} --load-restrictor=LoadRestrictionsNone | kubectl apply -f -"
    kubectl kustomize "${full_path}" --load-restrictor=LoadRestrictionsNone | kubectl apply -f -
  else
    log "DRY-RUN: kubectl kustomize ${full_path} --load-restrictor=LoadRestrictionsNone | kubectl apply -f -"
  fi
}
# Lowercase a string and squeeze every non [a-z0-9-] run into a single '-'
# (produces Kubernetes-name-safe identifiers).
sanitize_name() {
  printf '%s' "$1" | tr '[:upper:]' '[:lower:]' | tr -cs 'a-z0-9-' '-'
}
# Directory holding the recovery state file.
state_dir() {
  dirname "${RECOVERY_STATE_FILE}"
}
# Load key=value pairs from the recovery state file into the globals
# RECOVERY_PENDING / STARTUP_ATTEMPTED_DURING_OUTAGE / LAST_CHECKPOINT.
# Missing file leaves the defaults in place.
load_recovery_state() {
  RECOVERY_PENDING=0
  STARTUP_ATTEMPTED_DURING_OUTAGE=0
  LAST_CHECKPOINT="none"
  [[ -f "${RECOVERY_STATE_FILE}" ]] || return 0
  local key value
  while IFS='=' read -r key value; do
    case "${key}" in
      recovery_pending) RECOVERY_PENDING="${value}" ;;
      startup_attempted) STARTUP_ATTEMPTED_DURING_OUTAGE="${value}" ;;
      last_checkpoint) LAST_CHECKPOINT="${value}" ;;
    esac
  done < "${RECOVERY_STATE_FILE}"
}
# Persist recovery state (no-op in dry-run). Args: pending, attempted, checkpoint.
save_recovery_state() {
  [[ "${EXECUTE}" -eq 1 ]] || return 0
  mkdir -p "$(state_dir)"
  cat > "${RECOVERY_STATE_FILE}" <<STATE
recovery_pending=${1}
startup_attempted=${2}
last_checkpoint=${3}
STATE
}

# Record a named checkpoint so an interrupted run can resume from it.
mark_checkpoint() {
  LAST_CHECKPOINT="$1"
  save_recovery_state "${RECOVERY_PENDING}" "${STARTUP_ATTEMPTED_DURING_OUTAGE}" "${LAST_CHECKPOINT}"
}

# Remove the persisted state once recovery has completed (no-op in dry-run).
clear_recovery_state() {
  [[ "${EXECUTE}" -eq 1 ]] || return 0
  rm -f "${RECOVERY_STATE_FILE}" 2>/dev/null || true
  LAST_CHECKPOINT="none"
}
# Normalize raw upsc output (e.g. "battery.charge: 87.6") to an integer percent.
# Returns 1 when no integer can be extracted.
sanitize_battery_percent() {
  local raw="$1"
  raw="${raw##*:}"            # drop any "key:" prefix
  raw="${raw//[[:space:]]/}"  # strip whitespace
  raw="${raw%%.*}"            # truncate fractional part
  [[ "${raw}" =~ ^[0-9]+$ ]] || return 1
  printf '%s' "${raw}"
}
# Emit candidate UPS identifiers for upsc, one per line, deduplicated:
# the configured UPS_HOST first, then every UPS `upsc -l` reports
# (both as "name@localhost" and bare "name").
candidate_ups_hosts() {
  local candidate name
  local -A seen=()
  if [[ -n "${UPS_HOST}" ]]; then
    seen["${UPS_HOST}"]=1
    echo "${UPS_HOST}"
  fi
  while IFS= read -r name; do
    [[ -n "${name}" ]] || continue
    for candidate in "${name}@localhost" "${name}"; do
      [[ -n "${seen[${candidate}]+x}" ]] && continue
      seen["${candidate}"]=1
      echo "${candidate}"
    done
  done < <(upsc -l 2>/dev/null || true)
}
# Query each candidate UPS host until a parseable battery percentage is found.
# On success prints the integer percent, records the host in UPS_HOST_IN_USE,
# and returns 0; returns 1 when upsc is unavailable or no host answers.
read_ups_battery() {
  if ! command -v upsc >/dev/null 2>&1; then
    return 1
  fi
  local host raw parsed
  while IFS= read -r host; do
    raw="$(upsc "${host}" "${UPS_BATTERY_KEY}" 2>/dev/null || true)"
    [[ -n "${raw}" ]] || continue
    parsed="$(sanitize_battery_percent "${raw}" || true)"
    [[ -n "${parsed}" ]] || continue
    UPS_HOST_IN_USE="${host}"
    printf '%s' "${parsed}"
    return 0
  done < <(candidate_ups_hosts)
  return 1
}
# Gate bootstrap on UPS charge. Unreadable battery is fatal only when
# --require-ups-battery was given; otherwise we proceed with a warning.
ensure_minimum_battery_for_bootstrap() {
  local battery
  battery="$(read_ups_battery || true)"
  if [[ -z "${battery}" ]]; then
    if [[ "${REQUIRE_UPS_BATTERY}" -eq 1 ]]; then
      warn "Unable to read UPS battery status and --require-ups-battery is set."
      return 1
    fi
    warn "Unable to read UPS battery status; continuing without hard battery gating."
    return 0
  fi
  log "ups-battery=${battery}% host=${UPS_HOST_IN_USE:-${UPS_HOST}}"
  if (( battery < MIN_STARTUP_BATTERY )); then
    warn "UPS battery ${battery}% below minimum startup threshold ${MIN_STARTUP_BATTERY}%."
    return 1
  fi
  return 0
}
# Log the current Flux GitRepository URL and branch (best-effort; silent
# for fields that cannot be read).
report_flux_source_state() {
  local flux_url flux_branch
  flux_url="$(kubectl -n flux-system get gitrepository flux-system -o jsonpath='{.spec.url}' 2>/dev/null || true)"
  flux_branch="$(kubectl -n flux-system get gitrepository flux-system -o jsonpath='{.spec.ref.branch}' 2>/dev/null || true)"
  [[ -n "${flux_url}" ]] && log "flux-source-url=${flux_url}"
  if [[ -n "${flux_branch}" ]]; then
    log "flux-source-branch=${flux_branch}"
  fi
}
# Return 0 when the comma-separated list in $1 contains exactly the value $2.
csv_has_value() {
  local csv="$1"
  local value="$2"
  local needle=",${value},"
  local haystack=",${csv},"
  [[ "${haystack}" == *"${needle}"* ]]
}
# Refuse to continue startup if the live Flux GitRepository drifted from the
# expected URL/branch. Skipped entirely in dry-run; branch drift is tolerated
# only when --force-flux-branch will correct it.
assert_flux_source_expected() {
  if [[ "${EXECUTE}" -eq 0 ]]; then
    log "DRY-RUN: skipping strict Flux source drift guard"
    return 0
  fi
  local flux_url flux_branch
  flux_url="$(kubectl -n flux-system get gitrepository flux-system -o jsonpath='{.spec.url}' 2>/dev/null || true)"
  flux_branch="$(kubectl -n flux-system get gitrepository flux-system -o jsonpath='{.spec.ref.branch}' 2>/dev/null || true)"
  [[ -n "${flux_url}" ]] || die "Unable to read Flux source URL from flux-system/gitrepository."
  [[ -n "${flux_branch}" ]] || die "Unable to read Flux source branch from flux-system/gitrepository."
  if [[ -n "${EXPECTED_FLUX_URL}" && "${flux_url}" != "${EXPECTED_FLUX_URL}" ]]; then
    die "Flux source URL drift detected: got '${flux_url}', expected '${EXPECTED_FLUX_URL}'. Refusing startup."
  fi
  if [[ -z "${FORCE_FLUX_BRANCH}" && "${flux_branch}" != "${EXPECTED_FLUX_BRANCH}" ]]; then
    die "Flux source branch drift detected: got '${flux_branch}', expected '${EXPECTED_FLUX_BRANCH}'. Use --force-flux-branch to correct."
  fi
}
# True when the named kustomization is in the optional list and therefore
# exempt from readiness gating.
kustomization_is_optional() {
  local name="$1"
  [[ -n "${STARTUP_OPTIONAL_KUSTOMIZATIONS}" ]] || return 1
  csv_has_value "${STARTUP_OPTIONAL_KUSTOMIZATIONS}" "${name}"
}
# Emit "name|message" lines for every non-optional Flux Kustomization whose
# Ready condition is not True. Empty output means everything is ready.
# NOTE: the original stripped the NAME/READY prefixes with ${var#pattern},
# but the READY strip never matched (leading space), leaving the status token
# in the message; the sed below strips the first two columns correctly.
list_not_ready_kustomizations() {
  local rows line name ready message
  rows="$(kubectl -n flux-system get kustomizations.kustomize.toolkit.fluxcd.io \
    -o 'custom-columns=NAME:.metadata.name,READY:.status.conditions[?(@.type=="Ready")].status,MESSAGE:.status.conditions[?(@.type=="Ready")].message' \
    --no-headers 2>/dev/null || true)"
  [[ -n "${rows}" ]] || return 0
  while IFS= read -r line; do
    [[ -n "${line}" ]] || continue
    name="$(awk '{print $1}' <<< "${line}")"
    ready="$(awk '{print $2}' <<< "${line}")"
    # Everything after the first two whitespace-separated columns.
    message="$(sed -E 's/^[^[:space:]]+[[:space:]]+[^[:space:]]+[[:space:]]*//' <<< "${line}")"
    if kustomization_is_optional "${name}"; then
      continue
    fi
    if [[ "${ready}" != "True" ]]; then
      printf '%s|%s\n' "${name}" "${message}"
    fi
  done <<< "${rows}"
}
# Request reconciliation of every Flux Kustomization (via the requestedAt
# annotation) and, when the flux CLI is available, the git source too.
trigger_flux_reconcile_all() {
  local now
  now="$(date --iso-8601=seconds)"
  run kubectl -n flux-system annotate kustomizations.kustomize.toolkit.fluxcd.io --all reconcile.fluxcd.io/requestedAt="${now}" --overwrite
  if command -v flux >/dev/null 2>&1; then
    run flux reconcile source git flux-system -n flux-system --timeout=3m
  fi
}
# Delete failed Jobs owned by Flux (Kustomization or HelmRelease label) so a
# re-reconcile can recreate them — works around immutable Job-template drift.
# Returns 0 only if at least one Job was deleted.
heal_failed_flux_jobs() {
  local rows line ns name failed flux_owner helm_owner healed
  healed=0
  rows="$(kubectl get jobs.batch -A \
    -o custom-columns=NS:.metadata.namespace,NAME:.metadata.name,FAILED:.status.failed,FLUX_OWNER:.metadata.labels.kustomize\\.toolkit\\.fluxcd\\.io/name,HELM_OWNER:.metadata.labels.helm\\.toolkit\\.fluxcd\\.io/name \
    --no-headers 2>/dev/null || true)"
  [[ -n "${rows}" ]] || return 1
  while IFS= read -r line; do
    [[ -n "${line}" ]] || continue
    ns="$(awk '{print $1}' <<< "${line}")"
    name="$(awk '{print $2}' <<< "${line}")"
    failed="$(awk '{print $3}' <<< "${line}")"
    flux_owner="$(awk '{print $4}' <<< "${line}")"
    helm_owner="$(awk '{print $5}' <<< "${line}")"
    [[ "${failed}" != "<none>" ]] || continue
    [[ "${failed}" =~ ^[0-9]+$ ]] || continue
    (( failed > 0 )) || continue
    # Only touch Jobs that Flux owns; leave everything else alone.
    if [[ "${flux_owner}" == "<none>" && "${helm_owner}" == "<none>" ]]; then
      continue
    fi
    warn "Deleting failed Flux-managed Job ${ns}/${name} to heal immutable-template drift."
    run kubectl -n "${ns}" delete job "${name}" --ignore-not-found
    healed=1
  done <<< "${rows}"
  (( healed == 1 ))
}
# Block until every non-optional Flux Kustomization is Ready, attempting
# automated cleanup of failed Jobs (max 3 times) when the status messages
# suggest an immutable-field error. Dies on timeout; no-op in dry-run.
wait_for_flux_kustomizations_ready() {
  if [[ "${EXECUTE}" -eq 0 ]]; then
    log "DRY-RUN: skipping wait for all Flux kustomizations Ready"
    return 0
  fi
  local start now not_ready immutable_hits line
  start="$(date +%s)"
  immutable_hits=0
  while true; do
    not_ready="$(list_not_ready_kustomizations || true)"
    if [[ -z "${not_ready}" ]]; then
      log "flux-kustomizations=all-ready"
      return 0
    fi
    log "flux-kustomizations-not-ready:"
    while IFS= read -r line; do
      [[ -n "${line}" ]] || continue
      log "  ${line}"
    done <<< "${not_ready}"
    # Immutable Job errors never self-heal; delete the Jobs and re-reconcile.
    if grep -Eqi 'immutable|field is immutable|cannot patch.*Job|Job.*invalid' <<< "${not_ready}"; then
      if (( immutable_hits < 3 )); then
        immutable_hits=$(( immutable_hits + 1 ))
        warn "Detected immutable Job failure signal in Flux status. Attempting automated Job cleanup (${immutable_hits}/3)."
        if heal_failed_flux_jobs; then
          trigger_flux_reconcile_all
        fi
      fi
    fi
    now="$(date +%s)"
    if (( now - start >= FLUX_READY_TIMEOUT_SECONDS )); then
      die "Timed out waiting for Flux kustomizations Ready after ${FLUX_READY_TIMEOUT_SECONDS}s."
    fi
    sleep "${FLUX_READY_POLL_SECONDS}"
  done
}
# Built-in service checklist, one check per line:
#   name|url|allowed_statuses|body_must_contain|body_must_not_contain|insecure|timeout
# NOTE: original heredoc opened with a broken delimiter ('CHEC KS') that could
# never match the closing CHECKS line; fixed here.
default_startup_service_checklist() {
  cat <<'CHECKS'
gitea|https://scm.bstein.dev/api/healthz|200|"status":"pass"||
grafana|https://metrics.bstein.dev/api/health|200|"database":"ok"||
harbor|https://registry.bstein.dev/v2/|200,401|||
CHECKS
}
# Unique, sorted list of every hostname referenced by any Ingress rule.
list_ingress_hosts() {
  kubectl get ingress -A -o jsonpath='{range .items[*]}{range .spec.rules[*]}{.host}{"\n"}{end}{end}' 2>/dev/null \
    | sed '/^[[:space:]]*$/d' \
    | sort -u
}
# Generate one checklist row per Ingress host (https GET against /),
# skipping hosts matching STARTUP_IGNORE_INGRESS_HOSTS_REGEX.
generated_ingress_service_checks() {
  local host
  while IFS= read -r host; do
    [[ -n "${host}" ]] || continue
    if [[ -n "${STARTUP_IGNORE_INGRESS_HOSTS_REGEX}" ]] && [[ "${host}" =~ ${STARTUP_IGNORE_INGRESS_HOSTS_REGEX} ]]; then
      continue
    fi
    printf 'ingress-%s|https://%s/|%s|||0|%s\n' "${host}" "${host}" "${STARTUP_INGRESS_ALLOWED_STATUSES}" "${STARTUP_INGRESS_CHECK_TIMEOUT_SECONDS}"
  done < <(list_ingress_hosts)
}
# Full checklist: user-provided rows (semicolon-separated) or the defaults,
# plus autogenerated per-Ingress checks when enabled.
startup_service_checklist_rows() {
  local base
  if [[ -n "${STARTUP_SERVICE_CHECKLIST}" ]]; then
    base="$(printf '%s' "${STARTUP_SERVICE_CHECKLIST}" | tr ';' '\n')"
  else
    base="$(default_startup_service_checklist)"
  fi
  printf '%s\n' "${base}" | sed '/^[[:space:]]*$/d'
  if [[ "${STARTUP_INCLUDE_INGRESS_CHECKS}" == "1" || "${STARTUP_INCLUDE_INGRESS_CHECKS}" == "true" ]]; then
    generated_ingress_service_checks
  fi
}
# True when HTTP status $2 appears in the comma-separated allow-list $1.
service_status_allowed() {
  local expected_csv="$1"
  local got="$2"
  local token
  local -a _statuses
  IFS=',' read -r -a _statuses <<< "${expected_csv}"
  for token in "${_statuses[@]}"; do
    if [[ "${token}" == "${got}" ]]; then
      return 0
    fi
  done
  return 1
}
# Run every checklist row once; returns 0 only when all checks pass.
# Each row: name|url|allowed_statuses|body_must|body_must_not|insecure|timeout.
# FIX: the original captured curl's exit code with `code="$(curl … || rc=$?)"`,
# which assigned rc inside the command-substitution subshell, so transport
# failures were never detected; the exit status is now checked in the parent.
check_startup_service_checklist_once() {
  local rows row name url expected body_must body_must_not insecure timeout code rc
  local body_file failures
  failures=0
  rows="$(startup_service_checklist_rows)"
  while IFS= read -r row; do
    [[ -n "${row}" ]] || continue
    IFS='|' read -r name url expected body_must body_must_not insecure timeout <<< "${row}"
    [[ -n "${name}" && -n "${url}" && -n "${expected}" ]] || continue
    [[ -n "${insecure}" ]] || insecure=0
    [[ -n "${timeout}" ]] || timeout="${STARTUP_SERVICE_CHECK_TIMEOUT_SECONDS}"
    body_file="$(mktemp)"
    rc=0
    if [[ "${insecure}" == "1" || "${insecure}" == "true" ]]; then
      code="$(curl -ksS --max-time "${timeout}" -o "${body_file}" -w '%{http_code}' "${url}")" || rc=$?
    else
      code="$(curl -sS --max-time "${timeout}" -o "${body_file}" -w '%{http_code}' "${url}")" || rc=$?
    fi
    if (( rc != 0 )); then
      warn "startup-check ${name}: request failed (rc=${rc}) url=${url}"
      failures=1
    elif ! service_status_allowed "${expected}" "${code}"; then
      warn "startup-check ${name}: expected status ${expected}, got ${code} url=${url}"
      failures=1
    elif [[ -n "${body_must}" ]] && ! grep -Fq -- "${body_must}" "${body_file}"; then
      warn "startup-check ${name}: missing required body fragment '${body_must}'"
      failures=1
    elif [[ -n "${body_must_not}" ]] && grep -Fq -- "${body_must_not}" "${body_file}"; then
      warn "startup-check ${name}: forbidden body fragment '${body_must_not}' present"
      failures=1
    fi
    rm -f "${body_file}"
  done <<< "${rows}"
  (( failures == 0 ))
}
# Poll until the external service checklist passes AND no workload is
# unhealthy, or die after STARTUP_CHECKLIST_TIMEOUT_SECONDS. No-op in dry-run.
wait_for_startup_service_checklist() {
  if [[ "${EXECUTE}" -eq 0 ]]; then
    log "DRY-RUN: skipping startup external service checklist wait"
    return 0
  fi
  local start now checklist_ok workloads_ok
  start="$(date +%s)"
  while true; do
    checklist_ok=0
    workloads_ok=0
    if check_startup_service_checklist_once; then
      checklist_ok=1
    fi
    if list_unhealthy_workloads | sed '/^[[:space:]]*$/d' | grep -q .; then
      workloads_ok=0
    else
      workloads_ok=1
    fi
    if (( checklist_ok == 1 && workloads_ok == 1 )); then
      log "startup-checklist=all-passed"
      return 0
    fi
    if (( workloads_ok == 0 )); then
      warn "startup-checklist: workloads are not fully ready yet."
    fi
    now="$(date +%s)"
    if (( now - start >= STARTUP_CHECKLIST_TIMEOUT_SECONDS )); then
      die "Timed out waiting for startup external checklist after ${STARTUP_CHECKLIST_TIMEOUT_SECONDS}s."
    fi
    sleep "${STARTUP_CHECKLIST_POLL_SECONDS}"
  done
}
# Emit "ns/pod|STATUS" for every pod in a crash/image-error state, minus any
# matching STARTUP_IGNORE_PODS_REGEX.
collect_unstable_pods() {
  local rows
  rows="$(kubectl get pods -A --no-headers 2>/dev/null \
    | awk '$4 ~ /(CrashLoopBackOff|ImagePullBackOff|ErrImagePull|CreateContainerConfigError|RunContainerError|InvalidImageName)/ {print $1 "/" $2 "|" $4}' || true)"
  if [[ -n "${STARTUP_IGNORE_PODS_REGEX}" ]]; then
    rows="$(printf '%s\n' "${rows}" | grep -Ev "${STARTUP_IGNORE_PODS_REGEX}" || true)"
  fi
  printf '%s' "${rows}"
}
# Require a continuous STARTUP_STABILITY_WINDOW_SECONDS during which Flux is
# ready, no pod is unstable, the service checklist passes, and all workloads
# are healthy. Any failure resets the window; die after the hard deadline.
wait_for_startup_stability_window() {
  if [[ "${EXECUTE}" -eq 0 ]]; then
    log "DRY-RUN: skipping startup stability window"
    return 0
  fi
  local hard_deadline stable_since now unstable pods not_ready unhealthy_workloads line
  stable_since="$(date +%s)"
  hard_deadline=$(( stable_since + STARTUP_STABILITY_TIMEOUT_SECONDS ))
  while true; do
    unstable=0
    not_ready="$(list_not_ready_kustomizations || true)"
    if [[ -n "${not_ready}" ]]; then
      unstable=1
      warn "stability-window: Flux kustomizations not ready."
    fi
    pods="$(collect_unstable_pods || true)"
    if [[ -n "${pods}" ]]; then
      unstable=1
      warn "stability-window: unstable pods detected."
      while IFS= read -r line; do
        [[ -n "${line}" ]] || continue
        warn "  ${line}"
      done <<< "${pods}"
    fi
    if ! check_startup_service_checklist_once; then
      unstable=1
      warn "stability-window: external service checklist failed."
    fi
    unhealthy_workloads="$(list_unhealthy_workloads || true)"
    if [[ -n "${unhealthy_workloads}" ]]; then
      unstable=1
      warn "stability-window: workloads not fully ready."
      while IFS= read -r line; do
        [[ -n "${line}" ]] || continue
        warn "  ${line}"
      done <<< "${unhealthy_workloads}"
    fi
    now="$(date +%s)"
    if (( unstable == 0 )); then
      if (( now - stable_since >= STARTUP_STABILITY_WINDOW_SECONDS )); then
        log "startup-stability-window=passed (${STARTUP_STABILITY_WINDOW_SECONDS}s)"
        return 0
      fi
    else
      stable_since="${now}"   # any instability restarts the window
    fi
    if (( now >= hard_deadline )); then
      die "Timed out waiting for startup stability window (${STARTUP_STABILITY_WINDOW_SECONDS}s healthy) within ${STARTUP_STABILITY_TIMEOUT_SECONDS}s."
    fi
    sleep "${STARTUP_STABILITY_POLL_SECONDS}"
  done
}
# Poll the Kubernetes API (5s intervals) until it answers or the configured
# timeout elapses. Returns 1 on timeout; no-op success in dry-run.
wait_for_api() {
  local attempts=$(( API_WAIT_TIMEOUT_SECONDS / 5 ))
  if (( attempts < 1 )); then
    attempts=1
  fi
  if [[ "${EXECUTE}" -eq 0 ]]; then
    log "DRY-RUN: skipping live Kubernetes API wait"
    return 0
  fi
  local i
  for (( i = 1; i <= attempts; i++ )); do
    if kubectl version --request-timeout=5s >/dev/null 2>&1; then
      return 0
    fi
    sleep 5
  done
  return 1
}
# Set .spec.suspend on every Flux Kustomization and HelmRelease.
# $1 must be the JSON literal "true" or "false".
patch_flux_suspend_all() {
  local value="$1"
  local patch k hr ns name
  patch=$(printf '{"spec":{"suspend":%s}}' "${value}")
  local ks_list hr_list
  ks_list="$(kubectl -n flux-system get kustomizations.kustomize.toolkit.fluxcd.io -o jsonpath='{range .items[*]}{.metadata.name}{"\n"}{end}' || true)"
  hr_list="$(kubectl get helmreleases.helm.toolkit.fluxcd.io -A -o jsonpath='{range .items[*]}{.metadata.namespace}{"/"}{.metadata.name}{"\n"}{end}' || true)"
  while IFS= read -r k; do
    [[ -z "${k}" ]] && continue
    run kubectl -n flux-system patch kustomization "${k}" --type=merge -p "${patch}"
  done <<< "${ks_list}"
  while IFS= read -r hr; do
    [[ -z "${hr}" ]] && continue
    ns="${hr%%/*}"
    name="${hr##*/}"
    run kubectl -n "${ns}" patch helmrelease "${name}" --type=merge -p "${patch}"
  done <<< "${hr_list}"
}
2026-04-07 12:30:28 -03:00
shutdown_namespace_excluded() {
  # True when namespace $1 matches the shutdown exclusion regex. The regex
  # variable is intentionally unquoted so it is treated as an ERE.
  local namespace="$1"
  [[ "${namespace}" =~ ${SHUTDOWN_NAMESPACE_EXCLUDES_REGEX} ]]
}
startup_workload_namespace_excluded() {
  # True when namespace $1 matches the startup workload-check exclusion regex
  # (unquoted on purpose: ERE match, not a literal comparison).
  local namespace="$1"
  [[ "${namespace}" =~ ${STARTUP_WORKLOAD_NAMESPACE_EXCLUDES_REGEX} ]]
}
2026-04-06 00:22:54 -03:00
best_effort_scale_down_apps() {
  # Scale every deployment and statefulset in non-excluded namespaces down to
  # zero replicas. Each scale is tolerant of failure ("|| true"): this is
  # best-effort pre-shutdown quiescing, not a hard requirement.
  local namespace
  while IFS= read -r namespace; do
    [[ -n "${namespace}" ]] || continue
    shutdown_namespace_excluded "${namespace}" && continue
    run_shell "kubectl -n ${namespace} scale deployment --all --replicas=0 || true"
    run_shell "kubectl -n ${namespace} scale statefulset --all --replicas=0 || true"
  done <<<"$(kubectl get ns -o jsonpath='{range .items[*]}{.metadata.name}{"\n"}{end}')"
}
2026-04-07 12:30:28 -03:00
save_workload_replica_snapshot() {
  # Persist current non-zero replica counts of all deployments/statefulsets
  # (excluding shutdown-excluded namespaces) as TSV rows
  # "<ns>\t<kind>\t<name>\t<replicas>" so startup can restore them.
  # Fix: dropped the unused local "line" from the original declaration.
  local rows ns kind name replicas
  if [[ "${EXECUTE}" -eq 0 ]]; then
    log "DRY-RUN: save workload replica snapshot to ${REPLICA_SNAPSHOT_FILE}"
    return 0
  fi
  rows="$(
    {
      kubectl get deployment -A -o jsonpath='{range .items[*]}{.metadata.namespace}{"\tdeployment\t"}{.metadata.name}{"\t"}{.spec.replicas}{"\n"}{end}' 2>/dev/null || true
      kubectl get statefulset -A -o jsonpath='{range .items[*]}{.metadata.namespace}{"\tstatefulset\t"}{.metadata.name}{"\t"}{.spec.replicas}{"\n"}{end}' 2>/dev/null || true
    } | sed '/^[[:space:]]*$/d'
  )"
  mkdir -p "$(dirname "${REPLICA_SNAPSHOT_FILE}")"
  : >"${REPLICA_SNAPSHOT_FILE}"
  while IFS=$'\t' read -r ns kind name replicas; do
    [[ -n "${ns}" && -n "${kind}" && -n "${name}" && -n "${replicas}" ]] || continue
    shutdown_namespace_excluded "${ns}" && continue
    # Only remember workloads that were actually running (integer > 0).
    [[ "${replicas}" =~ ^[0-9]+$ ]] || continue
    (( replicas > 0 )) || continue
    printf '%s\t%s\t%s\t%s\n' "${ns}" "${kind}" "${name}" "${replicas}" >>"${REPLICA_SNAPSHOT_FILE}"
  done <<<"${rows}"
  log "replica-snapshot-file=${REPLICA_SNAPSHOT_FILE}"
  log "replica-snapshot-count=$(wc -l <"${REPLICA_SNAPSHOT_FILE}" | tr -d ' ')"
}
restore_workload_replica_snapshot() {
  # Re-apply the replica counts captured at shutdown. Only runs while a
  # recovery is pending and the snapshot file exists; workloads already at
  # their desired scale (or no longer present) are left untouched.
  local ns kind name desired current
  if [[ "${RECOVERY_PENDING}" -ne 1 ]]; then
    log "Skipping replica restore because recovery_pending=0."
    return 0
  fi
  if [[ ! -f "${REPLICA_SNAPSHOT_FILE}" ]]; then
    warn "Replica snapshot file not found at ${REPLICA_SNAPSHOT_FILE}; skipping replica restore."
    return 0
  fi
  while IFS=$'\t' read -r ns kind name desired; do
    [[ -n "${ns}" && -n "${kind}" && -n "${name}" && -n "${desired}" ]] || continue
    [[ "${desired}" =~ ^[0-9]+$ ]] || continue
    (( desired > 0 )) || continue
    current="$(kubectl -n "${ns}" get "${kind}" "${name}" -o jsonpath='{.spec.replicas}' 2>/dev/null || true)"
    # A workload that no longer exists yields an empty result: skip it.
    [[ -n "${current}" ]] || continue
    [[ "${current}" =~ ^[0-9]+$ ]] || current=0
    (( current == desired )) && continue
    run kubectl -n "${ns}" scale "${kind}" "${name}" --replicas="${desired}"
  done <"${REPLICA_SNAPSHOT_FILE}"
  mark_checkpoint startup_replicas_restored
}
list_unhealthy_workloads() {
  # Print one line per deployment/statefulset whose ready (and, for
  # deployments, available) replica count is below spec.replicas, skipping
  # excluded namespaces and ignored workloads. Prints nothing when healthy.
  #
  # Fix: field splitting now uses a single "read" per row instead of five awk
  # processes per row, which is dramatically cheaper on large clusters while
  # producing identical fields (custom-columns output is whitespace-delimited).
  local rows ns name desired ready available
  rows="$(kubectl get deployment -A -o custom-columns=NS:.metadata.namespace,NAME:.metadata.name,DESIRED:.spec.replicas,READY:.status.readyReplicas,AVAILABLE:.status.availableReplicas --no-headers 2>/dev/null || true)"
  while read -r ns name desired ready available; do
    [[ -n "${ns}" ]] || continue
    startup_workload_namespace_excluded "${ns}" && continue
    [[ -n "${STARTUP_IGNORE_WORKLOADS_REGEX}" && "${ns}/${name}" =~ ${STARTUP_IGNORE_WORKLOADS_REGEX} ]] && continue
    # Non-numeric values (e.g. "<none>") count as zero.
    [[ "${desired}" =~ ^[0-9]+$ ]] || desired=0
    [[ "${ready}" =~ ^[0-9]+$ ]] || ready=0
    [[ "${available}" =~ ^[0-9]+$ ]] || available=0
    (( desired > 0 )) || continue
    if (( ready < desired || available < desired )); then
      printf '%s/deployment/%s|ready=%s available=%s desired=%s\n' "${ns}" "${name}" "${ready}" "${available}" "${desired}"
    fi
  done <<<"${rows}"
  rows="$(kubectl get statefulset -A -o custom-columns=NS:.metadata.namespace,NAME:.metadata.name,DESIRED:.spec.replicas,READY:.status.readyReplicas --no-headers 2>/dev/null || true)"
  while read -r ns name desired ready; do
    [[ -n "${ns}" ]] || continue
    startup_workload_namespace_excluded "${ns}" && continue
    [[ -n "${STARTUP_IGNORE_WORKLOADS_REGEX}" && "${ns}/${name}" =~ ${STARTUP_IGNORE_WORKLOADS_REGEX} ]] && continue
    [[ "${desired}" =~ ^[0-9]+$ ]] || desired=0
    [[ "${ready}" =~ ^[0-9]+$ ]] || ready=0
    (( desired > 0 )) || continue
    if (( ready < desired )); then
      printf '%s/statefulset/%s|ready=%s desired=%s\n' "${ns}" "${name}" "${ready}" "${desired}"
    fi
  done <<<"${rows}"
}
wait_for_startup_workloads_ready() {
  # Block until list_unhealthy_workloads reports nothing, warning with the
  # offending workloads on every poll; dies after
  # STARTUP_WORKLOAD_TIMEOUT_SECONDS. Dry-run mode skips the check.
  if [[ "${EXECUTE}" -eq 0 ]]; then
    log "DRY-RUN: skipping startup workload readiness checks"
    return 0
  fi
  # Fix: "line" is now declared local (it previously leaked to global scope).
  local start now unhealthy line
  start="$(date +%s)"
  while true; do
    unhealthy="$(list_unhealthy_workloads || true)"
    if [[ -z "${unhealthy}" ]]; then
      log "startup-workloads=all-ready"
      return 0
    fi
    warn "startup-workloads-not-ready:"
    while IFS= read -r line; do
      [[ -n "${line}" ]] || continue
      warn "  ${line}"
    done <<<"${unhealthy}"
    now="$(date +%s)"
    if (( now - start >= STARTUP_WORKLOAD_TIMEOUT_SECONDS )); then
      die "Timed out waiting for startup workloads Ready after ${STARTUP_WORKLOAD_TIMEOUT_SECONDS}s."
    fi
    sleep "${STARTUP_WORKLOAD_POLL_SECONDS}"
  done
}
2026-04-06 04:47:05 -03:00
discover_workers_csv() {
  # Emit a comma-separated list of Ready worker nodes: nodes carrying neither
  # the control-plane nor the legacy master role label. kubectl prints
  # "<none>" for absent labels, which is what the awk filter keys on.
  kubectl get nodes \
    -o 'custom-columns=NAME:.metadata.name,CP:.metadata.labels.node-role\.kubernetes\.io/control-plane,MASTER:.metadata.labels.node-role\.kubernetes\.io/master,READY:.status.conditions[?(@.type=="Ready")].status' \
    --no-headers \
    | awk '$2=="<none>" && $3=="<none>" && $4=="True" {print $1}' \
    | paste -sd, -
}
2026-04-06 21:27:23 -03:00
node_is_ready() {
  # Success iff node $1 is non-empty and reports a Ready=True condition.
  local node_name="$1"
  [[ -n "${node_name}" ]] || return 1
  local ready_status
  ready_status="$(kubectl get node "${node_name}" -o jsonpath='{range .status.conditions[?(@.type=="Ready")]}{.status}{end}' 2>/dev/null || true)"
  [[ "${ready_status}" == "True" ]]
}
select_ready_arm64_worker() {
  # Pick a Ready arm64 worker node, preferring hardware=rpi5, then rpi4,
  # then any Ready arm64 worker. Prints the node name; fails when none match.
  local node_rows candidate hardware
  node_rows="$(kubectl get nodes -o 'custom-columns=NAME:.metadata.name,ARCH:.metadata.labels.kubernetes\.io/arch,WORKER:.metadata.labels.node-role\.kubernetes\.io/worker,HARDWARE:.metadata.labels.hardware,READY:.status.conditions[?(@.type=="Ready")].status' --no-headers 2>/dev/null || true)"
  [[ -n "${node_rows}" ]] || return 1
  for hardware in rpi5 rpi4; do
    candidate="$(printf '%s\n' "${node_rows}" | awk -v hw="${hardware}" '$2=="arm64" && $3=="true" && $4==hw && $5=="True" {print $1; exit}')"
    if [[ -n "${candidate}" ]]; then
      printf '%s' "${candidate}"
      return 0
    fi
  done
  # Last resort: any Ready arm64 worker regardless of hardware label.
  candidate="$(printf '%s\n' "${node_rows}" | awk '$2=="arm64" && $3=="true" && $5=="True" {print $1; exit}')"
  if [[ -n "${candidate}" ]]; then
    printf '%s' "${candidate}"
    return 0
  fi
  return 1
}
ensure_harbor_target_node() {
  # Keep HARBOR_TARGET_NODE if it is Ready; otherwise replace it with the
  # best available arm64 worker, dying when none exists.
  node_is_ready "${HARBOR_TARGET_NODE}" && return 0
  local replacement
  replacement="$(select_ready_arm64_worker || true)"
  [[ -n "${replacement}" ]] || die "No Ready arm64 worker available for Harbor bootstrap target."
  if [[ -n "${HARBOR_TARGET_NODE}" ]]; then
    warn "Configured harbor target node '${HARBOR_TARGET_NODE}' is not Ready; using '${replacement}' instead."
  else
    log "harbor-target-node auto-selected: ${replacement}"
  fi
  HARBOR_TARGET_NODE="${replacement}"
}
2026-04-06 21:32:43 -03:00
ensure_harbor_host_label() {
  # Make HARBOR_TARGET_NODE the only node labeled "${HARBOR_HOST_LABEL_KEY}=true":
  # strip the label from every other node, then (re)apply it to the target.
  [[ -n "${HARBOR_TARGET_NODE}" ]] || die "Harbor target node is not set."
  local currently_labeled node
  currently_labeled="$(kubectl get nodes -l "${HARBOR_HOST_LABEL_KEY}=true" -o jsonpath='{range .items[*]}{.metadata.name}{"\n"}{end}' 2>/dev/null || true)"
  while IFS= read -r node; do
    [[ -n "${node}" ]] || continue
    [[ "${node}" != "${HARBOR_TARGET_NODE}" ]] || continue
    run kubectl label node "${node}" "${HARBOR_HOST_LABEL_KEY}-"
  done <<<"${currently_labeled}"
  run kubectl label node "${HARBOR_TARGET_NODE}" "${HARBOR_HOST_LABEL_KEY}=true" --overwrite
}
2026-04-06 04:47:05 -03:00
as_array_from_csv() {
  # Split a comma-separated string ($1) into the array variable named by $2.
  # Fix: read's ability to target a named array directly replaces the eval of
  # the original; the helper array _tmp (which leaked as a global) is gone,
  # and the IFS save/restore dance was dead code — a prefix assignment on
  # "read" never persists past the command.
  local csv="$1"
  local out_var="$2"
  IFS=',' read -r -a "${out_var}" <<<"${csv}"
}
2026-04-06 00:22:54 -03:00
best_effort_drain_workers() {
  # Cordon then drain each worker passed as an argument, escalating through
  # three attempts: gentle drain, --force, then --force --disable-eviction.
  # $1 is the per-attempt drain timeout in seconds; remaining args are nodes.
  local drain_timeout="$1"
  shift || true
  local node
  for node in "$@"; do
    [[ -n "${node}" ]] || continue
    run kubectl cordon "${node}"
    run_shell "kubectl drain ${node} --ignore-daemonsets --delete-emptydir-data --grace-period=30 --timeout=${drain_timeout}s" && continue
    warn "Gentle drain timed out for ${node}; retrying with --force."
    run_shell "kubectl drain ${node} --ignore-daemonsets --delete-emptydir-data --grace-period=30 --timeout=${drain_timeout}s --force" && continue
    warn "Force drain timed out for ${node}; final attempt with --disable-eviction."
    run_shell "kubectl drain ${node} --ignore-daemonsets --delete-emptydir-data --grace-period=30 --timeout=${drain_timeout}s --force --disable-eviction || true"
  done
}
2026-04-06 04:47:05 -03:00
wait_for_rollout() {
  # Wait for a rollout to complete: $1=namespace $2=kind $3=name $4=timeout.
  # In dry-run mode the kubectl invocation is only logged.
  local ns="$1" kind="$2" name="$3" timeout="$4"
  if [[ "${EXECUTE}" -eq 0 ]]; then
    log "DRY-RUN: kubectl -n ${ns} rollout status ${kind}/${name} --timeout=${timeout}"
    return 0
  fi
  kubectl -n "${ns}" rollout status "${kind}/${name}" --timeout="${timeout}"
}
2026-04-06 04:47:05 -03:00
check_ingress_stack( ) {
kubectl get ingressclass traefik >/dev/null
wait_for_rollout traefik deployment traefik 5m
}
check_longhorn_stack( ) {
wait_for_rollout longhorn-system daemonset longhorn-manager 10m
wait_for_rollout longhorn-system deployment longhorn-ui 10m
}
check_vault_stack( ) {
wait_for_rollout vault statefulset vault 10m
if [ [ " ${ EXECUTE } " -eq 1 ] ] ; then
kubectl -n vault exec vault-0 -- sh -ceu 'VAULT_ADDR=http://127.0.0.1:8200 vault status >/dev/null'
fi
}
check_postgres_stack( ) {
wait_for_rollout postgres statefulset postgres 10m
if [ [ " ${ EXECUTE } " -eq 1 ] ] ; then
kubectl -n postgres exec postgres-0 -c postgres -- sh -ceu 'pg_isready -h 127.0.0.1 -p 5432 >/dev/null'
fi
}
check_gitea_stack( ) {
wait_for_rollout gitea deployment gitea 10m
}
check_harbor_stack( ) {
wait_for_rollout harbor statefulset harbor-redis 10m
wait_for_rollout harbor deployment harbor-core 10m
wait_for_rollout harbor deployment harbor-jobservice 10m
wait_for_rollout harbor deployment harbor-portal 10m
wait_for_rollout harbor deployment harbor-registry 10m
}
check_harbor_endpoint() {
  # Probe the Harbor registry API; 200 (anonymous ok) and 401 (auth required)
  # both mean the endpoint is serving. Anything else is fatal.
  if [[ "${EXECUTE}" -eq 0 ]]; then
    log "DRY-RUN: curl -ksS -o /dev/null -w '%{http_code}' https://registry.bstein.dev/v2/"
    return 0
  fi
  local http_code
  http_code="$(curl -ksS -o /dev/null -w '%{http_code}' https://registry.bstein.dev/v2/ || true)"
  if [[ "${http_code}" == "200" || "${http_code}" == "401" ]]; then
    log "harbor-endpoint=http-${http_code}"
  else
    die "Harbor endpoint check failed with HTTP ${http_code:-unknown}"
  fi
}
wait_for_pod_phase() {
  # Poll pod $2 in namespace $1 every 2s until its phase equals $3, failing
  # early on phase "Failed" or once $4 seconds have elapsed.
  local ns="$1" pod="$2" want_phase="$3" budget="$4"
  local began phase
  began="$(date +%s)"
  while :; do
    phase="$(kubectl -n "${ns}" get pod "${pod}" -o jsonpath='{.status.phase}' 2>/dev/null || true)"
    [[ "${phase}" == "${want_phase}" ]] && return 0
    [[ "${phase}" == "Failed" ]] && return 1
    (( "$(date +%s)" - began >= budget )) && return 1
    sleep 2
  done
}
2026-04-06 04:47:05 -03:00
harbor_is_ready() {
  # Harbor counts as ready when its four core deployments exist and the
  # registry API answers 200 or 401.
  if ! kubectl -n harbor get deploy harbor-core harbor-jobservice harbor-portal harbor-registry >/dev/null 2>&1; then
    return 1
  fi
  local http_code
  http_code="$(curl -ksS -o /dev/null -w '%{http_code}' https://registry.bstein.dev/v2/ || true)"
  [[ "${http_code}" == "200" || "${http_code}" == "401" ]]
}
run_harbor_pull_canary( ) {
2026-04-06 21:27:23 -03:00
local pod = "ananke-harbor-canary"
local canary_node = " ${ HARBOR_CANARY_NODE } "
if ! node_is_ready " ${ canary_node } " ; then
ensure_harbor_target_node
canary_node = " ${ HARBOR_TARGET_NODE } "
if [ [ -n " ${ HARBOR_CANARY_NODE } " ] ] ; then
warn " Configured harbor canary node ' ${ HARBOR_CANARY_NODE } ' is not Ready; using ' ${ canary_node } '. "
fi
HARBOR_CANARY_NODE = " ${ canary_node } "
fi
2026-04-06 04:47:05 -03:00
if [ [ " ${ EXECUTE } " -eq 0 ] ] ; then
2026-04-06 21:27:23 -03:00
log " DRY-RUN: create Harbor pull canary pod with ${ HARBOR_CANARY_IMAGE } on ${ canary_node } "
2026-04-06 04:47:05 -03:00
return 0
fi
timeout 20 kubectl -n " ${ NODE_HELPER_NAMESPACE } " delete pod " ${ pod } " --ignore-not-found --wait= false >/dev/null 2>& 1 || true
cat <<CANARY | kubectl apply -f -
apiVersion: v1
kind: Pod
metadata:
name: ${ pod }
namespace: ${ NODE_HELPER_NAMESPACE }
spec:
2026-04-06 21:27:23 -03:00
nodeName: ${ canary_node }
2026-04-06 04:47:05 -03:00
restartPolicy: Never
imagePullSecrets:
- name: ${ REGISTRY_PULL_SECRET }
tolerations:
- operator: Exists
containers:
- name: canary
image: ${ HARBOR_CANARY_IMAGE }
imagePullPolicy: Always
command: [ "sh" , "-ceu" , "echo harbor-canary-ok" ]
CANARY
if ! wait_for_pod_phase " ${ NODE_HELPER_NAMESPACE } " " ${ pod } " Succeeded 180; then
kubectl -n " ${ NODE_HELPER_NAMESPACE } " describe pod " ${ pod } " >& 2 || true
timeout 20 kubectl -n " ${ NODE_HELPER_NAMESPACE } " logs " ${ pod } " >& 2 || true
timeout 20 kubectl -n " ${ NODE_HELPER_NAMESPACE } " delete pod " ${ pod } " --ignore-not-found --wait= false >/dev/null 2>& 1 || true
return 1
fi
timeout 20 kubectl -n " ${ NODE_HELPER_NAMESPACE } " logs " ${ pod } " || true
timeout 20 kubectl -n " ${ NODE_HELPER_NAMESPACE } " delete pod " ${ pod } " --ignore-not-found --wait= false >/dev/null 2>& 1 || true
}
run_helper_pod( ) {
local node = " $1 "
local purpose = " $2 "
local timeout_seconds = " $3 "
local script_content = " $4 "
2026-04-06 21:27:23 -03:00
local pod = " ananke- $( sanitize_name " ${ purpose } " ) - $( date +%H%M%S) "
2026-04-06 04:47:05 -03:00
local encoded_script
encoded_script = " $( printf '%s' " ${ script_content } " | base64 -w0) "
if [ [ " ${ EXECUTE } " -eq 0 ] ] ; then
log " DRY-RUN: helper pod ${ pod } on ${ node } for ${ purpose } "
return 0
fi
cat <<POD | kubectl apply -f -
apiVersion: v1
kind: Pod
metadata:
name: ${ pod }
namespace: ${ NODE_HELPER_NAMESPACE }
spec:
nodeName: ${ node }
restartPolicy: Never
serviceAccountName: ${ NODE_HELPER_SERVICE_ACCOUNT }
imagePullSecrets:
- name: ${ REGISTRY_PULL_SECRET }
hostNetwork: true
hostPID: true
tolerations:
- operator: Exists
containers:
- name: helper
image: ${ NODE_HELPER_IMAGE }
imagePullPolicy: IfNotPresent
securityContext:
privileged: true
command: [ "/bin/bash" , "-ceu" ]
args:
- |
2026-04-06 21:27:23 -03:00
printf '%s' '${encoded_script}' | base64 -d >/tmp/ananke-step.sh
chmod +x /tmp/ananke-step.sh
/tmp/ananke-step.sh
2026-04-06 04:47:05 -03:00
POD
if ! wait_for_pod_phase " ${ NODE_HELPER_NAMESPACE } " " ${ pod } " Succeeded " ${ timeout_seconds } " ; then
kubectl -n " ${ NODE_HELPER_NAMESPACE } " describe pod " ${ pod } " >& 2 || true
timeout 20 kubectl -n " ${ NODE_HELPER_NAMESPACE } " logs " ${ pod } " >& 2 || true
timeout 20 kubectl -n " ${ NODE_HELPER_NAMESPACE } " delete pod " ${ pod } " --ignore-not-found --wait= false >/dev/null 2>& 1 || true
return 1
fi
timeout 20 kubectl -n " ${ NODE_HELPER_NAMESPACE } " logs " ${ pod } " || true
timeout 20 kubectl -n " ${ NODE_HELPER_NAMESPACE } " delete pod " ${ pod } " --ignore-not-found --wait= false >/dev/null 2>& 1 || true
}
run_host_command_via_helper( ) {
local node = " $1 "
local purpose = " $2 "
local timeout_seconds = " $3 "
local host_command = " $4 "
local encoded_command
encoded_command = " $( printf '%s' " ${ host_command } " | base64 -w0) "
local script_content
script_content = $( cat <<SCRIPT
set -euo pipefail
HOST_COMMAND = " \$(printf '%s' ' ${ encoded_command } ' | base64 -d) "
nsenter --target 1 --mount --uts --ipc --net --pid /bin/sh -ceu "\${HOST_COMMAND}"
SCRIPT
)
run_helper_pod " ${ node } " " ${ purpose } " " ${ timeout_seconds } " " ${ script_content } "
}
2026-04-06 21:27:23 -03:00
run_host_command_via_prewarm_pod( ) {
local node = " $1 "
local host_command = " $2 "
local pod encoded_command
pod = " $( kubectl -n " ${ NODE_HELPER_NAMESPACE } " get pods -l app = " ${ NODE_HELPER_PREWARM_DS } " --field-selector " spec.nodeName= ${ node } " -o jsonpath = '{.items[0].metadata.name}' 2>/dev/null || true ) "
if [ [ -z " ${ pod } " ] ] ; then
return 1
fi
encoded_command = " $( printf '%s' " ${ host_command } " | base64 -w0) "
if [ [ " ${ EXECUTE } " -eq 0 ] ] ; then
log " DRY-RUN: helper exec via ${ pod } on ${ node } "
return 0
fi
run kubectl -n " ${ NODE_HELPER_NAMESPACE } " exec " ${ pod } " -- /bin/bash -ceu " HOST_COMMAND=\$(printf '%s' ' ${ encoded_command } ' | base64 -d); nsenter --target 1 --mount --uts --ipc --net --pid /bin/sh -ceu \"\${HOST_COMMAND}\" "
}
2026-04-06 04:47:05 -03:00
schedule_host_shutdown_via_helper() {
  # Arm a delayed "stop service, then poweroff" on the node via a transient
  # systemd timer unit, preferring the prewarm-pod exec path and falling back
  # to a one-shot helper pod.
  local node="$1" service_name="$2" delay_seconds="$3"
  local host_command
  host_command="/usr/bin/systemd-run --unit ananke-shutdown-${service_name} --on-active=${delay_seconds}s /bin/sh -lc '/usr/bin/systemctl stop ${service_name} || true; /usr/bin/systemctl poweroff || true'"
  run_host_command_via_prewarm_pod "${node}" "${host_command}" && return 0
  run_host_command_via_helper "${node}" "shutdown-${node}-${service_name}" 120 "${host_command}"
}
2026-04-07 12:30:28 -03:00
schedule_host_service_stop_via_helper() {
  # Arm a delayed "systemctl stop <service>" on the node via a transient
  # systemd timer unit; prewarm-pod exec first, helper pod as fallback.
  local node="$1" service_name="$2" delay_seconds="$3"
  local host_command
  host_command="/usr/bin/systemd-run --unit ananke-stop-${service_name} --on-active=${delay_seconds}s /bin/sh -lc '/usr/bin/systemctl stop ${service_name} || true'"
  run_host_command_via_prewarm_pod "${node}" "${host_command}" && return 0
  run_host_command_via_helper "${node}" "stop-${node}-${service_name}" 120 "${host_command}"
}
2026-04-06 04:47:05 -03:00
prewarm_node_helper_image( ) {
2026-04-06 21:27:23 -03:00
local name = " ${ NODE_HELPER_PREWARM_DS } "
2026-04-07 12:30:28 -03:00
local ready_nodes node
local node_affinity_block = ""
2026-04-06 04:47:05 -03:00
if [ [ " ${ EXECUTE } " -eq 0 ] ] ; then
log " DRY-RUN: prewarm ${ NODE_HELPER_IMAGE } via temporary DaemonSet "
return 0
fi
2026-04-07 12:30:28 -03:00
ready_nodes = " $( kubectl get nodes -o 'custom-columns=NAME:.metadata.name,READY:.status.conditions[?(@.type=="Ready")].status' --no-headers 2>/dev/null | awk '$2=="True" {print $1}' || true ) "
if [ [ -n " ${ ready_nodes } " ] ] ; then
node_affinity_block = $' affinity:\n nodeAffinity:\n requiredDuringSchedulingIgnoredDuringExecution:\n nodeSelectorTerms:\n - matchExpressions:\n - key: kubernetes.io/hostname\n operator: In\n values:'
while IFS = read -r node; do
[ [ -z " ${ node } " ] ] && continue
node_affinity_block += $'\n' " - ${ node } "
done <<< " ${ ready_nodes } "
log " node-helper-prewarm-targets= $( printf '%s' " ${ ready_nodes } " | paste -sd, -) "
else
warn "Unable to detect Ready nodes for prewarm targeting; continuing without node affinity."
fi
2026-04-06 04:47:05 -03:00
cat <<DS | kubectl apply -f -
apiVersion: apps/v1
kind: DaemonSet
metadata:
name: ${ name }
namespace: ${ NODE_HELPER_NAMESPACE }
spec:
selector:
matchLabels:
app: ${ name }
template:
metadata:
labels:
app: ${ name }
spec:
imagePullSecrets:
- name: ${ REGISTRY_PULL_SECRET }
2026-04-07 12:30:28 -03:00
${ node_affinity_block }
2026-04-06 04:47:05 -03:00
tolerations:
- operator: Exists
containers:
- name: helper
image: ${ NODE_HELPER_IMAGE }
imagePullPolicy: Always
command: [ "/bin/sh" , "-ceu" , "sleep 300" ]
DS
local i desired ready
for i in $( seq 1 90) ; do
desired = " $( kubectl -n " ${ NODE_HELPER_NAMESPACE } " get ds " ${ name } " -o jsonpath = '{.status.desiredNumberScheduled}' 2>/dev/null || echo 0) "
ready = " $( kubectl -n " ${ NODE_HELPER_NAMESPACE } " get ds " ${ name } " -o jsonpath = '{.status.numberReady}' 2>/dev/null || echo 0) "
[ [ -n " ${ desired } " ] ] || desired = 0
[ [ -n " ${ ready } " ] ] || ready = 0
if [ [ " ${ desired } " != "0" && " ${ desired } " = = " ${ ready } " ] ] ; then
log " node-helper-prewarm= ${ ready } / ${ desired } "
2026-04-06 21:27:23 -03:00
if [ [ " ${ KEEP_PREWARM_DAEMONSET } " -eq 0 ] ] ; then
kubectl -n " ${ NODE_HELPER_NAMESPACE } " delete ds " ${ name } " --ignore-not-found >/dev/null 2>& 1 || true
else
log " Keeping ${ name } DaemonSet running for shutdown helper exec path. "
fi
2026-04-06 04:47:05 -03:00
return 0
fi
sleep 2
2026-04-06 00:22:54 -03:00
done
2026-04-06 04:47:05 -03:00
kubectl -n " ${ NODE_HELPER_NAMESPACE } " describe ds " ${ name } " >& 2 || true
kubectl -n " ${ NODE_HELPER_NAMESPACE } " get pods -l app = " ${ name } " >& 2 || true
kubectl -n " ${ NODE_HELPER_NAMESPACE } " delete ds " ${ name } " --ignore-not-found >/dev/null 2>& 1 || true
die " Timed out prewarming node helper image ${ NODE_HELPER_IMAGE } "
2026-04-06 00:22:54 -03:00
}
2026-04-06 21:27:23 -03:00
cleanup_prewarm_daemonset() {
  # Best-effort removal of the helper-image prewarm DaemonSet.
  if [[ "${EXECUTE}" -eq 0 ]]; then
    log "DRY-RUN: cleanup ${NODE_HELPER_PREWARM_DS} DaemonSet"
    return 0
  fi
  kubectl -n "${NODE_HELPER_NAMESPACE}" delete ds "${NODE_HELPER_PREWARM_DS}" --ignore-not-found >/dev/null 2>&1 || true
}
2026-04-06 04:47:05 -03:00
start_bundle_server( ) {
[ [ -f " ${ HARBOR_BUNDLE_FILE } " ] ] || die " Harbor bundle not found at ${ HARBOR_BUNDLE_FILE } "
require_cmd python3
local bundle_dir bundle_name
bundle_dir = " $( dirname " ${ HARBOR_BUNDLE_FILE } " ) "
bundle_name = " $( basename " ${ HARBOR_BUNDLE_FILE } " ) "
if [ [ " ${ EXECUTE } " -eq 0 ] ] ; then
log " DRY-RUN: serve ${ bundle_name } from ${ bundle_dir } on port ${ BUNDLE_HTTP_PORT } "
return 0
fi
2026-04-06 21:27:23 -03:00
python3 -m http.server " ${ BUNDLE_HTTP_PORT } " --bind 0.0.0.0 --directory " ${ bundle_dir } " </dev/null >/tmp/ananke-bundle-server.log 2>& 1 &
2026-04-06 04:47:05 -03:00
BUNDLE_SERVER_PID = $!
for _ in $( seq 1 20) ; do
if curl -fsS " http://127.0.0.1: ${ BUNDLE_HTTP_PORT } / ${ bundle_name } " >/dev/null 2>& 1; then
return 0
fi
sleep 1
2026-04-06 00:22:54 -03:00
done
2026-04-06 21:27:23 -03:00
die "Temporary bundle server did not become ready; see /tmp/ananke-bundle-server.log"
2026-04-06 00:22:54 -03:00
}
2026-04-06 04:47:05 -03:00
stop_bundle_server() {
  # Tear down the temporary bundle HTTP server, if one was started: SIGTERM,
  # wait up to 10s, then escalate to SIGKILL.
  # Fix: the original never escalated, so a hung server could outlive the
  # script after the 10s grace loop expired.
  if [[ -n "${BUNDLE_SERVER_PID}" ]]; then
    kill "${BUNDLE_SERVER_PID}" >/dev/null 2>&1 || true
    for _ in $(seq 1 10); do
      kill -0 "${BUNDLE_SERVER_PID}" >/dev/null 2>&1 || break
      sleep 1
    done
    if kill -0 "${BUNDLE_SERVER_PID}" >/dev/null 2>&1; then
      kill -9 "${BUNDLE_SERVER_PID}" >/dev/null 2>&1 || true
    fi
    BUNDLE_SERVER_PID=""
  fi
}
# Ensure the temporary bundle HTTP server is torn down on every exit path.
trap stop_bundle_server EXIT
control_host_ip( ) {
hostname -I | awk '{print $1}'
}
seed_harbor_images( ) {
local images_text control_ip bundle_name script_content seed_rc = 0
[ [ -f " ${ HARBOR_BUNDLE_FILE } " ] ] || die " Harbor bundle not found at ${ HARBOR_BUNDLE_FILE } "
2026-04-06 21:27:23 -03:00
ensure_harbor_target_node
2026-04-06 21:32:43 -03:00
ensure_harbor_host_label
2026-04-06 04:47:05 -03:00
images_text = " $( sed '/^[[:space:]]*#/d;/^[[:space:]]*$/d' " ${ BOOTSTRAP_DIR } /harbor-bootstrap-images.txt " ) "
[ [ -n " ${ images_text } " ] ] || die " No Harbor images listed in ${ BOOTSTRAP_DIR } /harbor-bootstrap-images.txt "
bundle_name = " $( basename " ${ HARBOR_BUNDLE_FILE } " ) "
start_bundle_server
control_ip = " $( control_host_ip) "
script_content = $( cat <<SCRIPT
set -euo pipefail
curl -fsSL " http:// ${ control_ip } : ${ BUNDLE_HTTP_PORT } / ${ bundle_name } " \
| zstd -dc \
| nsenter --target 1 --mount --uts --ipc --net --pid /usr/local/bin/k3s ctr images import -
while IFS = read -r image; do
[ [ -z "\${image}" ] ] && continue
nsenter --target 1 --mount --uts --ipc --net --pid /usr/local/bin/k3s ctr images ls | awk '{print \$1}' | grep -Fx "\${image}" >/dev/null
done <<'IMAGES'
${ images_text }
IMAGES
SCRIPT
)
run_helper_pod " ${ HARBOR_TARGET_NODE } " "harbor-seed" 900 " ${ script_content } " || seed_rc = $?
stop_bundle_server
[ [ " ${ seed_rc } " -eq 0 ] ] || return " ${ seed_rc } "
mark_checkpoint startup_harbor_seeded
2026-04-06 00:22:54 -03:00
}
bootstrap_local_minimal( ) {
2026-04-06 04:47:05 -03:00
apply_kustomization infrastructure/core
apply_kustomization infrastructure/sources/helm
apply_kustomization infrastructure/longhorn/core
apply_kustomization infrastructure/metallb
apply_kustomization infrastructure/traefik
apply_kustomization infrastructure/vault-csi
apply_kustomization infrastructure/vault-injector
apply_kustomization services/vault
apply_kustomization infrastructure/postgres
apply_kustomization services/gitea
2026-04-06 00:22:54 -03:00
}
bootstrap_local_harbor( ) {
2026-04-06 04:47:05 -03:00
apply_kustomization services/harbor
2026-04-06 00:22:54 -03:00
}
2026-04-07 12:30:28 -03:00
reconcile_kustomization_with_self_heal() {
  # Reconcile one Flux Kustomization (with source refresh). If the first
  # attempt fails with an immutable-field/Job error, delete the offending
  # Jobs, nudge Flux, and retry exactly once.
  local item="$1"
  if [[ "${EXECUTE}" -eq 0 ]]; then
    run flux reconcile kustomization "${item}" -n flux-system --with-source --timeout=15m
    return 0
  fi
  local attempt output rc
  for attempt in 1 2; do
    # Capture flux's combined output while tolerating failure under set -e.
    set +e
    output="$(flux reconcile kustomization "${item}" -n flux-system --with-source --timeout=15m 2>&1)"
    rc=$?
    set -e
    if (( rc == 0 )); then
      [[ -n "${output}" ]] && printf '%s\n' "${output}"
      return 0
    fi
    [[ -n "${output}" ]] && printf '%s\n' "${output}" >&2
    if (( attempt == 1 )) && grep -Eqi 'immutable|field is immutable|cannot patch.*Job|Job.*invalid' <<<"${output}"; then
      # Fix: message previously read "failed due immutable" (missing "to").
      warn "Flux reconcile for '${item}' failed due to immutable Job/template signal. Attempting self-heal."
      heal_failed_flux_jobs || true
      trigger_flux_reconcile_all || true
      sleep 5
      continue
    fi
    return "${rc}"
  done
}
2026-04-06 04:47:05 -03:00
reconcile_stage() {
  # Reconcile the named Kustomizations ($2..) for stage $1, then record a
  # checkpoint. Without the flux CLI, fall back to annotating every
  # Kustomization with a reconcile request timestamp and return immediately.
  local stage_name="$1"
  shift
  if ! command -v flux >/dev/null 2>&1; then
    local request_ts
    request_ts="$(date --iso-8601=seconds)"
    run kubectl -n flux-system annotate kustomizations.kustomize.toolkit.fluxcd.io --all reconcile.fluxcd.io/requestedAt="${request_ts}" --overwrite
    return 0
  fi
  local item
  for item in "$@"; do
    reconcile_kustomization_with_self_heal "${item}"
  done
  mark_checkpoint "reconciled_${stage_name}"
}
2026-04-06 04:47:05 -03:00
resume_flux_and_reconcile( ) {
patch_flux_suspend_all false
if command -v flux >/dev/null 2>& 1; then
run flux reconcile source git flux-system -n flux-system --timeout= 3m
2026-04-06 00:22:54 -03:00
fi
2026-04-06 04:47:05 -03:00
reconcile_stage core core helm longhorn metallb traefik vault-csi vault-injector
check_ingress_stack
check_longhorn_stack
reconcile_stage stateful vault postgres gitea
check_vault_stack
check_postgres_stack
check_gitea_stack
reconcile_stage registry harbor
check_harbor_stack
check_harbor_endpoint
run_harbor_pull_canary
}
2026-04-06 00:22:54 -03:00
2026-04-06 04:47:05 -03:00
status_report( ) {
2026-04-07 12:30:28 -03:00
local battery flux_ready flux_url flux_branch flux_url_drift flux_branch_drift harbor_code workers ingress_hosts_count
2026-04-06 21:27:23 -03:00
local effective_target effective_canary
2026-04-06 21:32:43 -03:00
local labeled_nodes
2026-04-06 04:47:05 -03:00
battery = " $( read_ups_battery || true ) "
flux_ready = " $( kubectl -n flux-system get gitrepository flux-system -o jsonpath = '{.status.conditions[?(@.type=="Ready")].status}' 2>/dev/null || true ) "
2026-04-07 12:30:28 -03:00
flux_url = " $( kubectl -n flux-system get gitrepository flux-system -o jsonpath = '{.spec.url}' 2>/dev/null || true ) "
flux_branch = " $( kubectl -n flux-system get gitrepository flux-system -o jsonpath = '{.spec.ref.branch}' 2>/dev/null || true ) "
flux_url_drift = false
flux_branch_drift = false
if [ [ -n " ${ EXPECTED_FLUX_URL } " && -n " ${ flux_url } " && " ${ flux_url } " != " ${ EXPECTED_FLUX_URL } " ] ] ; then
flux_url_drift = true
fi
if [ [ -n " ${ EXPECTED_FLUX_BRANCH } " && -n " ${ flux_branch } " && " ${ flux_branch } " != " ${ EXPECTED_FLUX_BRANCH } " ] ] ; then
flux_branch_drift = true
fi
ingress_hosts_count = " $( list_ingress_hosts | sed '/^[[:space:]]*$/d' | wc -l | tr -d ' ' ) "
2026-04-06 04:47:05 -03:00
harbor_code = " $( curl -ksS -o /dev/null -w '%{http_code}' https://registry.bstein.dev/v2/ || true ) "
workers = " $( discover_workers_csv 2>/dev/null || true ) "
2026-04-06 21:27:23 -03:00
effective_target = " ${ HARBOR_TARGET_NODE } "
if ! node_is_ready " ${ effective_target } " ; then
effective_target = " $( select_ready_arm64_worker || true ) "
fi
effective_canary = " ${ HARBOR_CANARY_NODE } "
if ! node_is_ready " ${ effective_canary } " ; then
effective_canary = " ${ effective_target } "
fi
2026-04-06 04:47:05 -03:00
echo "mode=status"
2026-04-07 12:30:28 -03:00
echo " shutdown_mode= ${ SHUTDOWN_MODE } "
2026-04-06 04:47:05 -03:00
echo " bundle_file= ${ HARBOR_BUNDLE_FILE } "
echo " bundle_present= $( [ [ -f " ${ HARBOR_BUNDLE_FILE } " ] ] && echo true || echo false ) "
2026-04-07 12:30:28 -03:00
echo " replica_snapshot_file= ${ REPLICA_SNAPSHOT_FILE } "
echo " replica_snapshot_present= $( [ [ -f " ${ REPLICA_SNAPSHOT_FILE } " ] ] && echo true || echo false ) "
2026-04-06 04:47:05 -03:00
echo " node_helper_image= ${ NODE_HELPER_IMAGE } "
2026-04-06 21:27:23 -03:00
echo " harbor_target_node= ${ effective_target :- unknown } "
echo " harbor_canary_node= ${ effective_canary :- unknown } "
2026-04-06 21:32:43 -03:00
labeled_nodes = " $( kubectl get nodes -l " ${ HARBOR_HOST_LABEL_KEY } =true " -o jsonpath = '{range .items[*]}{.metadata.name}{","}{end}' 2>/dev/null || true ) "
labeled_nodes = " ${ labeled_nodes %, } "
echo " harbor_host_label_key= ${ HARBOR_HOST_LABEL_KEY } "
echo " harbor_host_label_nodes= ${ labeled_nodes :- none } "
2026-04-06 04:47:05 -03:00
echo " workers= ${ workers } "
echo " recovery_pending= ${ RECOVERY_PENDING } "
echo " startup_attempted= ${ STARTUP_ATTEMPTED_DURING_OUTAGE } "
echo " last_checkpoint= ${ LAST_CHECKPOINT } "
echo " ups_host= ${ UPS_HOST_IN_USE :- ${ UPS_HOST } } "
echo " ups_battery= ${ battery :- unknown } "
2026-04-07 12:30:28 -03:00
echo " flux_source_expected_url= ${ EXPECTED_FLUX_URL } "
echo " flux_source_expected_branch= ${ EXPECTED_FLUX_BRANCH } "
echo " flux_source_actual_url= ${ flux_url :- unknown } "
echo " flux_source_actual_branch= ${ flux_branch :- unknown } "
echo " flux_source_url_drift= ${ flux_url_drift } "
echo " flux_source_branch_drift= ${ flux_branch_drift } "
2026-04-06 04:47:05 -03:00
echo " flux_source_ready= ${ flux_ready :- unknown } "
2026-04-07 12:30:28 -03:00
echo " ingress_hosts_count= ${ ingress_hosts_count } "
2026-04-06 04:47:05 -03:00
echo " harbor_http= ${ harbor_code :- unknown } "
kubectl get ingressclass traefik >/dev/null 2>& 1 && echo "traefik_ingressclass=true" || echo "traefik_ingressclass=false"
kubectl -n traefik get deploy traefik >/dev/null 2>& 1 && echo "traefik_deploy=true" || echo "traefik_deploy=false"
kubectl -n longhorn-system get ds longhorn-manager >/dev/null 2>& 1 && echo "longhorn_manager=true" || echo "longhorn_manager=false"
kubectl -n vault get sts vault >/dev/null 2>& 1 && echo "vault_statefulset=true" || echo "vault_statefulset=false"
kubectl -n postgres get sts postgres >/dev/null 2>& 1 && echo "postgres_statefulset=true" || echo "postgres_statefulset=false"
kubectl -n gitea get deploy gitea >/dev/null 2>& 1 && echo "gitea_deploy=true" || echo "gitea_deploy=false"
kubectl -n harbor get deploy harbor-core >/dev/null 2>& 1 && echo "harbor_deploy=true" || echo "harbor_deploy=false"
}
planned_shutdown() {
  # Orchestrate a graceful cluster shutdown: prewarm the helper image, snapshot
  # etcd, record workload replica counts, suspend Flux, scale apps down, drain
  # workers, then schedule k3s service stop (and, unless cluster-only mode is
  # selected, host poweroff) on every node via the in-cluster helper.
  local workers_csv
  workers_csv="$(discover_workers_csv 2>/dev/null || true)"
  as_array_from_csv "${workers_csv}" WORKER_NODES
  # Control-plane membership is overridable via CONTROL_PLANE_NODES_CSV;
  # the historical hard-coded list remains the default for compatibility.
  as_array_from_csv "${CONTROL_PLANE_NODES_CSV:-titan-0a,titan-0b,titan-0c}" CONTROL_PLANE_NODES

  RECOVERY_PENDING=1
  STARTUP_ATTEMPTED_DURING_OUTAGE=0
  save_recovery_state 1 0 shutdown_started

  if [[ "${SKIP_HELPER_PREWARM}" -eq 0 ]]; then
    # Keep the prewarm DaemonSet alive until shutdown scheduling is done so the
    # helper image stays cached on every node we still need to reach.
    KEEP_PREWARM_DAEMONSET=1
    prewarm_node_helper_image
    mark_checkpoint shutdown_helper_prewarmed
  fi

  if [[ "${SKIP_ETCD_SNAPSHOT}" -eq 0 ]]; then
    local ts
    ts="$(date +%Y%m%d-%H%M%S)"
    # Snapshot is taken on the first control-plane node only.
    run_host_command_via_helper "${CONTROL_PLANE_NODES[0]}" "etcd-snapshot" 300 "/usr/local/bin/k3s etcd-snapshot save --name pre-shutdown-${ts}"
    mark_checkpoint shutdown_snapshot_complete
  else
    warn "Skipping etcd snapshot by request."
  fi

  # Record desired replica counts so the startup flow can restore them after
  # Flux resumes.
  save_workload_replica_snapshot
  mark_checkpoint shutdown_replicas_snapshot

  patch_flux_suspend_all true
  best_effort_scale_down_apps
  mark_checkpoint shutdown_apps_scaled_down

  if [[ "${SKIP_DRAIN}" -eq 0 ]]; then
    best_effort_drain_workers "${DRAIN_TIMEOUT_SECONDS}" "${WORKER_NODES[@]}"
    mark_checkpoint shutdown_workers_drained
  else
    warn "Skipping worker drain by request."
  fi

  local node
  if [[ "${SHUTDOWN_MODE}" == "cluster-only" ]]; then
    warn "shutdown-mode=cluster-only: stopping k3s services only; host poweroff is disabled."
  else
    log "shutdown-mode=host-poweroff: scheduling host poweroff after service stop."
  fi

  # Workers first (20s delay), control planes after (45s delay) so the API
  # stays reachable long enough to push the remaining schedules.
  for node in "${WORKER_NODES[@]}"; do
    [[ -z "${node}" ]] && continue
    if [[ "${SHUTDOWN_MODE}" == "cluster-only" ]]; then
      schedule_host_service_stop_via_helper "${node}" k3s-agent 20
    else
      schedule_host_shutdown_via_helper "${node}" k3s-agent 20
    fi
  done
  mark_checkpoint shutdown_workers_scheduled

  for node in "${CONTROL_PLANE_NODES[@]}"; do
    [[ -z "${node}" ]] && continue
    if [[ "${SHUTDOWN_MODE}" == "cluster-only" ]]; then
      schedule_host_service_stop_via_helper "${node}" k3s 45
    else
      schedule_host_shutdown_via_helper "${node}" k3s 45
    fi
  done

  if [[ "${SKIP_HELPER_PREWARM}" -eq 0 ]]; then
    cleanup_prewarm_daemonset
  fi

  mark_checkpoint shutdown_control_planes_scheduled

  if [[ "${SHUTDOWN_MODE}" == "cluster-only" ]]; then
    log "Cluster-only shutdown actions scheduled (hosts remain powered on)."
  else
    log "Shutdown + host poweroff actions scheduled on hosts."
  fi
}
2026-04-06 00:22:54 -03:00
2026-04-06 04:47:05 -03:00
emergency_shutdown_after_outage() {
  # Best-effort shutdown path taken when a repeated outage leaves too little
  # startup budget to finish recovery.  Every preparatory step is tolerated to
  # fail (|| true) so we always reach planned_shutdown while battery remains.
  warn "Entering outage-aware emergency shutdown path due to insufficient startup budget."
  patch_flux_suspend_all true || true
  best_effort_scale_down_apps || true
  local workers_csv
  workers_csv="$(discover_workers_csv 2>/dev/null || true)"
  as_array_from_csv "${workers_csv}" WORKER_NODES
  # Use the (shorter) emergency drain timeout rather than the normal one.
  best_effort_drain_workers "${EMERGENCY_DRAIN_TIMEOUT_SECONDS}" "${WORKER_NODES[@]}" || true
  planned_shutdown
}
startup_flow() {
  # Bring the cluster back after an outage: gate on battery budget, wait for
  # the API, re-establish the Harbor host label, verify/repair the Flux
  # source, fall back to local bootstrap manifests when Flux is not Ready,
  # then resume Flux and wait for workloads to stabilize.
  if [[ "${RECOVERY_PENDING}" -eq 1 ]]; then
    if ! ensure_minimum_battery_for_bootstrap; then
      if [[ "${STARTUP_ATTEMPTED_DURING_OUTAGE}" -eq 1 ]]; then
        # Second failed attempt in the same outage window: give up and shut
        # back down cleanly while we still have battery to do it.
        emergency_shutdown_after_outage
        exit 1
      fi
      warn "Startup deferred due to low battery after recent outage; marking for second-outage fallback."
      save_recovery_state 1 1 deferred_low_battery
      exit 1
    fi
    STARTUP_ATTEMPTED_DURING_OUTAGE=1
    save_recovery_state 1 1 waiting_for_api
  fi

  if ! wait_for_api; then
    die "Kubernetes API did not become reachable in time."
  fi
  mark_checkpoint startup_api_ready

  ensure_harbor_target_node
  ensure_harbor_host_label
  mark_checkpoint startup_harbor_host_labeled

  # NOTE(review): usage states --allow-flux-source-mutation is required for
  # --force-flux-url; enforcement is assumed to happen during argument
  # parsing (not visible here) — confirm before relying on this breakglass.
  if [[ -n "${FORCE_FLUX_URL}" ]]; then
    warn "Breakglass: forcing Flux source URL to '${FORCE_FLUX_URL}'."
    run kubectl -n flux-system patch gitrepository flux-system --type=merge -p "{\"spec\":{\"url\":\"${FORCE_FLUX_URL}\"}}"
    mark_checkpoint startup_flux_url_forced
  fi

  if [[ -n "${FORCE_FLUX_BRANCH}" ]]; then
    run kubectl -n flux-system patch gitrepository flux-system --type=merge -p "{\"spec\":{\"ref\":{\"branch\":\"${FORCE_FLUX_BRANCH}\"}}}"
    mark_checkpoint startup_flux_branch_forced
  fi

  assert_flux_source_expected

  if [[ "${SKIP_LOCAL_BOOTSTRAP}" -eq 0 ]]; then
    # Only fall back to local manifests when the Flux GitRepository is not
    # reporting Ready=True.
    if ! kubectl -n flux-system get gitrepository flux-system -o jsonpath='{.status.conditions[?(@.type=="Ready")].status}' 2>/dev/null | grep -q True; then
      warn "Flux source not Ready; executing local bootstrap fallback path."
      bootstrap_local_minimal
      mark_checkpoint startup_local_bootstrap_complete
      check_ingress_stack
      check_longhorn_stack
      check_vault_stack
      check_postgres_stack
      check_gitea_stack

      if [[ "${SKIP_HARBOR_BOOTSTRAP}" -eq 0 ]]; then
        if harbor_is_ready; then
          log "Harbor already healthy; skipping Harbor seed/bootstrap."
        else
          if [[ "${SKIP_HARBOR_SEED}" -eq 0 ]]; then
            if [[ "${SKIP_HELPER_PREWARM}" -eq 0 ]]; then
              prewarm_node_helper_image
            fi
            seed_harbor_images
          else
            warn "Skipping Harbor seed/import by request."
          fi
          bootstrap_local_harbor
          mark_checkpoint startup_local_harbor_applied
          check_harbor_stack
          check_harbor_endpoint
        fi
      else
        warn "Skipping Harbor bootstrap fallback by request."
      fi
    fi
  else
    warn "Skipping local bootstrap fallback by request."
  fi

  resume_flux_and_reconcile
  wait_for_flux_kustomizations_ready
  # Restore the replica counts captured during planned_shutdown, then wait for
  # workloads, the service checklist, and a stability window before declaring
  # the recovery finished.
  restore_workload_replica_snapshot
  wait_for_startup_workloads_ready
  wait_for_startup_service_checklist
  wait_for_startup_stability_window

  if [[ "${SKIP_HELPER_PREWARM}" -eq 0 ]]; then
    prewarm_node_helper_image
    mark_checkpoint startup_helper_prewarmed
  fi
  clear_recovery_state
  log "Startup flow complete."
}
prepare_flow() {
  # Stage recovery prerequisites while the cluster is still healthy:
  # require the Harbor bundle, label the Harbor host, and optionally
  # prewarm the node helper image.
  if [[ ! -f "${HARBOR_BUNDLE_FILE}" ]]; then
    die "Harbor bundle missing at ${HARBOR_BUNDLE_FILE}. Build and copy it to the canonical control host first."
  fi

  ensure_harbor_target_node
  ensure_harbor_host_label
  mark_checkpoint prepare_harbor_host_labeled

  if [[ "${SKIP_HELPER_PREWARM}" -eq 0 ]]; then
    prewarm_node_helper_image
    mark_checkpoint prepare_helper_prewarmed
  fi

  log "Prepare flow complete."
}
harbor_seed_flow() {
  # Seed Harbor from the offline image bundle, then verify the registry end
  # to end with an HTTP endpoint check followed by a pull canary.
  if [[ ! -f "${HARBOR_BUNDLE_FILE}" ]]; then
    die "Harbor bundle missing at ${HARBOR_BUNDLE_FILE}. Build and copy it to the canonical control host first."
  fi

  if [[ "${SKIP_HELPER_PREWARM}" -eq 0 ]]; then
    prewarm_node_helper_image
    mark_checkpoint harbor_seed_helper_prewarmed
  fi

  seed_harbor_images
  check_harbor_endpoint
  run_harbor_pull_canary

  log "Harbor seed flow complete."
}
# ---- Main entry point -------------------------------------------------------
# Load persisted recovery state, log the effective configuration, then
# dispatch to the flow selected by MODE.
load_recovery_state

log "mode=${MODE} execute=${EXECUTE}"
log "shutdown-mode=${SHUTDOWN_MODE}"
log "recovery-state-file=${RECOVERY_STATE_FILE}"
log "bundle-file=${HARBOR_BUNDLE_FILE}"
log "node-helper-image=${NODE_HELPER_IMAGE}"
log "harbor-target-node-config=${HARBOR_TARGET_NODE:-auto}"
log "harbor-canary-node-config=${HARBOR_CANARY_NODE:-auto}"
log "harbor-host-label-key=${HARBOR_HOST_LABEL_KEY}"
log "expected-flux-url=${EXPECTED_FLUX_URL}"
log "expected-flux-branch=${EXPECTED_FLUX_BRANCH}"
log "startup-optional-kustomizations=${STARTUP_OPTIONAL_KUSTOMIZATIONS:-none}"

report_flux_source_state

case "${MODE}" in
  status)
    status_report
    ;;
  prepare)
    prepare_flow
    ;;
  harbor-seed)
    harbor_seed_flow
    ;;
  shutdown)
    planned_shutdown
    ;;
  startup)
    startup_flow
    ;;
  *)
    # Defensive: argument parsing should already reject unknown modes, but
    # fail loudly here too instead of silently exiting 0 having done nothing.
    die "Unknown mode: ${MODE}"
    ;;
esac