#!/usr/bin/env bash
# Cluster power-recovery orchestrator: prepare/status/harbor-seed/shutdown/startup.
set -euo pipefail

# Directory containing this script; all relative paths are resolved from here.
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
# Repository root (override with ANANKE_REPO_DIR); defaults to the script's parent.
REPO_DIR="${ANANKE_REPO_DIR:-$(cd "${SCRIPT_DIR}/.." && pwd)}"
BOOTSTRAP_DIR="${SCRIPT_DIR}/bootstrap"
CONFIG_FILE="${BOOTSTRAP_DIR}/recovery-config.env"

# Optional operator overrides sourced before any defaults are computed.
if [[ -f "${CONFIG_FILE}" ]]; then
  # shellcheck disable=SC1090
  source "${CONFIG_FILE}"
fi

# Fall back to a kubeconfig shipped next to the script when none is set.
if [[ -z "${KUBECONFIG:-}" && -f "${SCRIPT_DIR}/kubeconfig" ]]; then
  export KUBECONFIG="${SCRIPT_DIR}/kubeconfig"
fi
# Print CLI usage for all modes and flags. Defaults shown reflect current env overrides.
usage() {
  cat <<USAGE
Usage:
  scripts/cluster_power_recovery.sh <prepare|status|harbor-seed|shutdown|startup> [options]

Options:
  --execute                         Actually run commands (default is dry-run)
  --expected-flux-branch <name>     Expected Flux source branch during startup checks (default: ${DEFAULT_FLUX_BRANCH:-main})
  --force-flux-branch <name>        Startup: patch flux-system GitRepository branch to this value
  --skip-etcd-snapshot              Shutdown: skip etcd snapshot before shutdown
  --skip-drain                      Shutdown: skip worker drain during shutdown
  --skip-local-bootstrap            Startup: skip local bootstrap fallback applies
  --skip-harbor-bootstrap           Startup: skip Harbor recovery bootstrap stage
  --skip-harbor-seed                Startup: skip Harbor image seed/import stage
  --skip-helper-prewarm             Prepare/Shutdown/Startup: skip node-helper prewarm
  --min-startup-battery <pct>       Minimum UPS percent required before bootstrap (default: 35)
  --ups-host <name>                 UPS identifier for upsc (default: ups@localhost)
  --ups-battery-key <key>           UPS battery key for upsc (default: battery.charge)
  --recovery-state-file <path>      Recovery state file for outage-aware restart logic
  --harbor-bundle-file <path>       Harbor bootstrap bundle on the control host
  --harbor-target-node <name>       Node that should host Harbor during bootstrap (default: auto)
  --harbor-canary-node <name>       Node used for Harbor pull canary (default: auto)
  --harbor-canary-image <image>     Harbor-backed image used for pull canary (default: ${HARBOR_CANARY_IMAGE:-registry.bstein.dev/bstein/kubectl:1.35.0})
  --node-helper-image <image>       Privileged helper image used for host operations (default: ${NODE_HELPER_IMAGE:-registry.bstein.dev/bstein/ananke-node-helper:0.1.0})
  --bundle-http-port <port>         Temporary HTTP port used to serve bootstrap bundles (default: ${BUNDLE_HTTP_PORT:-8877})
  --api-wait-timeout <seconds>      Startup: Kubernetes API wait timeout (default: 600)
  --drain-timeout <seconds>         Worker drain timeout for normal shutdown (default: 180)
  --emergency-drain-timeout <seconds>
                                    Worker drain timeout for emergency fallback (default: 45)
  --require-ups-battery             Hard-fail startup if UPS battery cannot be read
  -h, --help                        Show help

Examples:
  scripts/cluster_power_recovery.sh prepare --execute
  scripts/cluster_power_recovery.sh harbor-seed --execute
  scripts/cluster_power_recovery.sh status
  scripts/cluster_power_recovery.sh shutdown --execute
  scripts/cluster_power_recovery.sh startup --execute --force-flux-branch main
USAGE
}
# First positional argument selects the operating mode; validate it up front.
MODE="${1:-}"
if [[ -z "${MODE}" || "${MODE}" == "-h" || "${MODE}" == "--help" ]]; then
  usage
  exit 0
fi
shift || true

case "${MODE}" in
  prepare|status|harbor-seed|shutdown|startup) ;;
  *)
    echo "Unknown mode: ${MODE}" >&2
    usage
    exit 1
    ;;
esac
# Defaults for all tunables. Environment variables (possibly set by the sourced
# recovery-config.env) win over the baked-in values; CLI flags override both.
EXECUTE=0
EXPECTED_FLUX_BRANCH="${DEFAULT_FLUX_BRANCH:-main}"
FORCE_FLUX_BRANCH=""
SKIP_ETCD_SNAPSHOT=0
SKIP_DRAIN=0
SKIP_LOCAL_BOOTSTRAP=0
SKIP_HARBOR_BOOTSTRAP=0
SKIP_HARBOR_SEED=0
SKIP_HELPER_PREWARM=0
UPS_HOST="${UPS_HOST:-ups@localhost}"
UPS_BATTERY_KEY="${UPS_BATTERY_KEY:-battery.charge}"
MIN_STARTUP_BATTERY="${MIN_STARTUP_BATTERY:-35}"
REQUIRE_UPS_BATTERY="${REQUIRE_UPS_BATTERY:-0}"
DRAIN_TIMEOUT_SECONDS=180
EMERGENCY_DRAIN_TIMEOUT_SECONDS=45
API_WAIT_TIMEOUT_SECONDS=600
BUNDLE_HTTP_PORT="${BUNDLE_HTTP_PORT:-8877}"

# Local state lives under the operator's home directory by default.
STATE_ROOT="${HOME}/${STATE_SUBDIR:-.local/share/ananke}"
RECOVERY_STATE_FILE="${STATE_ROOT}/cluster_power_recovery.state"
HARBOR_BUNDLE_FILE="${STATE_ROOT}/bundles/${HARBOR_BUNDLE_BASENAME:-harbor-bootstrap-v2.14.1-arm64.tar.zst}"

# Harbor bootstrap placement; empty means auto-select a Ready arm64 worker.
HARBOR_TARGET_NODE="${HARBOR_TARGET_NODE:-}"
HARBOR_CANARY_NODE="${HARBOR_CANARY_NODE:-}"
HARBOR_CANARY_IMAGE="${HARBOR_CANARY_IMAGE:-registry.bstein.dev/bstein/kubectl:1.35.0}"
NODE_HELPER_IMAGE="${NODE_HELPER_IMAGE:-registry.bstein.dev/bstein/ananke-node-helper:0.1.0}"
NODE_HELPER_NAMESPACE="${NODE_HELPER_NAMESPACE:-maintenance}"
NODE_HELPER_SERVICE_ACCOUNT="${NODE_HELPER_SERVICE_ACCOUNT:-default}"
NODE_HELPER_PREWARM_DS="${NODE_HELPER_PREWARM_DS:-ananke-node-helper-prewarm}"
REGISTRY_PULL_SECRET="${REGISTRY_PULL_SECRET:-harbor-regcred}"
KEEP_PREWARM_DAEMONSET=0

# Runtime bookkeeping (mutated as the script progresses).
RECOVERY_PENDING=0
STARTUP_ATTEMPTED_DURING_OUTAGE=0
LAST_CHECKPOINT="none"
BUNDLE_SERVER_PID=""
UPS_HOST_IN_USE=""
# Parse CLI flags; every option overrides the defaults established above.
while [[ $# -gt 0 ]]; do
  case "$1" in
    --execute)
      EXECUTE=1
      shift
      ;;
    --expected-flux-branch)
      EXPECTED_FLUX_BRANCH="${2:?missing branch}"
      shift 2
      ;;
    --force-flux-branch)
      FORCE_FLUX_BRANCH="${2:?missing branch}"
      shift 2
      ;;
    --skip-etcd-snapshot)
      SKIP_ETCD_SNAPSHOT=1
      shift
      ;;
    --skip-drain)
      SKIP_DRAIN=1
      shift
      ;;
    --skip-local-bootstrap)
      SKIP_LOCAL_BOOTSTRAP=1
      shift
      ;;
    --skip-harbor-bootstrap)
      SKIP_HARBOR_BOOTSTRAP=1
      shift
      ;;
    --skip-harbor-seed)
      SKIP_HARBOR_SEED=1
      shift
      ;;
    --skip-helper-prewarm)
      SKIP_HELPER_PREWARM=1
      shift
      ;;
    --ups-host)
      UPS_HOST="${2:?missing ups host}"
      shift 2
      ;;
    --ups-battery-key)
      UPS_BATTERY_KEY="${2:?missing ups key}"
      shift 2
      ;;
    --min-startup-battery)
      MIN_STARTUP_BATTERY="${2:?missing battery threshold}"
      shift 2
      ;;
    --require-ups-battery)
      REQUIRE_UPS_BATTERY=1
      shift
      ;;
    --recovery-state-file)
      RECOVERY_STATE_FILE="${2:?missing state file path}"
      shift 2
      ;;
    --harbor-bundle-file)
      HARBOR_BUNDLE_FILE="${2:?missing bundle file path}"
      shift 2
      ;;
    --harbor-target-node)
      HARBOR_TARGET_NODE="${2:?missing harbor target node}"
      shift 2
      ;;
    --harbor-canary-node)
      HARBOR_CANARY_NODE="${2:?missing harbor canary node}"
      shift 2
      ;;
    --harbor-canary-image)
      HARBOR_CANARY_IMAGE="${2:?missing canary image}"
      shift 2
      ;;
    --node-helper-image)
      NODE_HELPER_IMAGE="${2:?missing node helper image}"
      shift 2
      ;;
    --bundle-http-port)
      BUNDLE_HTTP_PORT="${2:?missing bundle http port}"
      shift 2
      ;;
    --api-wait-timeout)
      API_WAIT_TIMEOUT_SECONDS="${2:?missing api wait timeout}"
      shift 2
      ;;
    --drain-timeout)
      DRAIN_TIMEOUT_SECONDS="${2:?missing drain timeout}"
      shift 2
      ;;
    --emergency-drain-timeout)
      EMERGENCY_DRAIN_TIMEOUT_SECONDS="${2:?missing emergency drain timeout}"
      shift 2
      ;;
    -h|--help)
      usage
      exit 0
      ;;
    *)
      echo "Unknown option: $1" >&2
      usage
      exit 1
      ;;
  esac
done
# Abort with a clear message when a required executable is not on PATH.
require_cmd() {
  local cmd="$1"
  if ! command -v "${cmd}" >/dev/null 2>&1; then
    echo "Missing required command: ${cmd}" >&2
    exit 1
  fi
}
# Hard prerequisites for every mode.
require_cmd kubectl
require_cmd bash
require_cmd base64
require_cmd curl
# Logging helpers: info to stdout, warnings/errors to stderr; die exits non-zero.
log() { echo "[cluster-power] $*"; }
warn() { echo "[cluster-power][warn] $*" >&2; }
die() { echo "[cluster-power][error] $*" >&2; exit 1; }

# Execute argv directly when --execute was given; otherwise only print it.
run() {
  if [[ "${EXECUTE}" -eq 1 ]]; then
    log "EXEC: $*"
    "$@"
  else
    log "DRY-RUN: $*"
  fi
}

# Like run(), but evaluates the argument as a shell command line (for pipes/||).
run_shell() {
  if [[ "${EXECUTE}" -eq 1 ]]; then
    log "EXEC: $*"
    bash -lc "$*"
  else
    log "DRY-RUN: $*"
  fi
}
# Render a repo-relative kustomization and apply it, honoring dry-run mode.
apply_kustomization() {
  local path="$1"
  local full_path="${REPO_DIR}/${path}"
  if [[ "${EXECUTE}" -eq 1 ]]; then
    log "EXEC: kubectl kustomize ${full_path} --load-restrictor=LoadRestrictionsNone | kubectl apply -f -"
    kubectl kustomize "${full_path}" --load-restrictor=LoadRestrictionsNone | kubectl apply -f -
  else
    log "DRY-RUN: kubectl kustomize ${full_path} --load-restrictor=LoadRestrictionsNone | kubectl apply -f -"
  fi
}
# Lowercase a string and collapse any non [a-z0-9-] run into a single dash
# (used to build DNS-safe pod names).
sanitize_name() {
  printf '%s' "$1" | tr '[:upper:]' '[:lower:]' | tr -cs 'a-z0-9-' '-'
}

# Directory holding the recovery state file.
state_dir() {
  dirname "${RECOVERY_STATE_FILE}"
}

# Load outage-recovery bookkeeping from disk into the three globals;
# missing file leaves the safe defaults in place.
load_recovery_state() {
  RECOVERY_PENDING=0
  STARTUP_ATTEMPTED_DURING_OUTAGE=0
  LAST_CHECKPOINT="none"
  [[ -f "${RECOVERY_STATE_FILE}" ]] || return 0
  while IFS='=' read -r key value; do
    case "${key}" in
      recovery_pending) RECOVERY_PENDING="${value}" ;;
      startup_attempted) STARTUP_ATTEMPTED_DURING_OUTAGE="${value}" ;;
      last_checkpoint) LAST_CHECKPOINT="${value}" ;;
    esac
  done <"${RECOVERY_STATE_FILE}"
}

# Persist state ($1=recovery_pending $2=startup_attempted $3=last_checkpoint);
# no-op in dry-run so status/inspection never mutates disk.
save_recovery_state() {
  [[ "${EXECUTE}" -eq 1 ]] || return 0
  mkdir -p "$(state_dir)"
  cat >"${RECOVERY_STATE_FILE}" <<STATE
recovery_pending=${1}
startup_attempted=${2}
last_checkpoint=${3}
STATE
}

# Record a named checkpoint and flush current state to disk.
mark_checkpoint() {
  LAST_CHECKPOINT="$1"
  save_recovery_state "${RECOVERY_PENDING}" "${STARTUP_ATTEMPTED_DURING_OUTAGE}" "${LAST_CHECKPOINT}"
}

# Remove the state file once recovery completes (execute mode only).
clear_recovery_state() {
  [[ "${EXECUTE}" -eq 1 ]] || return 0
  rm -f "${RECOVERY_STATE_FILE}" 2>/dev/null || true
  LAST_CHECKPOINT="none"
}
# Normalize a upsc reading ("battery.charge: 85.3") to an integer percent;
# returns 1 when no integer can be extracted.
sanitize_battery_percent() {
  local raw="$1"
  raw="${raw##*:}"            # drop any "key:" prefix
  raw="${raw//[[:space:]]/}"  # strip whitespace
  raw="${raw%%.*}"            # truncate decimals
  [[ "${raw}" =~ ^[0-9]+$ ]] || return 1
  printf '%s' "${raw}"
}

# Emit candidate UPS identifiers, configured host first, then every UPS that
# `upsc -l` reports (as name@localhost and bare name), de-duplicated.
candidate_ups_hosts() {
  local candidate name
  local -A seen=()
  if [[ -n "${UPS_HOST}" ]]; then
    seen["${UPS_HOST}"]=1
    echo "${UPS_HOST}"
  fi
  while IFS= read -r name; do
    [[ -n "${name}" ]] || continue
    for candidate in "${name}@localhost" "${name}"; do
      [[ -n "${seen[${candidate}]+x}" ]] && continue
      seen["${candidate}"]=1
      echo "${candidate}"
    done
  done < <(upsc -l 2>/dev/null || true)
}

# Print battery percent from the first candidate UPS that answers; records the
# responding host in UPS_HOST_IN_USE. Returns 1 when upsc is absent or silent.
read_ups_battery() {
  if ! command -v upsc >/dev/null 2>&1; then
    return 1
  fi
  local host raw parsed
  while IFS= read -r host; do
    raw="$(upsc "${host}" "${UPS_BATTERY_KEY}" 2>/dev/null || true)"
    [[ -n "${raw}" ]] || continue
    parsed="$(sanitize_battery_percent "${raw}" || true)"
    [[ -n "${parsed}" ]] || continue
    UPS_HOST_IN_USE="${host}"
    printf '%s' "${parsed}"
    return 0
  done < <(candidate_ups_hosts)
  return 1
}
# Gate bootstrap on UPS charge: fail when below MIN_STARTUP_BATTERY. An
# unreadable UPS is fatal only when --require-ups-battery was given.
ensure_minimum_battery_for_bootstrap() {
  local battery
  battery="$(read_ups_battery || true)"
  if [[ -z "${battery}" ]]; then
    if [[ "${REQUIRE_UPS_BATTERY}" -eq 1 ]]; then
      warn "Unable to read UPS battery status and --require-ups-battery is set."
      return 1
    fi
    warn "Unable to read UPS battery status; continuing without hard battery gating."
    return 0
  fi
  log "ups-battery=${battery}% host=${UPS_HOST_IN_USE:-${UPS_HOST}}"
  if (( battery < MIN_STARTUP_BATTERY )); then
    warn "UPS battery ${battery}% below minimum startup threshold ${MIN_STARTUP_BATTERY}%."
    return 1
  fi
  return 0
}
# Log the flux-system GitRepository URL/branch; during startup (without a
# forced branch) warn when the live branch differs from the expected one.
report_flux_source_state() {
  local flux_url flux_branch
  flux_url="$(kubectl -n flux-system get gitrepository flux-system -o jsonpath='{.spec.url}' 2>/dev/null || true)"
  flux_branch="$(kubectl -n flux-system get gitrepository flux-system -o jsonpath='{.spec.ref.branch}' 2>/dev/null || true)"
  [[ -n "${flux_url}" ]] && log "flux-source-url=${flux_url}"
  if [[ -n "${flux_branch}" ]]; then
    log "flux-source-branch=${flux_branch}"
    if [[ "${MODE}" == "startup" && -z "${FORCE_FLUX_BRANCH}" && "${flux_branch}" != "${EXPECTED_FLUX_BRANCH}" ]]; then
      warn "Flux source branch is '${flux_branch}'. Expected '${EXPECTED_FLUX_BRANCH}' for canonical recovery."
    fi
  fi
}
# Poll the Kubernetes API every 5s until it answers or the configured
# timeout elapses; skipped entirely in dry-run.
wait_for_api() {
  local attempts=$((API_WAIT_TIMEOUT_SECONDS / 5))
  if (( attempts < 1 )); then
    attempts=1
  fi
  if [[ "${EXECUTE}" -eq 0 ]]; then
    log "DRY-RUN: skipping live Kubernetes API wait"
    return 0
  fi
  local i
  for i in $(seq 1 "${attempts}"); do
    if kubectl version --request-timeout=5s >/dev/null 2>&1; then
      return 0
    fi
    sleep 5
  done
  return 1
}
# Set spec.suspend=$1 (true/false) on every Flux Kustomization in flux-system
# and every HelmRelease cluster-wide, via run() so dry-run only prints.
patch_flux_suspend_all() {
  local value="$1"
  local patch
  patch=$(printf '{"spec":{"suspend":%s}}' "${value}")
  local ks_list hr_list
  ks_list="$(kubectl -n flux-system get kustomizations.kustomize.toolkit.fluxcd.io -o jsonpath='{range .items[*]}{.metadata.name}{"\n"}{end}' || true)"
  hr_list="$(kubectl get helmreleases.helm.toolkit.fluxcd.io -A -o jsonpath='{range .items[*]}{.metadata.namespace}{"/"}{.metadata.name}{"\n"}{end}' || true)"
  while IFS= read -r k; do
    [[ -z "${k}" ]] && continue
    run kubectl -n flux-system patch kustomization "${k}" --type=merge -p "${patch}"
  done <<<"${ks_list}"
  while IFS= read -r hr; do
    [[ -z "${hr}" ]] && continue
    local ns="${hr%%/*}"
    local name="${hr##*/}"
    run kubectl -n "${ns}" patch helmrelease "${name}" --type=merge -p "${patch}"
  done <<<"${hr_list}"
}
# Scale app Deployments/StatefulSets to zero in every namespace except the
# infrastructure ones listed in the exclusion regex; failures are tolerated.
best_effort_scale_down_apps() {
  local excludes='^(kube-system|kube-public|kube-node-lease|flux-system|traefik|metallb-system|cert-manager|longhorn-system|vault|postgres|maintenance)$'
  local ns_list
  ns_list="$(kubectl get ns -o jsonpath='{range .items[*]}{.metadata.name}{"\n"}{end}')"
  while IFS= read -r ns; do
    [[ -z "${ns}" ]] && continue
    if [[ "${ns}" =~ ${excludes} ]]; then
      continue
    fi
    run_shell "kubectl -n ${ns} scale deployment --all --replicas=0 || true"
    run_shell "kubectl -n ${ns} scale statefulset --all --replicas=0 || true"
  done <<<"${ns_list}"
}
# Print a comma-separated list of nodes that carry neither the control-plane
# nor the master role label.
discover_workers_csv() {
  kubectl get nodes \
    -o custom-columns=NAME:.metadata.name,CP:.metadata.labels.node-role\\.kubernetes\\.io/control-plane,MASTER:.metadata.labels.node-role\\.kubernetes\\.io/master \
    --no-headers \
    | awk '$2=="<none>" && $3=="<none>" {print $1}' \
    | paste -sd, -
}
# Return success when the named node reports a Ready=True condition.
node_is_ready() {
  local node="$1"
  [[ -n "${node}" ]] || return 1
  local ready
  ready="$(kubectl get node "${node}" -o jsonpath='{range .status.conditions[?(@.type=="Ready")]}{.status}{end}' 2>/dev/null || true)"
  [[ "${ready}" == "True" ]]
}

# Pick a Ready arm64 worker, preferring hardware=rpi5, then rpi4, then any.
select_ready_arm64_worker() {
  local rows node
  rows="$(kubectl get nodes -o 'custom-columns=NAME:.metadata.name,ARCH:.metadata.labels.kubernetes\.io/arch,WORKER:.metadata.labels.node-role\.kubernetes\.io/worker,HARDWARE:.metadata.labels.hardware,READY:.status.conditions[?(@.type=="Ready")].status' --no-headers 2>/dev/null || true)"
  [[ -n "${rows}" ]] || return 1
  node="$(printf '%s\n' "${rows}" | awk '$2=="arm64" && $3=="true" && $4=="rpi5" && $5=="True" {print $1; exit}')"
  if [[ -n "${node}" ]]; then
    printf '%s' "${node}"
    return 0
  fi
  node="$(printf '%s\n' "${rows}" | awk '$2=="arm64" && $3=="true" && $4=="rpi4" && $5=="True" {print $1; exit}')"
  if [[ -n "${node}" ]]; then
    printf '%s' "${node}"
    return 0
  fi
  node="$(printf '%s\n' "${rows}" | awk '$2=="arm64" && $3=="true" && $5=="True" {print $1; exit}')"
  if [[ -n "${node}" ]]; then
    printf '%s' "${node}"
    return 0
  fi
  return 1
}

# Make sure HARBOR_TARGET_NODE names a Ready node, auto-selecting a fallback
# worker when it is unset or not Ready; dies when no candidate exists.
ensure_harbor_target_node() {
  if node_is_ready "${HARBOR_TARGET_NODE}"; then
    return 0
  fi
  local fallback
  fallback="$(select_ready_arm64_worker || true)"
  [[ -n "${fallback}" ]] || die "No Ready arm64 worker available for Harbor bootstrap target."
  if [[ -n "${HARBOR_TARGET_NODE}" ]]; then
    warn "Configured harbor target node '${HARBOR_TARGET_NODE}' is not Ready; using '${fallback}' instead."
  else
    log "harbor-target-node auto-selected: ${fallback}"
  fi
  HARBOR_TARGET_NODE="${fallback}"
}
# Split a comma-separated string ($1) into the array named by $2.
as_array_from_csv() {
  local csv="$1"
  local out_var="$2"
  local old_ifs="${IFS}"
  IFS=',' read -r -a _tmp <<<"${csv}"
  IFS="${old_ifs}"
  eval "${out_var}"'=( "${_tmp[@]}" )'
}
# Cordon and drain each worker ($2..) with escalating force: gentle drain,
# then --force, finally --disable-eviction; never aborts the shutdown.
best_effort_drain_workers() {
  local timeout_seconds="$1"
  shift || true
  local workers=("$@")
  local node
  for node in "${workers[@]}"; do
    [[ -z "${node}" ]] && continue
    run kubectl cordon "${node}"
    if run_shell "kubectl drain ${node} --ignore-daemonsets --delete-emptydir-data --grace-period=30 --timeout=${timeout_seconds}s"; then
      continue
    fi
    warn "Gentle drain timed out for ${node}; retrying with --force."
    if run_shell "kubectl drain ${node} --ignore-daemonsets --delete-emptydir-data --grace-period=30 --timeout=${timeout_seconds}s --force"; then
      continue
    fi
    warn "Force drain timed out for ${node}; final attempt with --disable-eviction."
    run_shell "kubectl drain ${node} --ignore-daemonsets --delete-emptydir-data --grace-period=30 --timeout=${timeout_seconds}s --force --disable-eviction || true"
  done
}
# Wait for a workload rollout ($1=namespace $2=kind $3=name $4=timeout);
# prints the command only in dry-run.
wait_for_rollout() {
  local namespace="$1"
  local kind="$2"
  local name="$3"
  local timeout="$4"
  if [[ "${EXECUTE}" -eq 0 ]]; then
    log "DRY-RUN: kubectl -n ${namespace} rollout status ${kind}/${name} --timeout=${timeout}"
    return 0
  fi
  kubectl -n "${namespace}" rollout status "${kind}/${name}" --timeout="${timeout}"
}
# Per-stack readiness checks used during startup verification.

check_ingress_stack() {
  kubectl get ingressclass traefik >/dev/null
  wait_for_rollout traefik deployment traefik 5m
}

check_longhorn_stack() {
  wait_for_rollout longhorn-system daemonset longhorn-manager 10m
  wait_for_rollout longhorn-system deployment longhorn-ui 10m
}

check_vault_stack() {
  wait_for_rollout vault statefulset vault 10m
  # Probe Vault itself; only meaningful once the pod is live (execute mode).
  if [[ "${EXECUTE}" -eq 1 ]]; then
    kubectl -n vault exec vault-0 -- sh -ceu 'VAULT_ADDR=http://127.0.0.1:8200 vault status >/dev/null'
  fi
}

check_postgres_stack() {
  wait_for_rollout postgres statefulset postgres 10m
  if [[ "${EXECUTE}" -eq 1 ]]; then
    kubectl -n postgres exec postgres-0 -c postgres -- sh -ceu 'pg_isready -h 127.0.0.1 -p 5432 >/dev/null'
  fi
}

check_gitea_stack() {
  wait_for_rollout gitea deployment gitea 10m
}

check_harbor_stack() {
  wait_for_rollout harbor statefulset harbor-redis 10m
  wait_for_rollout harbor deployment harbor-core 10m
  wait_for_rollout harbor deployment harbor-jobservice 10m
  wait_for_rollout harbor deployment harbor-portal 10m
  wait_for_rollout harbor deployment harbor-registry 10m
}

# Hit the Harbor registry API; 200 or 401 (auth required) both count as up.
check_harbor_endpoint() {
  if [[ "${EXECUTE}" -eq 0 ]]; then
    log "DRY-RUN: curl -ksS -o /dev/null -w '%{http_code}' https://registry.bstein.dev/v2/"
    return 0
  fi
  local code
  code="$(curl -ksS -o /dev/null -w '%{http_code}' https://registry.bstein.dev/v2/ || true)"
  case "${code}" in
    200|401)
      log "harbor-endpoint=http-${code}"
      ;;
    *)
      die "Harbor endpoint check failed with HTTP ${code:-unknown}"
      ;;
  esac
}
# Poll a pod until it reaches the expected phase; returns 1 on phase Failed
# or when timeout_seconds elapse.
wait_for_pod_phase() {
  local namespace="$1"
  local pod="$2"
  local expected_phase="$3"
  local timeout_seconds="$4"
  local start now phase
  start="$(date +%s)"
  while true; do
    phase="$(kubectl -n "${namespace}" get pod "${pod}" -o jsonpath='{.status.phase}' 2>/dev/null || true)"
    if [[ "${phase}" == "${expected_phase}" ]]; then
      return 0
    fi
    if [[ "${phase}" == "Failed" ]]; then
      return 1
    fi
    now="$(date +%s)"
    if (( now - start >= timeout_seconds )); then
      return 1
    fi
    sleep 2
  done
}
# True when Harbor's core deployments exist and the registry API answers
# with HTTP 200 or 401.
harbor_is_ready() {
  kubectl -n harbor get deploy harbor-core harbor-jobservice harbor-portal harbor-registry >/dev/null 2>&1 || return 1
  local code
  code="$(curl -ksS -o /dev/null -w '%{http_code}' https://registry.bstein.dev/v2/ || true)"
  [[ "${code}" == "200" || "${code}" == "401" ]]
}
# Launch a one-shot pod that pulls a Harbor-hosted image on a Ready node to
# prove registry pulls work end-to-end; cleans the pod up afterwards.
run_harbor_pull_canary() {
  local pod="ananke-harbor-canary"
  local canary_node="${HARBOR_CANARY_NODE}"
  # Fall back to the Harbor target node when no Ready canary node is configured.
  if ! node_is_ready "${canary_node}"; then
    ensure_harbor_target_node
    canary_node="${HARBOR_TARGET_NODE}"
    if [[ -n "${HARBOR_CANARY_NODE}" ]]; then
      warn "Configured harbor canary node '${HARBOR_CANARY_NODE}' is not Ready; using '${canary_node}'."
    fi
    HARBOR_CANARY_NODE="${canary_node}"
  fi
  if [[ "${EXECUTE}" -eq 0 ]]; then
    log "DRY-RUN: create Harbor pull canary pod with ${HARBOR_CANARY_IMAGE} on ${canary_node}"
    return 0
  fi
  timeout 20 kubectl -n "${NODE_HELPER_NAMESPACE}" delete pod "${pod}" --ignore-not-found --wait=false >/dev/null 2>&1 || true
  cat <<CANARY | kubectl apply -f -
apiVersion: v1
kind: Pod
metadata:
  name: ${pod}
  namespace: ${NODE_HELPER_NAMESPACE}
spec:
  nodeName: ${canary_node}
  restartPolicy: Never
  imagePullSecrets:
    - name: ${REGISTRY_PULL_SECRET}
  tolerations:
    - operator: Exists
  containers:
    - name: canary
      image: ${HARBOR_CANARY_IMAGE}
      imagePullPolicy: Always
      command: ["sh", "-ceu", "echo harbor-canary-ok"]
CANARY
  if ! wait_for_pod_phase "${NODE_HELPER_NAMESPACE}" "${pod}" Succeeded 180; then
    kubectl -n "${NODE_HELPER_NAMESPACE}" describe pod "${pod}" >&2 || true
    timeout 20 kubectl -n "${NODE_HELPER_NAMESPACE}" logs "${pod}" >&2 || true
    timeout 20 kubectl -n "${NODE_HELPER_NAMESPACE}" delete pod "${pod}" --ignore-not-found --wait=false >/dev/null 2>&1 || true
    return 1
  fi
  timeout 20 kubectl -n "${NODE_HELPER_NAMESPACE}" logs "${pod}" || true
  timeout 20 kubectl -n "${NODE_HELPER_NAMESPACE}" delete pod "${pod}" --ignore-not-found --wait=false >/dev/null 2>&1 || true
}
# Run a base64-encoded script in a privileged, host-PID helper pod pinned to
# one node ($1); waits for Succeeded within $3 seconds, then cleans up.
run_helper_pod() {
  local node="$1"
  local purpose="$2"
  local timeout_seconds="$3"
  local script_content="$4"
  local pod="ananke-$(sanitize_name "${purpose}")-$(date +%H%M%S)"
  local encoded_script
  # Base64 keeps the payload safe inside the YAML heredoc below.
  encoded_script="$(printf '%s' "${script_content}" | base64 -w0)"
  if [[ "${EXECUTE}" -eq 0 ]]; then
    log "DRY-RUN: helper pod ${pod} on ${node} for ${purpose}"
    return 0
  fi
  cat <<POD | kubectl apply -f -
apiVersion: v1
kind: Pod
metadata:
  name: ${pod}
  namespace: ${NODE_HELPER_NAMESPACE}
spec:
  nodeName: ${node}
  restartPolicy: Never
  serviceAccountName: ${NODE_HELPER_SERVICE_ACCOUNT}
  imagePullSecrets:
    - name: ${REGISTRY_PULL_SECRET}
  hostNetwork: true
  hostPID: true
  tolerations:
    - operator: Exists
  containers:
    - name: helper
      image: ${NODE_HELPER_IMAGE}
      imagePullPolicy: IfNotPresent
      securityContext:
        privileged: true
      command: ["/bin/bash", "-ceu"]
      args:
        - |
          printf '%s' '${encoded_script}' | base64 -d >/tmp/ananke-step.sh
          chmod +x /tmp/ananke-step.sh
          /tmp/ananke-step.sh
POD
  if ! wait_for_pod_phase "${NODE_HELPER_NAMESPACE}" "${pod}" Succeeded "${timeout_seconds}"; then
    kubectl -n "${NODE_HELPER_NAMESPACE}" describe pod "${pod}" >&2 || true
    timeout 20 kubectl -n "${NODE_HELPER_NAMESPACE}" logs "${pod}" >&2 || true
    timeout 20 kubectl -n "${NODE_HELPER_NAMESPACE}" delete pod "${pod}" --ignore-not-found --wait=false >/dev/null 2>&1 || true
    return 1
  fi
  timeout 20 kubectl -n "${NODE_HELPER_NAMESPACE}" logs "${pod}" || true
  timeout 20 kubectl -n "${NODE_HELPER_NAMESPACE}" delete pod "${pod}" --ignore-not-found --wait=false >/dev/null 2>&1 || true
}
# Run an arbitrary host command on a node by wrapping it in an nsenter script
# executed via a privileged helper pod (see run_helper_pod).
run_host_command_via_helper() {
  local node="$1"
  local purpose="$2"
  local timeout_seconds="$3"
  local host_command="$4"
  local encoded_command
  encoded_command="$(printf '%s' "${host_command}" | base64 -w0)"
  local script_content
  # \$ / \${...} defer expansion to the helper pod, not this shell.
  script_content=$(cat <<SCRIPT
set -euo pipefail
HOST_COMMAND="\$(printf '%s' '${encoded_command}' | base64 -d)"
nsenter --target 1 --mount --uts --ipc --net --pid /bin/sh -ceu "\${HOST_COMMAND}"
SCRIPT
)
  run_helper_pod "${node}" "${purpose}" "${timeout_seconds}" "${script_content}"
}
# Fast path: exec a host command through an already-running prewarm DaemonSet
# pod on the node; returns 1 when no such pod exists so callers can fall back.
run_host_command_via_prewarm_pod() {
  local node="$1"
  local host_command="$2"
  local pod encoded_command
  pod="$(kubectl -n "${NODE_HELPER_NAMESPACE}" get pods -l app="${NODE_HELPER_PREWARM_DS}" --field-selector "spec.nodeName=${node}" -o jsonpath='{.items[0].metadata.name}' 2>/dev/null || true)"
  if [[ -z "${pod}" ]]; then
    return 1
  fi
  encoded_command="$(printf '%s' "${host_command}" | base64 -w0)"
  if [[ "${EXECUTE}" -eq 0 ]]; then
    log "DRY-RUN: helper exec via ${pod} on ${node}"
    return 0
  fi
  run kubectl -n "${NODE_HELPER_NAMESPACE}" exec "${pod}" -- /bin/bash -ceu "HOST_COMMAND=\$(printf '%s' '${encoded_command}' | base64 -d); nsenter --target 1 --mount --uts --ipc --net --pid /bin/sh -ceu \"\${HOST_COMMAND}\""
}
# Schedule a delayed stop of $2 plus poweroff on node $1 via systemd-run,
# preferring the prewarm-pod exec path and falling back to a one-shot helper.
schedule_host_shutdown_via_helper() {
  local node="$1"
  local service_name="$2"
  local delay_seconds="$3"
  local host_command
  host_command="/usr/bin/systemd-run --unit ananke-shutdown-${service_name} --on-active=${delay_seconds}s /bin/sh -lc '/usr/bin/systemctl stop ${service_name} || true; /usr/bin/systemctl poweroff || true'"
  if run_host_command_via_prewarm_pod "${node}" "${host_command}"; then
    return 0
  fi
  run_host_command_via_helper "${node}" "shutdown-${node}-${service_name}" 120 "${host_command}"
}
# Pre-pull the node-helper image on every node via a short-lived DaemonSet so
# later host operations don't depend on registry availability. The DaemonSet
# is deleted once fully ready unless KEEP_PREWARM_DAEMONSET=1.
prewarm_node_helper_image() {
  local name="${NODE_HELPER_PREWARM_DS}"
  if [[ "${EXECUTE}" -eq 0 ]]; then
    log "DRY-RUN: prewarm ${NODE_HELPER_IMAGE} via temporary DaemonSet"
    return 0
  fi
  cat <<DS | kubectl apply -f -
apiVersion: apps/v1
kind: DaemonSet
metadata:
  name: ${name}
  namespace: ${NODE_HELPER_NAMESPACE}
spec:
  selector:
    matchLabels:
      app: ${name}
  template:
    metadata:
      labels:
        app: ${name}
    spec:
      imagePullSecrets:
        - name: ${REGISTRY_PULL_SECRET}
      tolerations:
        - operator: Exists
      containers:
        - name: helper
          image: ${NODE_HELPER_IMAGE}
          imagePullPolicy: Always
          command: ["/bin/sh", "-ceu", "sleep 300"]
DS
  # Poll up to ~180s for every scheduled pod to become ready.
  local i desired ready
  for i in $(seq 1 90); do
    desired="$(kubectl -n "${NODE_HELPER_NAMESPACE}" get ds "${name}" -o jsonpath='{.status.desiredNumberScheduled}' 2>/dev/null || echo 0)"
    ready="$(kubectl -n "${NODE_HELPER_NAMESPACE}" get ds "${name}" -o jsonpath='{.status.numberReady}' 2>/dev/null || echo 0)"
    [[ -n "${desired}" ]] || desired=0
    [[ -n "${ready}" ]] || ready=0
    if [[ "${desired}" != "0" && "${desired}" == "${ready}" ]]; then
      log "node-helper-prewarm=${ready}/${desired}"
      if [[ "${KEEP_PREWARM_DAEMONSET}" -eq 0 ]]; then
        kubectl -n "${NODE_HELPER_NAMESPACE}" delete ds "${name}" --ignore-not-found >/dev/null 2>&1 || true
      else
        log "Keeping ${name} DaemonSet running for shutdown helper exec path."
      fi
      return 0
    fi
    sleep 2
  done
  kubectl -n "${NODE_HELPER_NAMESPACE}" describe ds "${name}" >&2 || true
  kubectl -n "${NODE_HELPER_NAMESPACE}" get pods -l app="${name}" >&2 || true
  kubectl -n "${NODE_HELPER_NAMESPACE}" delete ds "${name}" --ignore-not-found >/dev/null 2>&1 || true
  die "Timed out prewarming node helper image ${NODE_HELPER_IMAGE}"
}
2026-04-06 21:27:23 -03:00
# Best-effort removal of the prewarm DaemonSet; absence is not an error.
cleanup_prewarm_daemonset() {
  if [[ "${EXECUTE}" -ne 0 ]]; then
    kubectl -n "${NODE_HELPER_NAMESPACE}" delete ds "${NODE_HELPER_PREWARM_DS}" --ignore-not-found >/dev/null 2>&1 || true
  else
    log "DRY-RUN: cleanup ${NODE_HELPER_PREWARM_DS} DaemonSet"
  fi
}
2026-04-06 04:47:05 -03:00
# Serve the Harbor image bundle over plain HTTP from a background python
# server so node helpers can fetch it. Records the PID in BUNDLE_SERVER_PID
# and waits until the bundle is actually downloadable before returning.
start_bundle_server() {
  [[ -f "${HARBOR_BUNDLE_FILE}" ]] || die "Harbor bundle not found at ${HARBOR_BUNDLE_FILE}"
  require_cmd python3
  local serve_dir serve_file
  serve_dir="$(dirname "${HARBOR_BUNDLE_FILE}")"
  serve_file="$(basename "${HARBOR_BUNDLE_FILE}")"
  if [[ "${EXECUTE}" -eq 0 ]]; then
    log "DRY-RUN: serve ${serve_file} from ${serve_dir} on port ${BUNDLE_HTTP_PORT}"
    return 0
  fi

  python3 -m http.server "${BUNDLE_HTTP_PORT}" --bind 0.0.0.0 --directory "${serve_dir}" </dev/null >/tmp/ananke-bundle-server.log 2>&1 &
  BUNDLE_SERVER_PID=$!

  # Probe for readiness, up to ~20 seconds.
  local attempt
  for attempt in {1..20}; do
    if curl -fsS "http://127.0.0.1:${BUNDLE_HTTP_PORT}/${serve_file}" >/dev/null 2>&1; then
      return 0
    fi
    sleep 1
  done

  die "Temporary bundle server did not become ready; see /tmp/ananke-bundle-server.log"
}
2026-04-06 04:47:05 -03:00
# Stop the temporary bundle HTTP server, if one was started.
# Sends SIGTERM and waits up to 10s for the process to exit; if it is still
# alive after the grace period, escalates to SIGKILL so the server is never
# orphaned (the original cleared BUNDLE_SERVER_PID even when the process
# survived, losing the only handle to it). Always clears BUNDLE_SERVER_PID.
stop_bundle_server() {
  if [[ -n "${BUNDLE_SERVER_PID}" ]]; then
    kill "${BUNDLE_SERVER_PID}" >/dev/null 2>&1 || true
    local _i
    for _i in $(seq 1 10); do
      kill -0 "${BUNDLE_SERVER_PID}" >/dev/null 2>&1 || break
      sleep 1
    done
    # Escalate if the TERM grace period was not enough.
    if kill -0 "${BUNDLE_SERVER_PID}" >/dev/null 2>&1; then
      kill -9 "${BUNDLE_SERVER_PID}" >/dev/null 2>&1 || true
    fi
    BUNDLE_SERVER_PID=""
  fi
}
# Ensure the temporary bundle HTTP server is torn down on any script exit.
trap stop_bundle_server EXIT
# Print this control host's primary IP address (first token of `hostname -I`).
control_host_ip() {
  local primary_ip
  primary_ip="$(hostname -I | awk '{print $1}')"
  printf '%s\n' "${primary_ip}"
}
seed_harbor_images( ) {
local images_text control_ip bundle_name script_content seed_rc = 0
[ [ -f " ${ HARBOR_BUNDLE_FILE } " ] ] || die " Harbor bundle not found at ${ HARBOR_BUNDLE_FILE } "
2026-04-06 21:27:23 -03:00
ensure_harbor_target_node
2026-04-06 04:47:05 -03:00
images_text = " $( sed '/^[[:space:]]*#/d;/^[[:space:]]*$/d' " ${ BOOTSTRAP_DIR } /harbor-bootstrap-images.txt " ) "
[ [ -n " ${ images_text } " ] ] || die " No Harbor images listed in ${ BOOTSTRAP_DIR } /harbor-bootstrap-images.txt "
bundle_name = " $( basename " ${ HARBOR_BUNDLE_FILE } " ) "
start_bundle_server
control_ip = " $( control_host_ip) "
script_content = $( cat <<SCRIPT
set -euo pipefail
curl -fsSL " http:// ${ control_ip } : ${ BUNDLE_HTTP_PORT } / ${ bundle_name } " \
| zstd -dc \
| nsenter --target 1 --mount --uts --ipc --net --pid /usr/local/bin/k3s ctr images import -
while IFS = read -r image; do
[ [ -z "\${image}" ] ] && continue
nsenter --target 1 --mount --uts --ipc --net --pid /usr/local/bin/k3s ctr images ls | awk '{print \$1}' | grep -Fx "\${image}" >/dev/null
done <<'IMAGES'
${ images_text }
IMAGES
SCRIPT
)
run_helper_pod " ${ HARBOR_TARGET_NODE } " "harbor-seed" 900 " ${ script_content } " || seed_rc = $?
stop_bundle_server
[ [ " ${ seed_rc } " -eq 0 ] ] || return " ${ seed_rc } "
mark_checkpoint startup_harbor_seeded
2026-04-06 00:22:54 -03:00
}
# Apply the minimal set of local kustomizations needed to bring the cluster's
# core services up without a working Flux source. Order matters: plumbing
# (core/helm/longhorn/metallb/traefik/vault-csi/vault-injector) before the
# stateful services (vault, postgres, gitea).
bootstrap_local_minimal() {
  local layer
  for layer in \
    infrastructure/core \
    infrastructure/sources/helm \
    infrastructure/longhorn/core \
    infrastructure/metallb \
    infrastructure/traefik \
    infrastructure/vault-csi \
    infrastructure/vault-injector \
    services/vault \
    infrastructure/postgres \
    services/gitea; do
    apply_kustomization "${layer}"
  done
}
# Apply only the Harbor service kustomization (local fallback path).
bootstrap_local_harbor() {
  apply_kustomization "services/harbor"
}
2026-04-06 04:47:05 -03:00
# Reconcile a named stage of Flux kustomizations.
# Arguments: $1 stage name (for checkpointing); remaining args are the
# kustomization names to reconcile in order.
# When the flux CLI is unavailable, falls back to annotating every
# Kustomization with a reconcile request instead.
reconcile_stage() {
  local stage_name="$1"
  shift
  if ! command -v flux >/dev/null 2>&1; then
    local now
    now="$(date --iso-8601=seconds)"
    run kubectl -n flux-system annotate kustomizations.kustomize.toolkit.fluxcd.io --all reconcile.fluxcd.io/requestedAt="${now}" --overwrite
    # Fix: record the checkpoint on the fallback path too, so resumable-state
    # logic does not believe this stage was never reconciled.
    mark_checkpoint "reconciled_${stage_name}"
    return 0
  fi

  local item
  for item in "$@"; do
    run flux reconcile kustomization "${item}" -n flux-system --with-source --timeout=15m
  done
  mark_checkpoint "reconciled_${stage_name}"
}
2026-04-06 04:47:05 -03:00
resume_flux_and_reconcile( ) {
patch_flux_suspend_all false
if command -v flux >/dev/null 2>& 1; then
run flux reconcile source git flux-system -n flux-system --timeout= 3m
2026-04-06 00:22:54 -03:00
fi
2026-04-06 04:47:05 -03:00
reconcile_stage core core helm longhorn metallb traefik vault-csi vault-injector
check_ingress_stack
check_longhorn_stack
reconcile_stage stateful vault postgres gitea
check_vault_stack
check_postgres_stack
check_gitea_stack
reconcile_stage registry harbor
check_harbor_stack
check_harbor_endpoint
run_harbor_pull_canary
}
2026-04-06 00:22:54 -03:00
2026-04-06 04:47:05 -03:00
status_report( ) {
local battery flux_ready harbor_code workers
2026-04-06 21:27:23 -03:00
local effective_target effective_canary
2026-04-06 04:47:05 -03:00
battery = " $( read_ups_battery || true ) "
flux_ready = " $( kubectl -n flux-system get gitrepository flux-system -o jsonpath = '{.status.conditions[?(@.type=="Ready")].status}' 2>/dev/null || true ) "
harbor_code = " $( curl -ksS -o /dev/null -w '%{http_code}' https://registry.bstein.dev/v2/ || true ) "
workers = " $( discover_workers_csv 2>/dev/null || true ) "
2026-04-06 21:27:23 -03:00
effective_target = " ${ HARBOR_TARGET_NODE } "
if ! node_is_ready " ${ effective_target } " ; then
effective_target = " $( select_ready_arm64_worker || true ) "
fi
effective_canary = " ${ HARBOR_CANARY_NODE } "
if ! node_is_ready " ${ effective_canary } " ; then
effective_canary = " ${ effective_target } "
fi
2026-04-06 04:47:05 -03:00
echo "mode=status"
echo " bundle_file= ${ HARBOR_BUNDLE_FILE } "
echo " bundle_present= $( [ [ -f " ${ HARBOR_BUNDLE_FILE } " ] ] && echo true || echo false ) "
echo " node_helper_image= ${ NODE_HELPER_IMAGE } "
2026-04-06 21:27:23 -03:00
echo " harbor_target_node= ${ effective_target :- unknown } "
echo " harbor_canary_node= ${ effective_canary :- unknown } "
2026-04-06 04:47:05 -03:00
echo " workers= ${ workers } "
echo " recovery_pending= ${ RECOVERY_PENDING } "
echo " startup_attempted= ${ STARTUP_ATTEMPTED_DURING_OUTAGE } "
echo " last_checkpoint= ${ LAST_CHECKPOINT } "
echo " ups_host= ${ UPS_HOST_IN_USE :- ${ UPS_HOST } } "
echo " ups_battery= ${ battery :- unknown } "
echo " flux_source_ready= ${ flux_ready :- unknown } "
echo " harbor_http= ${ harbor_code :- unknown } "
kubectl get ingressclass traefik >/dev/null 2>& 1 && echo "traefik_ingressclass=true" || echo "traefik_ingressclass=false"
kubectl -n traefik get deploy traefik >/dev/null 2>& 1 && echo "traefik_deploy=true" || echo "traefik_deploy=false"
kubectl -n longhorn-system get ds longhorn-manager >/dev/null 2>& 1 && echo "longhorn_manager=true" || echo "longhorn_manager=false"
kubectl -n vault get sts vault >/dev/null 2>& 1 && echo "vault_statefulset=true" || echo "vault_statefulset=false"
kubectl -n postgres get sts postgres >/dev/null 2>& 1 && echo "postgres_statefulset=true" || echo "postgres_statefulset=false"
kubectl -n gitea get deploy gitea >/dev/null 2>& 1 && echo "gitea_deploy=true" || echo "gitea_deploy=false"
kubectl -n harbor get deploy harbor-core >/dev/null 2>& 1 && echo "harbor_deploy=true" || echo "harbor_deploy=false"
}
planned_shutdown( ) {
local workers_csv
workers_csv = " $( discover_workers_csv 2>/dev/null || true ) "
as_array_from_csv " ${ workers_csv } " WORKER_NODES
as_array_from_csv "titan-0a,titan-0b,titan-0c" CONTROL_PLANE_NODES
RECOVERY_PENDING = 1
STARTUP_ATTEMPTED_DURING_OUTAGE = 0
save_recovery_state 1 0 shutdown_started
if [ [ " ${ SKIP_HELPER_PREWARM } " -eq 0 ] ] ; then
2026-04-06 21:27:23 -03:00
KEEP_PREWARM_DAEMONSET = 1
2026-04-06 04:47:05 -03:00
prewarm_node_helper_image
mark_checkpoint shutdown_helper_prewarmed
fi
2026-04-06 00:22:54 -03:00
if [ [ " ${ SKIP_ETCD_SNAPSHOT } " -eq 0 ] ] ; then
2026-04-06 04:47:05 -03:00
local ts
ts = " $( date +%Y%m%d-%H%M%S) "
run_host_command_via_helper " ${ CONTROL_PLANE_NODES [0] } " "etcd-snapshot" 300 " /usr/local/bin/k3s etcd-snapshot save --name pre-shutdown- ${ ts } "
mark_checkpoint shutdown_snapshot_complete
2026-04-06 00:22:54 -03:00
else
warn "Skipping etcd snapshot by request."
fi
patch_flux_suspend_all true
best_effort_scale_down_apps
2026-04-06 04:47:05 -03:00
mark_checkpoint shutdown_apps_scaled_down
2026-04-06 00:22:54 -03:00
if [ [ " ${ SKIP_DRAIN } " -eq 0 ] ] ; then
best_effort_drain_workers " ${ DRAIN_TIMEOUT_SECONDS } " " ${ WORKER_NODES [@] } "
2026-04-06 04:47:05 -03:00
mark_checkpoint shutdown_workers_drained
2026-04-06 00:22:54 -03:00
else
warn "Skipping worker drain by request."
fi
2026-04-06 04:47:05 -03:00
local node
for node in " ${ WORKER_NODES [@] } " ; do
[ [ -z " ${ node } " ] ] && continue
schedule_host_shutdown_via_helper " ${ node } " k3s-agent 20
done
mark_checkpoint shutdown_workers_scheduled
2026-04-06 00:22:54 -03:00
2026-04-06 04:47:05 -03:00
for node in " ${ CONTROL_PLANE_NODES [@] } " ; do
[ [ -z " ${ node } " ] ] && continue
schedule_host_shutdown_via_helper " ${ node } " k3s 45
done
2026-04-06 21:27:23 -03:00
if [ [ " ${ SKIP_HELPER_PREWARM } " -eq 0 ] ] ; then
cleanup_prewarm_daemonset
fi
2026-04-06 04:47:05 -03:00
mark_checkpoint shutdown_control_planes_scheduled
log "Shutdown actions scheduled on hosts."
}
2026-04-06 00:22:54 -03:00
2026-04-06 04:47:05 -03:00
# Second-outage fallback: the battery budget is too low to finish startup, so
# best-effort suspend Flux, scale apps down, drain workers, then run the full
# planned shutdown. All prep steps are best-effort (|| true) because the
# cluster may already be partially down.
# Fix: corrected the warn message grammar ("due insufficient" -> "due to
# insufficient").
emergency_shutdown_after_outage() {
  warn "Entering outage-aware emergency shutdown path due to insufficient startup budget."
  patch_flux_suspend_all true || true
  best_effort_scale_down_apps || true
  local workers_csv
  workers_csv="$(discover_workers_csv 2>/dev/null || true)"
  as_array_from_csv "${workers_csv}" WORKER_NODES
  best_effort_drain_workers "${EMERGENCY_DRAIN_TIMEOUT_SECONDS}" "${WORKER_NODES[@]}" || true
  planned_shutdown
}
startup_flow( ) {
if [ [ " ${ RECOVERY_PENDING } " -eq 1 ] ] ; then
if ! ensure_minimum_battery_for_bootstrap; then
if [ [ " ${ STARTUP_ATTEMPTED_DURING_OUTAGE } " -eq 1 ] ] ; then
emergency_shutdown_after_outage
exit 1
fi
warn "Startup deferred due low battery after recent outage; marking for second-outage fallback."
save_recovery_state 1 1 deferred_low_battery
2026-04-06 00:22:54 -03:00
exit 1
fi
2026-04-06 04:47:05 -03:00
STARTUP_ATTEMPTED_DURING_OUTAGE = 1
save_recovery_state 1 1 waiting_for_api
2026-04-06 00:22:54 -03:00
fi
2026-04-06 04:47:05 -03:00
if ! wait_for_api; then
die "Kubernetes API did not become reachable in time."
fi
mark_checkpoint startup_api_ready
2026-04-06 00:22:54 -03:00
2026-04-06 04:47:05 -03:00
if [ [ -n " ${ FORCE_FLUX_BRANCH } " ] ] ; then
run kubectl -n flux-system patch gitrepository flux-system --type= merge -p " {\"spec\":{\"ref\":{\"branch\":\" ${ FORCE_FLUX_BRANCH } \"}}} "
mark_checkpoint startup_flux_branch_forced
fi
2026-04-06 00:22:54 -03:00
2026-04-06 04:47:05 -03:00
if [ [ " ${ SKIP_LOCAL_BOOTSTRAP } " -eq 0 ] ] ; then
if ! kubectl -n flux-system get gitrepository flux-system -o jsonpath = '{.status.conditions[?(@.type=="Ready")].status}' 2>/dev/null | grep -q True; then
warn "Flux source not Ready; executing local bootstrap fallback path."
bootstrap_local_minimal
mark_checkpoint startup_local_bootstrap_complete
check_ingress_stack
check_longhorn_stack
check_vault_stack
check_postgres_stack
check_gitea_stack
2026-04-06 00:22:54 -03:00
2026-04-06 04:47:05 -03:00
if [ [ " ${ SKIP_HARBOR_BOOTSTRAP } " -eq 0 ] ] ; then
if harbor_is_ready; then
log "Harbor already healthy; skipping Harbor seed/bootstrap."
else
if [ [ " ${ SKIP_HARBOR_SEED } " -eq 0 ] ] ; then
if [ [ " ${ SKIP_HELPER_PREWARM } " -eq 0 ] ] ; then
prewarm_node_helper_image
fi
seed_harbor_images
else
warn "Skipping Harbor seed/import by request."
fi
bootstrap_local_harbor
mark_checkpoint startup_local_harbor_applied
check_harbor_stack
check_harbor_endpoint
fi
else
warn "Skipping Harbor bootstrap fallback by request."
fi
2026-04-06 00:22:54 -03:00
fi
2026-04-06 04:47:05 -03:00
else
warn "Skipping local bootstrap fallback by request."
2026-04-06 00:22:54 -03:00
fi
2026-04-06 04:47:05 -03:00
resume_flux_and_reconcile
if [ [ " ${ SKIP_HELPER_PREWARM } " -eq 0 ] ] ; then
prewarm_node_helper_image
mark_checkpoint startup_helper_prewarmed
fi
clear_recovery_state
log "Startup flow complete."
}
# Pre-outage preparation: confirm the Harbor bundle exists on this host and,
# unless skipped, prewarm the helper image on every node.
prepare_flow() {
  if [[ ! -f "${HARBOR_BUNDLE_FILE}" ]]; then
    die "Harbor bundle missing at ${HARBOR_BUNDLE_FILE}. Build and copy it to the canonical control host first."
  fi
  if [[ "${SKIP_HELPER_PREWARM}" -eq 0 ]]; then
    prewarm_node_helper_image
    mark_checkpoint prepare_helper_prewarmed
  fi
  log "Prepare flow complete."
}
# Standalone Harbor seed: import the bundle into containerd on the target
# node, then verify the registry endpoint and run the image pull canary.
harbor_seed_flow() {
  if [[ ! -f "${HARBOR_BUNDLE_FILE}" ]]; then
    die "Harbor bundle missing at ${HARBOR_BUNDLE_FILE}. Build and copy it to the canonical control host first."
  fi
  if [[ "${SKIP_HELPER_PREWARM}" -eq 0 ]]; then
    prewarm_node_helper_image
    mark_checkpoint harbor_seed_helper_prewarmed
  fi
  seed_harbor_images
  check_harbor_endpoint
  run_harbor_pull_canary
  log "Harbor seed flow complete."
}
# --- Main: load persisted recovery state, log the effective configuration,
# and dispatch to the selected mode's flow.
load_recovery_state
log "mode=${MODE} execute=${EXECUTE}"
log "recovery-state-file=${RECOVERY_STATE_FILE}"
log "bundle-file=${HARBOR_BUNDLE_FILE}"
log "node-helper-image=${NODE_HELPER_IMAGE}"
log "harbor-target-node-config=${HARBOR_TARGET_NODE:-auto}"
log "harbor-canary-node-config=${HARBOR_CANARY_NODE:-auto}"
report_flux_source_state
case "${MODE}" in
  status)
    status_report
    ;;
  prepare)
    prepare_flow
    ;;
  harbor-seed)
    harbor_seed_flow
    ;;
  shutdown)
    planned_shutdown
    ;;
  startup)
    startup_flow
    ;;
  *)
    # Defensive default: argument parsing should already have validated MODE,
    # but fail loudly instead of silently doing nothing on an unknown mode.
    die "Unknown mode: ${MODE}"
    ;;
esac