diff --git a/clusters/atlas/flux-system/applications/gitea/kustomization.yaml b/clusters/atlas/flux-system/applications/gitea/kustomization.yaml index dbf71797..4761e9d9 100644 --- a/clusters/atlas/flux-system/applications/gitea/kustomization.yaml +++ b/clusters/atlas/flux-system/applications/gitea/kustomization.yaml @@ -14,6 +14,7 @@ spec: name: flux-system namespace: flux-system dependsOn: + - name: longhorn - name: vault - name: postgres wait: true diff --git a/clusters/atlas/flux-system/applications/harbor/kustomization.yaml b/clusters/atlas/flux-system/applications/harbor/kustomization.yaml index 5eec32fc..615e8cd6 100644 --- a/clusters/atlas/flux-system/applications/harbor/kustomization.yaml +++ b/clusters/atlas/flux-system/applications/harbor/kustomization.yaml @@ -16,3 +16,6 @@ spec: wait: false dependsOn: - name: core + - name: longhorn + - name: vault + - name: postgres diff --git a/clusters/atlas/flux-system/applications/keycloak/kustomization.yaml b/clusters/atlas/flux-system/applications/keycloak/kustomization.yaml index 4c3a0bc8..0311f2e9 100644 --- a/clusters/atlas/flux-system/applications/keycloak/kustomization.yaml +++ b/clusters/atlas/flux-system/applications/keycloak/kustomization.yaml @@ -13,6 +13,7 @@ spec: path: ./services/keycloak targetNamespace: sso dependsOn: + - name: longhorn - name: vault - name: postgres timeout: 2m diff --git a/clusters/atlas/flux-system/applications/vault/kustomization.yaml b/clusters/atlas/flux-system/applications/vault/kustomization.yaml index 2e68b000..e9ef2a2c 100644 --- a/clusters/atlas/flux-system/applications/vault/kustomization.yaml +++ b/clusters/atlas/flux-system/applications/vault/kustomization.yaml @@ -15,4 +15,5 @@ spec: prune: true wait: true dependsOn: + - name: longhorn - name: helm diff --git a/clusters/atlas/flux-system/platform/postgres/kustomization.yaml b/clusters/atlas/flux-system/platform/postgres/kustomization.yaml index 59cfb10d..60046e5d 100644 --- a/clusters/atlas/flux-system/platform/postgres/kustomization.yaml +++ b/clusters/atlas/flux-system/platform/postgres/kustomization.yaml @@ -14,6 +14,7 @@ spec: name: flux-system targetNamespace: postgres dependsOn: + - name: longhorn - name: vault - name: vault-csi healthChecks: diff --git a/knowledge/runbooks/cluster-power-recovery.md b/knowledge/runbooks/cluster-power-recovery.md new file mode 100644 index 00000000..5dac1567 --- /dev/null +++ b/knowledge/runbooks/cluster-power-recovery.md @@ -0,0 +1,66 @@ +Atlas Cluster Power Recovery (Graceful Shutdown/Startup) + +Purpose +- Provide a safe operator flow for planned power events and cold-boot recovery. +- Avoid the Flux/Gitea bootstrap deadlock by using a local bootstrap fallback path. +- Refuse bootstrap when UPS charge is too low, and fall back to fast shutdown if a second outage hits mid-recovery. + +Bootstrapping risk to remember +- Flux source is Git over SSH to `scm.bstein.dev` (Gitea). +- Gitea itself is a Flux-managed workload and depends on storage + database. +- Harbor is also critical, but it is not part of the first recovery stage because Harbor currently serves its own runtime images. +- On cold boot, if Flux cannot fetch source before Gitea is up, reconciliation can stall. +- Recovery path: bring control plane and workers up, then locally apply minimal platform stack (`core -> helm -> longhorn -> metallb -> traefik -> vault-csi -> vault-injector -> vault -> postgres -> gitea`), then resume/reconcile Flux. Harbor is a later recovery stage after storage, Vault, Postgres, and Gitea are back. 
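To tell whether the deadlock is actually in play on a given boot, the check below uses the same `Ready` condition the recovery script keys off. This is a sketch: it assumes a working kubeconfig, and the `gitea` namespace name is an assumption, not confirmed by this diff.

```bash
# "True" means Flux can already fetch source from scm.bstein.dev and will
# reconcile on its own; no local bootstrap fallback is needed.
kubectl -n flux-system get gitrepository flux-system \
  -o jsonpath='{.status.conditions[?(@.type=="Ready")].status}'

# If the condition is not True and Gitea itself has no running pods, the
# Flux<->Gitea deadlock is live: apply the minimal platform stack locally.
kubectl -n gitea get pods    # namespace name assumed
```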
+ +Script +- `scripts/cluster_power_recovery.sh` +- `scripts/cluster_power_console.sh` +- Modes: + - `shutdown` + - `startup` +- Default is dry-run. Add `--execute` to actually perform actions. + +Dry-run examples +- Shutdown preview: + - `scripts/cluster_power_recovery.sh shutdown --skip-etcd-snapshot --skip-drain` +- Startup preview: + - `scripts/cluster_power_recovery.sh startup` + +Execute examples +- Planned shutdown: + - `scripts/cluster_power_recovery.sh shutdown --execute` +- Planned startup (canonical branch): + - `scripts/cluster_power_recovery.sh startup --execute --force-flux-branch main` + +Manual remote console examples +- From `titan-24` with a local checkout: + - `~/Development/titan-iac/scripts/cluster_power_console.sh shutdown --execute` + - `~/Development/titan-iac/scripts/cluster_power_console.sh startup --execute --force-flux-branch main` +- From `titan-db`, if the checkout is not present locally, the console wrapper can delegate to `titan-24`: + - `~/Development/titan-iac/scripts/cluster_power_console.sh --delegate-host titan-24 shutdown --execute` + - `~/Development/titan-iac/scripts/cluster_power_console.sh --delegate-host titan-24 startup --execute --force-flux-branch main` + +Useful options +- `--control-planes titan-0a,titan-0b,titan-0c` +- `--workers <csv>` (otherwise the script tries API discovery first, then falls back to the static atlas worker inventory) +- `--expected-flux-branch main` +- `--force-flux-branch main` +- `--skip-local-bootstrap` (not recommended for cold-start recovery) +- `--skip-harbor-bootstrap` (skip the Harbor recovery stage if you know Harbor should stay deferred) +- `--min-startup-battery 35` +- `--ups-host ups@localhost` +- `--require-ups-battery` +- `--drain-timeout 180` +- `--emergency-drain-timeout 45` +- `--recovery-state-file ~/.local/state/cluster_power_recovery.state` + +Operational notes +- The flow suspends Flux Kustomizations/HelmReleases during shutdown to prevent churn. +- Worker drain is no longer best-effort only. The script escalates from a normal drain to `--force`, then to `--disable-eviction`, each time the configured timeout is exhausted. +- During startup, if Flux source is not `Ready`, the local bootstrap fallback is applied first. +- Longhorn is reconciled before Vault/Postgres/Gitea so storage-backed services are not racing the volume layer. +- Harbor is reconciled after the first critical stateful services. Treat Harbor bootstrap as requiring either cached Harbor runtime images on the scheduled node or a separate bootstrap source for those images. +- The script persists outage state in `~/.local/state/cluster_power_recovery.state` by default. If startup is attempted during an outage window and power becomes unstable again, rerunning startup with insufficient UPS charge will flip into the emergency shutdown path instead of continuing to bootstrap. +- In dry-run mode, the script skips the live API wait step so preview runs do not stall on an offline cluster. +- After bootstrap, Flux resources are resumed and reconciled. +- Keep this runbook aligned with `clusters/atlas/flux-system/gotk-sync.yaml`. diff --git a/scripts/cluster_power_console.sh b/scripts/cluster_power_console.sh new file mode 100755 index 00000000..cd71bdc3 --- /dev/null +++ b/scripts/cluster_power_console.sh @@ -0,0 +1,68 @@ +#!/usr/bin/env bash +set -euo pipefail + +usage() { + cat <<'USAGE' +Usage: + scripts/cluster_power_console.sh [--repo-dir <dir>] [--delegate-host <host>] [recovery-script-options...]
+ +Purpose: + Friendly manual entrypoint for running cluster power recovery from a remote console. + If the repo checkout exists locally, run the recovery script here. + Otherwise, delegate to another host that has the repo checkout. + +Defaults: + --repo-dir $HOME/Development/titan-iac + --delegate-host titan-24 + +Examples: + scripts/cluster_power_console.sh shutdown --execute + scripts/cluster_power_console.sh startup --execute --force-flux-branch main + scripts/cluster_power_console.sh --delegate-host titan-24 shutdown --execute +USAGE +} + +REPO_DIR="${HOME}/Development/titan-iac" +DELEGATE_HOST="titan-24" + +while [[ $# -gt 0 ]]; do + case "$1" in + --repo-dir) + REPO_DIR="${2:-}" + shift 2 + ;; + --delegate-host) + DELEGATE_HOST="${2:-}" + shift 2 + ;; + -h|--help) + usage + exit 0 + ;; + *) + break + ;; + esac +done + +if [[ $# -lt 1 ]]; then + usage + exit 1 +fi + +LOCAL_SCRIPT="${REPO_DIR}/scripts/cluster_power_recovery.sh" + +if [[ -x "${LOCAL_SCRIPT}" ]] && command -v kubectl >/dev/null 2>&1; then + exec "${LOCAL_SCRIPT}" "$@" +fi + +if [[ -z "${DELEGATE_HOST}" ]]; then + echo "cluster-power-console: local repo checkout not found at ${REPO_DIR} and no delegate host configured" >&2 + exit 1 +fi + +quoted_repo_dir="$(printf '%q' "${REPO_DIR}")" +quoted_args="$(printf '%q ' "$@")" + +exec ssh -o BatchMode=yes -o ConnectTimeout=8 "${DELEGATE_HOST}" \ + "cd ${quoted_repo_dir} && ./scripts/cluster_power_recovery.sh ${quoted_args}" diff --git a/scripts/cluster_power_recovery.sh b/scripts/cluster_power_recovery.sh new file mode 100755 index 00000000..0b745e4c --- /dev/null +++ b/scripts/cluster_power_recovery.sh @@ -0,0 +1,561 @@ +#!/usr/bin/env bash +set -euo pipefail + +usage() { + cat <<'USAGE' +Usage: + scripts/cluster_power_recovery.sh shutdown [options] + scripts/cluster_power_recovery.sh startup [options] + +Options: + --execute Actually run commands (default is dry-run) + --ssh-user <user> SSH user for node commands (default: current SSH config user) + --control-planes <csv> Control plane hosts (default: titan-0a,titan-0b,titan-0c) + --workers <csv> Worker hosts (default: static atlas inventory, with API discovery when available) + --expected-flux-branch <branch> Expected Flux source branch during startup checks (default: main) + --skip-etcd-snapshot Skip etcd snapshot before shutdown + --skip-drain Skip worker drain during shutdown + --skip-local-bootstrap Startup: skip local bootstrap fallback applies + --skip-harbor-bootstrap Startup: skip Harbor recovery bootstrap stage + --force-flux-branch <branch> Startup: patch flux-system GitRepository branch to this value + --min-startup-battery <percent> Minimum UPS percent required before bootstrap (default: 35) + --ups-host <host> UPS identifier for upsc (default: ups@localhost) + --ups-battery-key <key> UPS battery key for upsc (default: battery.charge) + --recovery-state-file <path> Recovery state file for second-outage detection + --drain-timeout <seconds> Worker drain timeout for normal shutdown (default: 180) + --emergency-drain-timeout <seconds> + Worker drain timeout for emergency fallback (default: 45) + --require-ups-battery Hard-fail startup if UPS battery cannot be read + -h, --help Show help + +Examples: + scripts/cluster_power_recovery.sh shutdown --execute + scripts/cluster_power_recovery.sh startup --execute --force-flux-branch main +USAGE +} + +MODE="${1:-}" +if [[ -z "${MODE}" || "${MODE}" == "-h" || "${MODE}" == "--help" ]]; then + usage + exit 0 +fi +shift || true + +if [[ "${MODE}" != "shutdown" && "${MODE}" != "startup" ]]; then + echo "Unknown mode: ${MODE}" >&2 + usage + exit 1 +fi + +EXECUTE=0
+SSH_USER="" +CONTROL_PLANES="titan-0a,titan-0b,titan-0c" +WORKERS="" +DEFAULT_WORKERS="titan-04,titan-05,titan-06,titan-07,titan-08,titan-09,titan-10,titan-11,titan-12,titan-13,titan-14,titan-15,titan-17,titan-18,titan-19,titan-20,titan-21,titan-22,titan-24" +EXPECTED_FLUX_BRANCH="main" +SKIP_ETCD_SNAPSHOT=0 +SKIP_DRAIN=0 +SKIP_LOCAL_BOOTSTRAP=0 +SKIP_HARBOR_BOOTSTRAP=0 +FORCE_FLUX_BRANCH="" +UPS_HOST="ups@localhost" +UPS_BATTERY_KEY="battery.charge" +RECOVERY_STATE_FILE="${HOME}/.local/state/cluster_power_recovery.state" +MIN_STARTUP_BATTERY=35 +DRAIN_TIMEOUT_SECONDS=180 +EMERGENCY_DRAIN_TIMEOUT_SECONDS=45 +REQUIRE_UPS_BATTERY=0 + +RECOVERY_PENDING=0 +STARTUP_ATTEMPTED_DURING_OUTAGE=0 + +while [[ $# -gt 0 ]]; do + case "$1" in + --execute) + EXECUTE=1 + shift + ;; + --ssh-user) + SSH_USER="${2:-}" + shift 2 + ;; + --control-planes) + CONTROL_PLANES="${2:-}" + shift 2 + ;; + --workers) + WORKERS="${2:-}" + shift 2 + ;; + --expected-flux-branch) + EXPECTED_FLUX_BRANCH="${2:-}" + shift 2 + ;; + --skip-etcd-snapshot) + SKIP_ETCD_SNAPSHOT=1 + shift + ;; + --skip-drain) + SKIP_DRAIN=1 + shift + ;; + --skip-local-bootstrap) + SKIP_LOCAL_BOOTSTRAP=1 + shift + ;; + --skip-harbor-bootstrap) + SKIP_HARBOR_BOOTSTRAP=1 + shift + ;; + --force-flux-branch) + FORCE_FLUX_BRANCH="${2:-}" + shift 2 + ;; + --ups-host) + UPS_HOST="${2:-}" + shift 2 + ;; + --ups-battery-key) + UPS_BATTERY_KEY="${2:-}" + shift 2 + ;; + --min-startup-battery) + MIN_STARTUP_BATTERY="${2:-}" + shift 2 + ;; + --recovery-state-file) + RECOVERY_STATE_FILE="${2:-}" + shift 2 + ;; + --drain-timeout) + DRAIN_TIMEOUT_SECONDS="${2:-}" + shift 2 + ;; + --emergency-drain-timeout) + EMERGENCY_DRAIN_TIMEOUT_SECONDS="${2:-}" + shift 2 + ;; + --require-ups-battery) + REQUIRE_UPS_BATTERY=1 + shift + ;; + -h|--help) + usage + exit 0 + ;; + *) + echo "Unknown option: $1" >&2 + usage + exit 1 + ;; + esac +done + +require_cmd() { + local cmd="$1" + if ! command -v "${cmd}" >/dev/null 2>&1; then + echo "Missing required command: ${cmd}" >&2 + exit 1 + fi +} + +require_cmd kubectl +require_cmd ssh + +log() { echo "[cluster-power] $*"; } +warn() { echo "[cluster-power][warn] $*" >&2; } + +run() { + if [[ "${EXECUTE}" -eq 1 ]]; then + log "EXEC: $*" + "$@" + else + log "DRY-RUN: $*" + fi +} + +run_shell() { + if [[ "${EXECUTE}" -eq 1 ]]; then + log "EXEC: $*" + bash -lc "$*" + else + log "DRY-RUN: $*" + fi +} + +as_array_from_csv() { + local csv="$1" + local out_var="$2" + local old_ifs="${IFS}" + IFS=',' read -r -a _tmp <<< "${csv}" + IFS="${old_ifs}" + eval "${out_var}"'=( "${_tmp[@]}" )' +} + +ssh_target() { + local node="$1" + if [[ -n "${SSH_USER}" ]]; then + printf "%s@%s" "${SSH_USER}" "${node}" + else + printf "%s" "${node}" + fi +} + +discover_workers_csv() { + # Include every non-control-plane node by default (workers + accelerators). + kubectl get nodes \ + -o custom-columns=NAME:.metadata.name,CP:.metadata.labels.node-role\\.kubernetes\\.io/control-plane,MASTER:.metadata.labels.node-role\\.kubernetes\\.io/master \ + --no-headers \ + | awk '$2=="" && $3=="" {print $1}' \ + | paste -sd, - +} + +load_recovery_state() { + if [[ ! 
-f "${RECOVERY_STATE_FILE}" ]]; then + RECOVERY_PENDING=0 + STARTUP_ATTEMPTED_DURING_OUTAGE=0 + return 0 + fi + + while IFS='=' read -r key value; do + case "${key}" in + recovery_pending) + RECOVERY_PENDING="${value}" + ;; + startup_attempted) + STARTUP_ATTEMPTED_DURING_OUTAGE="${value}" + ;; + esac + done < "${RECOVERY_STATE_FILE}" +} + +save_recovery_state() { + mkdir -p "$(dirname "${RECOVERY_STATE_FILE}")" + cat > "${RECOVERY_STATE_FILE}" </dev/null 2>&1; then + return 1 + fi + local raw + raw="$(upsc "${UPS_HOST}" "${UPS_BATTERY_KEY}" 2>/dev/null || true)" + if [[ -z "${raw}" ]]; then + return 1 + fi + # battery.charge can include units/decimals in some setups; normalize. + raw="${raw%%.*}" + if ! [[ "${raw}" =~ ^[0-9]+$ ]]; then + return 1 + fi + echo "${raw}" +} + +ensure_minimum_battery_for_bootstrap() { + local battery + battery="$(read_ups_battery || true)" + if [[ -z "${battery}" ]]; then + if [[ "${REQUIRE_UPS_BATTERY}" -eq 1 ]]; then + warn "Unable to read UPS battery status and --require-ups-battery is set." + return 1 + fi + warn "Unable to read UPS battery status; continuing without hard battery gating." + return 0 + fi + + log "ups-battery=${battery}%" + if (( battery < MIN_STARTUP_BATTERY )); then + warn "UPS battery ${battery}% below minimum startup threshold ${MIN_STARTUP_BATTERY}%." + return 1 + fi + return 0 +} + +emergency_shutdown_after_outage() { + warn "Entering outage-aware emergency shutdown path due insufficient startup budget." + patch_flux_suspend_all true || true + best_effort_scale_down_apps + # Give the cluster one short chance to drain, then force progress. + best_effort_drain_workers "${EMERGENCY_DRAIN_TIMEOUT_SECONDS}" "${WORKER_NODES[@]}" + stop_workers_agents "${WORKER_NODES[@]}" + stop_control_planes "${CONTROL_PLANE_NODES[@]}" +} + +patch_flux_suspend_all() { + local value="$1" + local patch + patch=$(printf '{"spec":{"suspend":%s}}' "${value}") + + local ks_list hr_list + ks_list="$(kubectl -n flux-system get kustomizations.kustomize.toolkit.fluxcd.io -o jsonpath='{range .items[*]}{.metadata.name}{"\n"}{end}' || true)" + hr_list="$(kubectl get helmreleases.helm.toolkit.fluxcd.io -A -o jsonpath='{range .items[*]}{.metadata.namespace}{"/"}{.metadata.name}{"\n"}{end}' || true)" + + while IFS= read -r k; do + [[ -z "${k}" ]] && continue + run kubectl -n flux-system patch kustomization "${k}" --type=merge -p "${patch}" + done <<< "${ks_list}" + + while IFS= read -r hr; do + [[ -z "${hr}" ]] && continue + local ns="${hr%%/*}" + local name="${hr##*/}" + run kubectl -n "${ns}" patch helmrelease "${name}" --type=merge -p "${patch}" + done <<< "${hr_list}" +} + +report_flux_source_state() { + local flux_url flux_branch + flux_url="$(kubectl -n flux-system get gitrepository flux-system -o jsonpath='{.spec.url}' 2>/dev/null || true)" + flux_branch="$(kubectl -n flux-system get gitrepository flux-system -o jsonpath='{.spec.ref.branch}' 2>/dev/null || true)" + + if [[ -n "${flux_url}" ]]; then + log "flux-source-url=${flux_url}" + fi + if [[ -n "${flux_branch}" ]]; then + log "flux-source-branch=${flux_branch}" + if [[ "${MODE}" == "startup" && -z "${FORCE_FLUX_BRANCH}" && "${flux_branch}" != "${EXPECTED_FLUX_BRANCH}" ]]; then + warn "Flux source branch is '${flux_branch}'. Expected '${EXPECTED_FLUX_BRANCH}' for canonical cold-start recovery. Use --force-flux-branch ${EXPECTED_FLUX_BRANCH} if needed." 
+ fi + fi +} + +wait_for_api() { + local attempts="${1:-90}" + local sleep_s="${2:-2}" + if [[ "${EXECUTE}" -eq 0 ]]; then + log "DRY-RUN: skipping live Kubernetes API wait" + return 0 + fi + local i + for i in $(seq 1 "${attempts}"); do + if kubectl version --request-timeout=5s >/dev/null 2>&1; then + return 0 + fi + sleep "${sleep_s}" + done + return 1 +} + +best_effort_scale_down_apps() { + local excludes='^(kube-system|kube-public|kube-node-lease|flux-system|traefik|metallb-system|cert-manager|longhorn-system|vault|postgres|maintenance)$' + local ns_list + ns_list="$(kubectl get ns -o jsonpath='{range .items[*]}{.metadata.name}{"\n"}{end}')" + while IFS= read -r ns; do + [[ -z "${ns}" ]] && continue + if [[ "${ns}" =~ ${excludes} ]]; then + continue + fi + run_shell "kubectl -n ${ns} scale deployment --all --replicas=0 || true" + run_shell "kubectl -n ${ns} scale statefulset --all --replicas=0 || true" + done <<< "${ns_list}" +} + +best_effort_drain_workers() { + local timeout_seconds="$1" + shift || true + local workers=("$@") + local node + for node in "${workers[@]}"; do + [[ -z "${node}" ]] && continue + run kubectl cordon "${node}" + if run_shell "kubectl drain ${node} --ignore-daemonsets --delete-emptydir-data --grace-period=30 --timeout=${timeout_seconds}s"; then + continue + fi + warn "Gentle drain timed out for ${node}; retrying with --force." + if run_shell "kubectl drain ${node} --ignore-daemonsets --delete-emptydir-data --grace-period=30 --timeout=${timeout_seconds}s --force"; then + continue + fi + warn "Force drain timed out for ${node}; final attempt with --disable-eviction." + run_shell "kubectl drain ${node} --ignore-daemonsets --delete-emptydir-data --grace-period=30 --timeout=${timeout_seconds}s --force --disable-eviction || true" + done +} + +stop_workers_agents() { + local workers=("$@") + local node target + for node in "${workers[@]}"; do + [[ -z "${node}" ]] && continue + target="$(ssh_target "${node}")" + run ssh -o BatchMode=yes -o ConnectTimeout=8 "${target}" "sudo systemctl stop k3s-agent || true" + done +} + +start_workers_agents() { + local workers=("$@") + local node target + for node in "${workers[@]}"; do + [[ -z "${node}" ]] && continue + target="$(ssh_target "${node}")" + run ssh -o BatchMode=yes -o ConnectTimeout=8 "${target}" "sudo systemctl start k3s-agent || true" + done +} + +stop_control_planes() { + local cps=("$@") + local node target + for node in "${cps[@]}"; do + [[ -z "${node}" ]] && continue + target="$(ssh_target "${node}")" + run ssh -o BatchMode=yes -o ConnectTimeout=8 "${target}" "sudo systemctl stop k3s || true" + done +} + +start_control_planes() { + local cps=("$@") + local node target + for node in "${cps[@]}"; do + [[ -z "${node}" ]] && continue + target="$(ssh_target "${node}")" + run ssh -o BatchMode=yes -o ConnectTimeout=8 "${target}" "sudo systemctl start k3s || true" + done +} + +take_etcd_snapshot() { + local cp="$1" + local target + target="$(ssh_target "${cp}")" + local ts + ts="$(date +%Y%m%d-%H%M%S)" + run ssh -o BatchMode=yes -o ConnectTimeout=8 "${target}" \ + "sudo k3s etcd-snapshot save --name pre-shutdown-${ts}" +} + +bootstrap_local_minimal() { + # Local apply path to break Flux<->Gitea boot deadlock during cold-start recovery. + # Longhorn is applied before stateful workloads so astreae-backed PVCs can bind. 
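+  # The apply order below mirrors the Flux dependsOn chain for atlas +  # (core -> helm -> longhorn -> metallb -> traefik -> vault-csi -> vault-injector -> vault -> postgres -> gitea), +  # so this manual walk converges on the same state Flux reaches once its source is healthy again.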
+ run kubectl apply -k infrastructure/core + run kubectl apply -k infrastructure/sources/helm + run kubectl apply -k infrastructure/longhorn/core + run kubectl apply -k infrastructure/metallb + run kubectl apply -k infrastructure/traefik + run kubectl apply -k infrastructure/vault-csi + run kubectl apply -k infrastructure/vault-injector + run kubectl apply -k services/vault + run kubectl apply -k infrastructure/postgres + run kubectl apply -k services/gitea +} + +bootstrap_local_harbor() { + # Optional Harbor bootstrap stage for environments where Harbor is authoritative for images. + run kubectl apply -k services/harbor +} + +resume_flux_and_reconcile() { + patch_flux_suspend_all false + + if command -v flux >/dev/null 2>&1; then + run flux reconcile source git flux-system -n flux-system --timeout=3m + run flux reconcile kustomization core -n flux-system --with-source --timeout=5m + run flux reconcile kustomization helm -n flux-system --with-source --timeout=5m + run flux reconcile kustomization longhorn -n flux-system --with-source --timeout=15m + run flux reconcile kustomization metallb -n flux-system --with-source --timeout=5m + run flux reconcile kustomization traefik -n flux-system --with-source --timeout=5m + run flux reconcile kustomization vault-csi -n flux-system --with-source --timeout=5m + run flux reconcile kustomization vault-injector -n flux-system --with-source --timeout=5m + run flux reconcile kustomization vault -n flux-system --with-source --timeout=10m + run flux reconcile kustomization postgres -n flux-system --with-source --timeout=10m + run flux reconcile kustomization gitea -n flux-system --with-source --timeout=10m + run flux reconcile kustomization harbor -n flux-system --with-source --timeout=15m + else + local now + now="$(date --iso-8601=seconds)" + run kubectl -n flux-system annotate kustomizations.kustomize.toolkit.fluxcd.io --all reconcile.fluxcd.io/requestedAt="${now}" --overwrite + fi +} + +as_array_from_csv "${CONTROL_PLANES}" CONTROL_PLANE_NODES +if [[ -z "${WORKERS}" ]]; then + WORKERS="$(discover_workers_csv 2>/dev/null || true)" + if [[ -z "${WORKERS}" ]]; then + warn "Unable to auto-discover workers from the API; falling back to static atlas worker inventory." + WORKERS="${DEFAULT_WORKERS}" + fi +fi +as_array_from_csv "${WORKERS}" WORKER_NODES +load_recovery_state + +log "mode=${MODE} execute=${EXECUTE}" +log "control-planes=${CONTROL_PLANES}" +log "workers=${WORKERS}" +log "recovery-state-file=${RECOVERY_STATE_FILE}" +log "recovery_pending=${RECOVERY_PENDING} startup_attempted=${STARTUP_ATTEMPTED_DURING_OUTAGE}" +report_flux_source_state + +if [[ "${MODE}" == "shutdown" ]]; then + save_recovery_state 1 0 + if [[ "${SKIP_ETCD_SNAPSHOT}" -eq 0 ]]; then + take_etcd_snapshot "${CONTROL_PLANE_NODES[0]}" + else + warn "Skipping etcd snapshot by request." + fi + + patch_flux_suspend_all true + best_effort_scale_down_apps + + if [[ "${SKIP_DRAIN}" -eq 0 ]]; then + best_effort_drain_workers "${DRAIN_TIMEOUT_SECONDS}" "${WORKER_NODES[@]}" + else + warn "Skipping worker drain by request." + fi + + stop_workers_agents "${WORKER_NODES[@]}" + stop_control_planes "${CONTROL_PLANE_NODES[@]}" + + log "Shutdown flow complete." + exit 0 +fi + +# Startup mode +if [[ "${RECOVERY_PENDING}" -eq 1 ]]; then + if ! ensure_minimum_battery_for_bootstrap; then + if [[ "${STARTUP_ATTEMPTED_DURING_OUTAGE}" -eq 1 ]]; then + emergency_shutdown_after_outage + exit 1 + fi + warn "Startup deferred due to low battery after recent outage; marking for second-outage fallback."
+ save_recovery_state 1 1 + exit 1 + fi + save_recovery_state 1 1 +fi + +start_control_planes "${CONTROL_PLANE_NODES[@]}" +start_workers_agents "${WORKER_NODES[@]}" + +if ! wait_for_api 120 2; then + warn "Kubernetes API did not become reachable in time." + exit 1 +fi + +if [[ -n "${FORCE_FLUX_BRANCH}" ]]; then + run kubectl -n flux-system patch gitrepository flux-system --type=merge \ + -p "{\"spec\":{\"ref\":{\"branch\":\"${FORCE_FLUX_BRANCH}\"}}}" +fi + +if [[ "${SKIP_LOCAL_BOOTSTRAP}" -eq 0 ]]; then + # If source is not ready, bootstrap critical pieces from local checkout first. + if ! kubectl -n flux-system get gitrepository flux-system -o jsonpath='{.status.conditions[?(@.type=="Ready")].status}' 2>/dev/null | grep -q True; then + warn "Flux source not Ready; executing local bootstrap fallback path." + bootstrap_local_minimal + if [[ "${SKIP_HARBOR_BOOTSTRAP}" -eq 0 ]]; then + bootstrap_local_harbor + else + warn "Skipping Harbor bootstrap fallback by request." + fi + fi +else + warn "Skipping local bootstrap fallback by request." +fi + +resume_flux_and_reconcile +clear_recovery_state +log "Startup flow complete." diff --git a/services/harbor/helmrelease.yaml b/services/harbor/helmrelease.yaml index f1bfc176..cd4106ac 100644 --- a/services/harbor/helmrelease.yaml +++ b/services/harbor/helmrelease.yaml @@ -76,8 +76,8 @@ spec: type: internal internal: image: - repository: registry.bstein.dev/infra/harbor-redis - tag: v2.14.1-arm64 # {"$imagepolicy": "harbor:harbor-redis:tag"} + repository: goharbor/redis-photon + tag: v2.14.1 # {"$imagepolicy": "harbor:harbor-redis:tag"} nodeSelector: kubernetes.io/hostname: titan-05 affinity: @@ -112,8 +112,8 @@ spec: existingSecretSecretKey: harbor-core core: image: - repository: registry.bstein.dev/infra/harbor-core - tag: v2.14.1-arm64 # {"$imagepolicy": "harbor:harbor-core:tag"} + repository: goharbor/harbor-core + tag: v2.14.1 # {"$imagepolicy": "harbor:harbor-core:tag"} nodeSelector: kubernetes.io/hostname: titan-05 serviceAccountName: harbor-vault-sync @@ -173,8 +173,8 @@ spec: values: ["rpi4"] jobservice: image: - repository: registry.bstein.dev/infra/harbor-jobservice - tag: v2.14.1-arm64 # {"$imagepolicy": "harbor:harbor-jobservice:tag"} + repository: goharbor/harbor-jobservice + tag: v2.14.1 # {"$imagepolicy": "harbor:harbor-jobservice:tag"} nodeSelector: kubernetes.io/hostname: titan-05 serviceAccountName: harbor-vault-sync @@ -215,8 +215,8 @@ spec: values: ["rpi4"] portal: image: - repository: registry.bstein.dev/infra/harbor-portal - tag: v2.14.1-arm64 # {"$imagepolicy": "harbor:harbor-portal:tag"} + repository: goharbor/harbor-portal + tag: v2.14.1 # {"$imagepolicy": "harbor:harbor-portal:tag"} nodeSelector: kubernetes.io/hostname: titan-05 affinity: @@ -243,8 +243,8 @@ spec: registry: registry: image: - repository: registry.bstein.dev/infra/harbor-registry - tag: v2.14.1-arm64 # {"$imagepolicy": "harbor:harbor-registry:tag"} + repository: goharbor/registry-photon + tag: v2.14.1 # {"$imagepolicy": "harbor:harbor-registry:tag"} extraEnvVars: - name: REGISTRY_NOTIFICATIONS_ENDPOINTS_0_NAME value: harbor-core @@ -258,8 +258,8 @@ spec: value: 1s controller: image: - repository: registry.bstein.dev/infra/harbor-registryctl - tag: v2.14.1-arm64 # {"$imagepolicy": "harbor:harbor-registryctl:tag"} + repository: goharbor/harbor-registryctl + tag: v2.14.1 # {"$imagepolicy": "harbor:harbor-registryctl:tag"} serviceAccountName: harbor-vault-sync automountServiceAccountToken: true existingSecret: harbor-registry @@ -320,8 +320,8 @@ spec: values: 
["rpi4"] nginx: image: - repository: registry.bstein.dev/infra/harbor-nginx - tag: v2.14.1-arm64 # {"$imagepolicy": "harbor:harbor-nginx:tag"} + repository: goharbor/nginx-photon + tag: v2.14.1 # {"$imagepolicy": "harbor:harbor-nginx:tag"} nodeSelector: kubernetes.io/hostname: titan-05 affinity: @@ -347,8 +347,8 @@ spec: values: ["rpi4"] prepare: image: - repository: registry.bstein.dev/infra/harbor-prepare - tag: v2.14.1-arm64 # {"$imagepolicy": "harbor:harbor-prepare:tag"} + repository: goharbor/prepare + tag: v2.14.1 # {"$imagepolicy": "harbor:harbor-prepare:tag"} updateStrategy: type: Recreate postRenderers: diff --git a/services/harbor/image.yaml b/services/harbor/image.yaml index 2b258757..9b026b5a 100644 --- a/services/harbor/image.yaml +++ b/services/harbor/image.yaml @@ -5,7 +5,7 @@ metadata: name: harbor-core namespace: harbor spec: - image: registry.bstein.dev/infra/harbor-core + image: goharbor/harbor-core interval: 5m0s --- apiVersion: image.toolkit.fluxcd.io/v1beta2 @@ -17,11 +17,11 @@ spec: imageRepositoryRef: name: harbor-core filterTags: - pattern: '^v(?P\d+\.\d+\.\d+-arm64(\.\d+)?)$' + pattern: '^v(?P\d+\.\d+\.\d+)$' extract: '$version' policy: semver: - range: ">=2.14.0-0 <2.15.0-0" + range: ">=2.14.0 <2.15.0" --- apiVersion: image.toolkit.fluxcd.io/v1beta2 kind: ImageRepository @@ -29,7 +29,7 @@ metadata: name: harbor-jobservice namespace: harbor spec: - image: registry.bstein.dev/infra/harbor-jobservice + image: goharbor/harbor-jobservice interval: 5m0s --- apiVersion: image.toolkit.fluxcd.io/v1beta2 @@ -41,11 +41,11 @@ spec: imageRepositoryRef: name: harbor-jobservice filterTags: - pattern: '^v(?P\d+\.\d+\.\d+-arm64(\.\d+)?)$' + pattern: '^v(?P\d+\.\d+\.\d+)$' extract: '$version' policy: semver: - range: ">=2.14.0-0 <2.15.0-0" + range: ">=2.14.0 <2.15.0" --- apiVersion: image.toolkit.fluxcd.io/v1beta2 kind: ImageRepository @@ -53,7 +53,7 @@ metadata: name: harbor-portal namespace: harbor spec: - image: registry.bstein.dev/infra/harbor-portal + image: goharbor/harbor-portal interval: 5m0s --- apiVersion: image.toolkit.fluxcd.io/v1beta2 @@ -65,11 +65,11 @@ spec: imageRepositoryRef: name: harbor-portal filterTags: - pattern: '^v(?P\d+\.\d+\.\d+-arm64(\.\d+)?)$' + pattern: '^v(?P\d+\.\d+\.\d+)$' extract: '$version' policy: semver: - range: ">=2.14.0-0 <2.15.0-0" + range: ">=2.14.0 <2.15.0" --- apiVersion: image.toolkit.fluxcd.io/v1beta2 kind: ImageRepository @@ -77,7 +77,7 @@ metadata: name: harbor-registry namespace: harbor spec: - image: registry.bstein.dev/infra/harbor-registry + image: goharbor/registry-photon interval: 5m0s --- apiVersion: image.toolkit.fluxcd.io/v1beta2 @@ -89,11 +89,11 @@ spec: imageRepositoryRef: name: harbor-registry filterTags: - pattern: '^v(?P\d+\.\d+\.\d+-arm64(\.\d+)?)$' + pattern: '^v(?P\d+\.\d+\.\d+)$' extract: '$version' policy: semver: - range: ">=2.14.0-0 <2.15.0-0" + range: ">=2.14.0 <2.15.0" --- apiVersion: image.toolkit.fluxcd.io/v1beta2 kind: ImageRepository @@ -101,7 +101,7 @@ metadata: name: harbor-registryctl namespace: harbor spec: - image: registry.bstein.dev/infra/harbor-registryctl + image: goharbor/harbor-registryctl interval: 5m0s --- apiVersion: image.toolkit.fluxcd.io/v1beta2 @@ -113,11 +113,11 @@ spec: imageRepositoryRef: name: harbor-registryctl filterTags: - pattern: '^v(?P\d+\.\d+\.\d+-arm64(\.\d+)?)$' + pattern: '^v(?P\d+\.\d+\.\d+)$' extract: '$version' policy: semver: - range: ">=2.14.0-0 <2.15.0-0" + range: ">=2.14.0 <2.15.0" --- apiVersion: image.toolkit.fluxcd.io/v1beta2 kind: ImageRepository @@ -125,7 +125,7 @@ 
metadata: name: harbor-redis namespace: harbor spec: - image: registry.bstein.dev/infra/harbor-redis + image: goharbor/redis-photon interval: 5m0s --- apiVersion: image.toolkit.fluxcd.io/v1beta2 @@ -137,11 +137,11 @@ spec: imageRepositoryRef: name: harbor-redis filterTags: - pattern: '^v(?P<version>\d+\.\d+\.\d+-arm64(\.\d+)?)$' + pattern: '^v(?P<version>\d+\.\d+\.\d+)$' extract: '$version' policy: semver: - range: ">=2.14.0-0 <2.15.0-0" + range: ">=2.14.0 <2.15.0" --- apiVersion: image.toolkit.fluxcd.io/v1beta2 kind: ImageRepository @@ -149,7 +149,7 @@ metadata: name: harbor-nginx namespace: harbor spec: - image: registry.bstein.dev/infra/harbor-nginx + image: goharbor/nginx-photon interval: 5m0s --- apiVersion: image.toolkit.fluxcd.io/v1beta2 @@ -161,11 +161,11 @@ spec: imageRepositoryRef: name: harbor-nginx filterTags: - pattern: '^v(?P<version>\d+\.\d+\.\d+-arm64(\.\d+)?)$' + pattern: '^v(?P<version>\d+\.\d+\.\d+)$' extract: '$version' policy: semver: - range: ">=2.14.0-0 <2.15.0-0" + range: ">=2.14.0 <2.15.0" --- apiVersion: image.toolkit.fluxcd.io/v1beta2 kind: ImageRepository @@ -173,7 +173,7 @@ metadata: name: harbor-prepare namespace: harbor spec: - image: registry.bstein.dev/infra/harbor-prepare + image: goharbor/prepare interval: 5m0s --- apiVersion: image.toolkit.fluxcd.io/v1beta2 @@ -185,8 +185,8 @@ spec: imageRepositoryRef: name: harbor-prepare filterTags: - pattern: '^v(?P<version>\d+\.\d+\.\d+-arm64(\.\d+)?)$' + pattern: '^v(?P<version>\d+\.\d+\.\d+)$' extract: '$version' policy: semver: - range: ">=2.14.0-0 <2.15.0-0" + range: ">=2.14.0 <2.15.0" diff --git a/services/pegasus/deployment.yaml b/services/pegasus/deployment.yaml index b6a1639e..c61dd1cf 100644 --- a/services/pegasus/deployment.yaml +++ b/services/pegasus/deployment.yaml @@ -73,7 +73,7 @@ spec: containers: - name: pegasus image: registry.bstein.dev/streaming/pegasus-vault:1.2.32 # {"$imagepolicy": "jellyfin:pegasus:tag"} - imagePullPolicy: Always + imagePullPolicy: IfNotPresent env: - name: PEGASUS_MEDIA_ROOT valueFrom: { configMapKeyRef: { name: pegasus-config, key: PEGASUS_MEDIA_ROOT } }
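The pegasus change follows the same cold-boot principle as the Harbor image moves above: with the tag pinned, `imagePullPolicy: IfNotPresent` lets a node restart the pod from its local containerd cache even while `registry.bstein.dev` is still down. A pre-shutdown spot check of that cache might look like the sketch below; the node name is hypothetical, and `crictl` is assumed to be available as shipped with k3s.

```bash
# Hypothetical node name: substitute wherever the pegasus pod is scheduled.
NODE=titan-12

# Confirm the pinned tag is already present in the node's containerd cache.
ssh "${NODE}" "sudo crictl images | grep pegasus-vault"

# Optionally warm the cache while the registry is still reachable.
ssh "${NODE}" "sudo crictl pull registry.bstein.dev/streaming/pegasus-vault:1.2.32"
```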