ananke/scripts/install.sh

963 lines
35 KiB
Bash
Raw Permalink Normal View History

#!/usr/bin/env bash
# Strict mode: abort on unhandled errors, unset variables and failing pipeline stages.
set -euo pipefail

# Refuse to continue unless the effective user is root.
if (( EUID != 0 )); then
  printf '%s\n' "Run as root: sudo ./scripts/install.sh" >&2
  exit 1
fi
# --- Locations ---------------------------------------------------------------
# REPO_DIR is the parent of the directory that holds this script.
REPO_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")/.." && pwd)"
BIN_DIR="/usr/local/bin"
LIB_DIR="/usr/local/lib/ananke"
CONF_DIR="/etc/ananke"
STATE_DIR="/var/lib/ananke"
SYSTEMD_DIR="/etc/systemd/system"

# --- Flags controlled from the command line ----------------------------------
START_NOW=1     # cleared by --no-start
INSTALL_DEPS=1  # cleared by --skip-deps

# --- Settings overridable through ANANKE_* environment variables -------------
ENABLE_BOOTSTRAP="${ANANKE_ENABLE_BOOTSTRAP:-auto}"
ENFORCE_QUALITY_GATE="${ANANKE_ENFORCE_QUALITY_GATE:-1}"
FORCE_CONFIG_TEMPLATE="${ANANKE_FORCE_CONFIG_TEMPLATE:-}"
MANAGE_NUT="${ANANKE_MANAGE_NUT:-1}"
NUT_UPS_NAME="${ANANKE_NUT_UPS_NAME:-}"
NUT_VENDOR_ID="${ANANKE_NUT_VENDOR_ID:-0764}"
NUT_PRODUCT_ID="${ANANKE_NUT_PRODUCT_ID:-0601}"
NUT_MONITOR_USER="${ANANKE_NUT_MONITOR_USER:-monuser}"
NUT_MONITOR_PASSWORD="${ANANKE_NUT_MONITOR_PASSWORD:-anankeupsmon}"

# --- Command-line parsing: every argument must be a known flag ----------------
while (( $# > 0 )); do
  case "$1" in
    --no-start) START_NOW=0 ;;
    --skip-deps) INSTALL_DEPS=0 ;;
    *)
      echo "Unknown argument: $1" >&2
      exit 1
      ;;
  esac
  shift
done
#######################################
# Decide which NUT UPS name the installer configures.
# Precedence: an already-set NUT_UPS_NAME, then the first
# "target: <name>@localhost" entry found in ananke.yaml, then "pyrphoros".
# Globals:   CONF_DIR (read), NUT_UPS_NAME (read, written)
# Outputs:   one progress line on stdout when the name is inferred or defaulted
# Returns:   0
#######################################
resolve_nut_ups_name() {
  if [[ -n "${NUT_UPS_NAME}" ]]; then
    return 0
  fi

  local cfg="${CONF_DIR}/ananke.yaml"
  if [[ -f "${cfg}" ]]; then
    local target=""
    # `|| true` is required: with `set -o pipefail` a grep that finds nothing
    # (or is cut short by head) makes the whole assignment fail, which would
    # abort the installer under `set -e` instead of falling back to the default.
    target="$(grep -Eo 'target:[[:space:]]*[A-Za-z0-9._-]+@localhost' "${cfg}" | head -n 1 || true)"
    # Drop the key and any whitespace after it; this also handles
    # "target:name@localhost" written without a space.
    target="${target#target:}"
    target="${target#"${target%%[![:space:]]*}"}"
    if [[ -n "${target}" ]]; then
      NUT_UPS_NAME="${target%@localhost}"
      echo "[install] inferred NUT UPS name from config: ${NUT_UPS_NAME}"
      return 0
    fi
  fi

  NUT_UPS_NAME="pyrphoros"
  echo "[install] defaulting NUT UPS name to ${NUT_UPS_NAME}"
}
#######################################
# Print the role recorded in ananke.yaml.
# Uses the second whitespace-separated field of the first line that starts
# (after optional indentation) with "role:"; prints "coordinator" when the
# file is missing or no value is found.
# Globals:   CONF_DIR (read)
# Outputs:   the role on stdout
#######################################
read_ananke_role() {
  local cfg="${CONF_DIR}/ananke.yaml"
  local detected=""

  if [[ -f "${cfg}" ]]; then
    detected="$(awk '/^[[:space:]]*role:[[:space:]]*/ {print $2; exit}' "${cfg}" 2>/dev/null || true)"
  fi

  echo "${detected:-coordinator}"
}
#######################################
# Print the value of the first line in ananke.yaml whose text before the
# first ":" equals the requested key exactly (so indented keys do not match).
# Prints nothing when the key or the file is missing.
# Globals:   CONF_DIR (read)
# Arguments: $1 - key name
# Outputs:   the value on stdout
#######################################
migration_yaml_lookup() {
  local wanted="$1"
  local cfg="${CONF_DIR}/ananke.yaml"

  awk -F': *' -v k="${wanted}" '$1 == k {print $2; exit}' "${cfg}" 2>/dev/null \
    || true
}
#######################################
# Print the first list item that follows a top-level "control_planes:" line
# in ananke.yaml. Scanning of a list stops at the next line that starts in
# column 0 with a non-space character. Prints nothing when no item is found
# or the file cannot be read.
# Globals:   CONF_DIR (read)
# Outputs:   the item text (list marker removed) on stdout
#######################################
first_control_plane_name() {
  local cfg="${CONF_DIR}/ananke.yaml"

  awk '
    /^control_planes:[[:space:]]*$/ { inside = 1; next }
    !inside { next }
    /^[[:space:]]*-[[:space:]]*/ {
      sub(/^[[:space:]]*-[[:space:]]*/, "")
      print
      exit
    }
    /^[^[:space:]]/ { inside = 0 }
  ' "${cfg}" 2>/dev/null || true
}
#######################################
# Print the value mapped to an indented "<node>:" entry in ananke.yaml.
# The text before the first ":" must equal the indentation prefix used in
# the awk comparison followed by the node name. Prints nothing when there
# is no such entry or the file cannot be read.
# Globals:   CONF_DIR (read)
# Arguments: $1 - node name
# Outputs:   the mapped value on stdout
#######################################
lookup_node_host() {
  local wanted="$1"
  local cfg="${CONF_DIR}/ananke.yaml"

  awk -F': *' -v n="${wanted}" '$1 == " " n {print $2; exit}' "${cfg}" 2>/dev/null \
    || true
}
#######################################
# Make sure a kubeconfig that `kubectl version` accepts exists at the path
# configured by the top-level "kubeconfig" key (default /etc/ananke/kubeconfig).
# Sources are tried in order:
#   1. the existing file, if non-empty and accepted by kubectl;
#   2. a copy of the local /etc/rancher/k3s/k3s.yaml;
#   3. /etc/rancher/k3s/k3s.yaml fetched over ssh from the first control
#      plane listed in ananke.yaml, with the 127.0.0.1 server address
#      rewritten to that control plane's host.
# Failure to obtain a validated kubeconfig is reported as a warning only.
# Globals:   CONF_DIR (read, through the lookup helpers)
# Outputs:   progress and warning lines on stdout
# Returns:   0
#######################################
ensure_ananke_kubeconfig() {
  local kubeconfig_path
  kubeconfig_path="$(migration_yaml_lookup "kubeconfig")"
  if [[ -z "${kubeconfig_path}" ]]; then
    kubeconfig_path="/etc/ananke/kubeconfig"
  fi
  install -d -m 0750 "$(dirname "${kubeconfig_path}")"

  # 1. Existing kubeconfig already works.
  if [[ -s "${kubeconfig_path}" ]] && KUBECONFIG="${kubeconfig_path}" kubectl version --request-timeout=5s >/dev/null 2>&1; then
    return 0
  fi

  # 2. This host has a local k3s kubeconfig.
  if [[ -r /etc/rancher/k3s/k3s.yaml ]]; then
    install -m 0600 /etc/rancher/k3s/k3s.yaml "${kubeconfig_path}"
    echo "[install] refreshed kubeconfig from local /etc/rancher/k3s/k3s.yaml"
    if KUBECONFIG="${kubeconfig_path}" kubectl version --request-timeout=5s >/dev/null 2>&1; then
      return 0
    fi
  fi

  # 3. Fetch it from the first control plane over ssh.
  local cp_name cp_host ssh_user ssh_port ssh_cfg ssh_key
  cp_name="$(first_control_plane_name)"
  if [[ -z "${cp_name}" ]]; then
    echo "[install] warning: cannot infer control plane name; kubeconfig bootstrap skipped"
    return 0
  fi
  cp_host="$(lookup_node_host "${cp_name}")"
  if [[ -z "${cp_host}" ]]; then
    # No host mapping: use the node name itself as the address.
    cp_host="${cp_name}"
  fi
  ssh_user="$(migration_yaml_lookup "ssh_user")"
  ssh_port="$(migration_yaml_lookup "ssh_port")"
  ssh_cfg="$(migration_yaml_lookup "ssh_config_file")"
  ssh_key="$(migration_yaml_lookup "ssh_identity_file")"
  if [[ -z "${ssh_port}" ]]; then
    ssh_port="2277"
  fi

  local target="${cp_host}"
  if [[ -n "${ssh_user}" ]]; then
    target="${ssh_user}@${cp_host}"
  fi

  # ssh_port always has a value at this point, so -p is passed unconditionally.
  local ssh_args=(
    -o BatchMode=yes
    -o ConnectTimeout=8
    -o StrictHostKeyChecking=accept-new
    -p "${ssh_port}"
  )
  if [[ -n "${ssh_cfg}" && -f "${ssh_cfg}" ]]; then
    ssh_args+=(-F "${ssh_cfg}")
  fi
  if [[ -n "${ssh_key}" && -f "${ssh_key}" ]]; then
    ssh_args+=(-i "${ssh_key}")
  fi

  local remote_cfg
  if remote_cfg="$(ssh "${ssh_args[@]}" "${target}" "sudo cat /etc/rancher/k3s/k3s.yaml" 2>/dev/null)"; then
    # The kubeconfig carries cluster credentials: create/truncate the file and
    # restrict it to 0600 BEFORE the content is written, so it is never
    # readable by other users, not even briefly.
    (
      umask 077
      : > "${kubeconfig_path}"
    )
    chmod 0600 "${kubeconfig_path}"
    printf '%s\n' "${remote_cfg}" > "${kubeconfig_path}"
    # k3s writes a loopback server URL; point it at the control plane instead.
    sed -Ei "s#server:[[:space:]]*https://127\\.0\\.0\\.1:6443#server: https://${cp_host}:6443#g" "${kubeconfig_path}" || true
    chmod 0600 "${kubeconfig_path}"
    echo "[install] bootstrapped kubeconfig from control plane ${cp_name} (${cp_host})"
    if KUBECONFIG="${kubeconfig_path}" kubectl version --request-timeout=5s >/dev/null 2>&1; then
      return 0
    fi
  else
    echo "[install] warning: failed to fetch kubeconfig from ${cp_name} (${cp_host})"
  fi

  echo "[install] warning: kubeconfig at ${kubeconfig_path} is still not validated; local startup fallback may fail"
}
#######################################
# Make sure the SSH identity named by the top-level "ssh_identity_file" key
# (default /home/atlas/.ssh/id_ed25519) exists, generating an ed25519 key
# with an empty passphrase when the file is missing or empty.
# The owner is derived from the path: /home/<user>/... belongs to <user>,
# anything else to root. If that user does not exist the step is skipped
# with a warning.
# Globals:   CONF_DIR (read, through migration_yaml_lookup)
# Outputs:   progress and warning lines on stdout
# Returns:   0 unless a filesystem or ssh-keygen step fails
#######################################
ensure_ananke_ssh_identity() {
  local key_path key_dir key_user key_group key_comment
  key_path="$(migration_yaml_lookup "ssh_identity_file")"
  if [[ -z "${key_path}" ]]; then
    key_path="/home/atlas/.ssh/id_ed25519"
  fi
  key_dir="$(dirname "${key_path}")"
  key_comment="ananke-$(hostname)-forward"

  key_user="root"
  if [[ "${key_path}" == /home/*/* ]]; then
    # /home/<user>/...  ->  <user>
    key_user="${key_path#/home/}"
    key_user="${key_user%%/*}"
  fi
  if ! id "${key_user}" >/dev/null 2>&1; then
    echo "[install] warning: ssh identity owner ${key_user} does not exist; skipping key bootstrap for ${key_path}"
    return 0
  fi

  # Use the account's real primary group. A group named after the user is not
  # guaranteed to exist, and `install -g` fails on an unknown group.
  key_group="$(id -gn "${key_user}" 2>/dev/null || true)"
  if [[ -z "${key_group}" ]]; then
    key_group="${key_user}"
  fi

  install -d -m 0700 -o "${key_user}" -g "${key_group}" "${key_dir}"

  if [[ ! -s "${key_path}" ]]; then
    echo "[install] generating missing SSH identity at ${key_path}"
    if [[ "${key_user}" == "root" ]]; then
      ssh-keygen -q -t ed25519 -N '' -C "${key_comment}" -f "${key_path}"
    else
      # Generate as the owning user so the files are created with its ownership.
      runuser -u "${key_user}" -- ssh-keygen -q -t ed25519 -N '' -C "${key_comment}" -f "${key_path}"
    fi
  fi

  # Best-effort normalisation of ownership and modes (the .pub file may be absent).
  chown "${key_user}:${key_group}" "${key_path}" "${key_path}.pub" 2>/dev/null || true
  chmod 0600 "${key_path}" || true
  chmod 0644 "${key_path}.pub" || true
}
#######################################
# Upgrade an existing ananke.yaml in place to the current config layout.
# Each step is guarded by grep checks so it only fires when the old shape is
# present, and every applied step prints one "[install] ..." line.
# The steps run in a fixed order because later steps anchor on lines that
# earlier steps insert (for example the storage defaults are appended after
# etcd_restore_control_plane, which the startup step adds).
# Does nothing when the config file does not exist.
# Globals:   CONF_DIR (read)
# Outputs:   progress lines on stdout
# Returns:   0 unless a sed/perl/grep invocation fails under `set -e`
#######################################
migrate_ananke_config() {
  local cfg="${CONF_DIR}/ananke.yaml"
  if [[ ! -f "${cfg}" ]]; then
    return 0
  fi
  local changed=0
  local role_hint
  role_hint="$(read_ananke_role)"

  # --- Tunable defaults that were raised ------------------------------------
  if grep -Eq 'default_budget_seconds:[[:space:]]*300' "${cfg}"; then
    sed -Ei 's/(default_budget_seconds:[[:space:]]*)300/\11380/' "${cfg}"
    echo "[install] migrated default_budget_seconds 300 -> 1380 in ${cfg}"
    changed=1
  fi
  if grep -Eq 'runtime_safety_factor:[[:space:]]*1\.10' "${cfg}"; then
    sed -Ei 's/(runtime_safety_factor:[[:space:]]*)1\.10/\11.25/' "${cfg}"
    echo "[install] migrated runtime_safety_factor 1.10 -> 1.25 in ${cfg}"
    changed=1
  fi

  # --- ssh_node_users: titan-24 override tethys -> atlas ---------------------
  if grep -Eq '^ssh_node_users:[[:space:]]*$' "${cfg}" \
    && grep -Eq '^ titan-24:[[:space:]]*tethys[[:space:]]*$' "${cfg}"; then
    sed -Ei 's/^ titan-24:[[:space:]]*tethys[[:space:]]*$/ titan-24: atlas/' "${cfg}"
    echo "[install] migrated ssh_node_users titan-24 override to atlas"
    changed=1
  fi

  # --- coordination.startup_guard_max_age_seconds ---------------------------
  if grep -Eq '^ command_timeout_seconds:[[:space:]]*[0-9]+' "${cfg}" \
    && ! grep -Eq '^ startup_guard_max_age_seconds:[[:space:]]*[0-9]+' "${cfg}"; then
    sed -Ei '/^ command_timeout_seconds:[[:space:]]*[0-9]+/a\ startup_guard_max_age_seconds: 900' "${cfg}"
    echo "[install] added coordination.startup_guard_max_age_seconds=900"
    changed=1
  fi

  # --- Drop deprecated host-poweroff keys ------------------------------------
  if grep -Eq '^[[:space:]]*poweroff_enabled:[[:space:]]*(true|false)' "${cfg}"; then
    sed -Ei \
      -e '/^[[:space:]]*poweroff_enabled:[[:space:]]*(true|false)/d' \
      -e '/^[[:space:]]*poweroff_delay_seconds:[[:space:]]*[0-9]+/d' \
      -e '/^[[:space:]]*poweroff_local_host:[[:space:]]*(true|false)/d' \
      -e '/^[[:space:]]*extra_poweroff_hosts:[[:space:]]*(\[\])?[[:space:]]*$/d' \
      "${cfg}"
    echo "[install] removed deprecated host-poweroff shutdown config keys"
    changed=1
  fi

  # --- Startup node inventory reachability gate ------------------------------
  if grep -Eq '^ minimum_battery_percent:[[:space:]]*[0-9.]+' "${cfg}" \
    && ! grep -Eq '^ require_node_inventory_reachability:[[:space:]]*(true|false)' "${cfg}"; then
    sed -Ei '/^ minimum_battery_percent:[[:space:]]*[0-9.]+/a\ require_node_inventory_reachability: true\n node_inventory_reachability_wait_seconds: 300\n node_inventory_reachability_poll_seconds: 5' "${cfg}"
    echo "[install] added startup node inventory reachability gate defaults"
    changed=1
  fi

  # --- state.reports_dir ------------------------------------------------------
  if grep -Eq '^state:[[:space:]]*$' "${cfg}" \
    && ! grep -Eq '^ reports_dir:[[:space:]]*/var/lib/ananke/reports' "${cfg}"; then
    sed -Ei '/^ dir:[[:space:]]*\/var\/lib\/ananke$/a\ reports_dir: /var/lib/ananke/reports' "${cfg}"
    echo "[install] added state.reports_dir default"
    changed=1
  fi

  # --- coordination.peer_hosts ------------------------------------------------
  # Peers inherit forward_shutdown_host, coordinators with a titan-24 address
  # get titan-24, everything else gets an empty list.
  if ! grep -Eq '^ peer_hosts:' "${cfg}"; then
    if [[ "${role_hint}" == "peer" ]] && grep -Eq '^ forward_shutdown_host:[[:space:]]*[A-Za-z0-9._-]+' "${cfg}"; then
      local peer_host
      peer_host="$(awk -F': *' '/^ forward_shutdown_host:[[:space:]]*/ {print $2; exit}' "${cfg}" 2>/dev/null || true)"
      if [[ -n "${peer_host}" ]]; then
        sed -Ei '/^ forward_shutdown_config:[[:space:]]*.*$/a\ peer_hosts:\n - '"${peer_host}"'' "${cfg}"
        echo "[install] added coordination.peer_hosts from forward_shutdown_host (${peer_host})"
        changed=1
      fi
    elif [[ "${role_hint}" == "coordinator" ]] && grep -Eq '^ titan-24:[[:space:]]*[0-9]+\.[0-9]+\.[0-9]+\.[0-9]+' "${cfg}"; then
      sed -Ei '/^ forward_shutdown_config:[[:space:]]*.*$/a\ peer_hosts:\n - titan-24' "${cfg}"
      echo "[install] added coordination.peer_hosts default (titan-24) for coordinator role"
      changed=1
    else
      sed -Ei '/^ forward_shutdown_config:[[:space:]]*.*$/a\ peer_hosts: []' "${cfg}"
      echo "[install] added coordination.peer_hosts empty default"
      changed=1
    fi
  fi

  # --- Startup defaults anchored on api_poll_seconds --------------------------
  local default_restore_cp
  default_restore_cp="$(first_control_plane_name)"
  if [[ -z "${default_restore_cp}" ]]; then
    default_restore_cp="titan-0a"
  fi
  # This step also inserts require_time_sync, so the next step is then a no-op.
  if grep -Eq '^ api_poll_seconds:[[:space:]]*[0-9]+' "${cfg}" \
    && ! grep -Eq '^ auto_etcd_restore_on_api_failure:[[:space:]]*(true|false)' "${cfg}"; then
    sed -Ei '/^ api_poll_seconds:[[:space:]]*[0-9]+/a\ require_time_sync: true\n time_sync_wait_seconds: 240\n time_sync_poll_seconds: 5\n reconcile_access_on_boot: true\n auto_etcd_restore_on_api_failure: true\n etcd_restore_control_plane: '"${default_restore_cp}"'' "${cfg}"
    echo "[install] added startup.auto_etcd_restore_on_api_failure + startup.etcd_restore_control_plane defaults"
    changed=1
  fi
  if grep -Eq '^ api_poll_seconds:[[:space:]]*[0-9]+' "${cfg}" \
    && ! grep -Eq '^ require_time_sync:[[:space:]]*(true|false)' "${cfg}"; then
    sed -Ei '/^ api_poll_seconds:[[:space:]]*[0-9]+/a\ require_time_sync: true\n time_sync_wait_seconds: 240\n time_sync_poll_seconds: 5\n reconcile_access_on_boot: true' "${cfg}"
    echo "[install] added startup time sync + access reconciliation defaults"
    changed=1
  fi
  if grep -Eq '^ time_sync_poll_seconds:[[:space:]]*[0-9]+' "${cfg}" \
    && ! grep -Eq '^ time_sync_mode:[[:space:]]*(strict|quorum)' "${cfg}"; then
    sed -Ei '/^ time_sync_poll_seconds:[[:space:]]*[0-9]+/a\ time_sync_mode: quorum\n time_sync_quorum: 2' "${cfg}"
    echo "[install] added startup time sync quorum defaults"
    changed=1
  fi

  # --- Startup storage readiness ---------------------------------------------
  if grep -Eq '^ etcd_restore_control_plane:[[:space:]]*[A-Za-z0-9._-]+' "${cfg}" \
    && ! grep -Eq '^ require_storage_ready:[[:space:]]*(true|false)' "${cfg}"; then
    sed -Ei '/^ etcd_restore_control_plane:[[:space:]]*[A-Za-z0-9._-]+/a\ require_storage_ready: true\n storage_ready_wait_seconds: 420\n storage_ready_poll_seconds: 5\n storage_min_ready_nodes: 2\n storage_critical_pvcs:\n - vault/data-vault-0\n - postgres/postgres-data-postgres-0\n - gitea/gitea-data\n - sso/keycloak-data' "${cfg}"
    echo "[install] added startup storage readiness defaults"
    changed=1
  fi

  # --- Post-start probes, service checklist, vault key file --------------------
  if grep -Eq '^ storage_critical_pvcs:[[:space:]]*$' "${cfg}" \
    && ! grep -Eq '^ require_post_start_probes:[[:space:]]*(true|false)' "${cfg}"; then
    sed -Ei '/^ - sso\/keycloak-data$/a\ require_post_start_probes: true\n post_start_probe_wait_seconds: 240\n post_start_probe_poll_seconds: 5\n post_start_probes:\n - https://scm.bstein.dev/api/healthz\n - https://metrics.bstein.dev/api/health\n require_service_checklist: true\n service_checklist_wait_seconds: 420\n service_checklist_poll_seconds: 5\n service_checklist_stability_seconds: 120\n service_checklist:\n - name: gitea-api\n url: https://scm.bstein.dev/api/healthz\n accepted_statuses: [200]\n body_contains: pass\n timeout_seconds: 12\n - name: grafana-api\n url: https://metrics.bstein.dev/api/health\n accepted_statuses: [200]\n body_contains: '\''\"database\":\"ok\"'\''\n timeout_seconds: 12\n vault_unseal_key_file: /var/lib/ananke/vault-unseal.key' "${cfg}"
    echo "[install] added startup post-start probe + vault key fallback defaults"
    changed=1
  fi
  if grep -Eq '^ - https://sso.bstein.dev/realms/atlas/.well-known/openid-configuration$' "${cfg}"; then
    sed -Ei '/^ - https:\/\/sso\.bstein\.dev\/realms\/atlas\/\.well-known\/openid-configuration$/d' "${cfg}"
    echo "[install] removed sso OIDC probe from startup.post_start_probes (returns 404 in current deployment)"
    changed=1
  fi
  if ! grep -Eq '^ vault_unseal_key_file:[[:space:]]*/var/lib/ananke/vault-unseal.key' "${cfg}"; then
    if grep -Eq '^startup:[[:space:]]*$' "${cfg}" && grep -Eq '^ post_start_probes:[[:space:]]*$' "${cfg}"; then
      sed -Ei '/^ - https:\/\/metrics\.bstein\.dev\/api\/health$/a\ vault_unseal_key_file: /var/lib/ananke/vault-unseal.key' "${cfg}"
      echo "[install] added startup.vault_unseal_key_file default"
      changed=1
    fi
  fi
  if ! grep -Eq '^ vault_unseal_breakglass_timeout_seconds:[[:space:]]*[0-9]+' "${cfg}"; then
    if grep -Eq '^ vault_unseal_key_file:[[:space:]]*/var/lib/ananke/vault-unseal.key' "${cfg}"; then
      sed -Ei '/^ vault_unseal_key_file:[[:space:]]*\/var\/lib\/ananke\/vault-unseal.key$/a\ vault_unseal_breakglass_command: ""\n vault_unseal_breakglass_timeout_seconds: 15' "${cfg}"
      echo "[install] added startup break-glass fallback defaults"
      changed=1
    fi
  fi

  # --- Inventory hydration ----------------------------------------------------
  local role
  role="$(read_ananke_role)"
  # Both blocks must start out as empty strings: a bare `local name` leaves the
  # variable unset, and the `-n` checks below would abort under `set -u` for
  # any role other than coordinator/peer.
  local inventory_block=""
  local managed_block=""
  local workers_block
  # The multi-line literals below are kept at column 0 on purpose: their
  # leading whitespace is part of the text written into the config.
  workers_block='workers:
- titan-04
- titan-05
- titan-06
- titan-07
- titan-08
- titan-09
- titan-10
- titan-11
- titan-12
- titan-13
- titan-14
- titan-15
- titan-17
- titan-18
- titan-19
- titan-20
- titan-21
- titan-22
- titan-24'
  # Coordinator and peer hosts receive the same inventory and managed-node list.
  if [[ "${role}" == "coordinator" || "${role}" == "peer" ]]; then
    inventory_block='ssh_node_hosts:
titan-db: 192.168.22.10
titan-0a: 192.168.22.11
titan-0b: 192.168.22.12
titan-0c: 192.168.22.13
titan-04: 192.168.22.30
titan-05: 192.168.22.31
titan-06: 192.168.22.32
titan-07: 192.168.22.33
titan-08: 192.168.22.34
titan-09: 192.168.22.35
titan-10: 192.168.22.36
titan-11: 192.168.22.37
titan-12: 192.168.22.40
titan-13: 192.168.22.41
titan-14: 192.168.22.42
titan-15: 192.168.22.43
titan-17: 192.168.22.45
titan-18: 192.168.22.46
titan-19: 192.168.22.47
titan-20: 192.168.22.20
titan-21: 192.168.22.21
titan-22: 192.168.22.22
titan-24: 192.168.22.26'
    managed_block='ssh_managed_nodes:
- titan-db
- titan-0a
- titan-0b
- titan-0c
- titan-04
- titan-05
- titan-06
- titan-07
- titan-08
- titan-09
- titan-10
- titan-11
- titan-12
- titan-13
- titan-14
- titan-15
- titan-17
- titan-18
- titan-19
- titan-20
- titan-21
- titan-22
- titan-24'
  fi

  # Replace empty placeholders ({} / []) with the full blocks.
  if [[ -n "${inventory_block}" ]]; then
    if grep -Eq '^ssh_node_hosts:[[:space:]]*\{\}[[:space:]]*$' "${cfg}"; then
      perl -0pi -e 's#ssh_node_hosts:\s*\{\}\n#'"${inventory_block}"'\n#s' "${cfg}"
      echo "[install] hydrated ssh_node_hosts inventory for role=${role}"
      changed=1
    fi
  fi
  if grep -Eq '^workers:[[:space:]]*\[\][[:space:]]*$' "${cfg}"; then
    perl -0pi -e 's#workers:\s*\[\]\n#'"${workers_block}"'\n#s' "${cfg}"
    echo "[install] hydrated workers inventory for startup/shutdown orchestration"
    changed=1
  fi
  if [[ -n "${managed_block}" ]]; then
    if grep -Eq '^ssh_managed_nodes:[[:space:]]*\[\][[:space:]]*$' "${cfg}"; then
      perl -0pi -e 's#ssh_managed_nodes:\s*\[\]\n#'"${managed_block}"'\n#s' "${cfg}"
      echo "[install] hydrated ssh_managed_nodes inventory for role=${role}"
      changed=1
    fi
    # titan-04 and titan-21 act as markers for an incomplete managed-node list.
    if ! grep -Eq '^ - titan-04$' "${cfg}" || ! grep -Eq '^ - titan-21$' "${cfg}"; then
      perl -0pi -e 's#ssh_managed_nodes:\n(?: - [^\n]*\n)*#'"${managed_block}"'\n#s' "${cfg}"
      echo "[install] refreshed ssh_managed_nodes coverage for role=${role}"
      changed=1
    fi
  fi

  # --- Peer-only expansions ---------------------------------------------------
  if [[ "${role}" == "peer" ]]; then
    if grep -Eq '^ssh_managed_nodes:[[:space:]]*$' "${cfg}" \
      && grep -Eq '^ - titan-db$' "${cfg}" \
      && grep -Eq '^ - titan-24$' "${cfg}" \
      && ! grep -Eq '^ - titan-0a$' "${cfg}"; then
      perl -0pi -e 's#ssh_managed_nodes:\n - titan-db\n - titan-24\n#ssh_managed_nodes:\n - titan-db\n - titan-0a\n - titan-0b\n - titan-0c\n - titan-04\n - titan-05\n - titan-06\n - titan-07\n - titan-08\n - titan-09\n - titan-10\n - titan-11\n - titan-12\n - titan-13\n - titan-14\n - titan-15\n - titan-17\n - titan-18\n - titan-19\n - titan-20\n - titan-21\n - titan-22\n - titan-24\n#s' "${cfg}"
      echo "[install] expanded peer ssh_managed_nodes for bootstrap fallback coverage"
      changed=1
    fi
    if ! grep -Eq '^ - services/keycloak$' "${cfg}" || ! grep -Eq '^ - infrastructure/cert-manager$' "${cfg}" || ! grep -Eq '^ - services/oauth2-proxy$' "${cfg}"; then
      perl -0pi -e 's#local_bootstrap_paths:\n(?: - [^\n]*\n)*#local_bootstrap_paths:\n - infrastructure/core\n - clusters/atlas/flux-system\n - infrastructure/sources/helm\n - infrastructure/metallb\n - infrastructure/traefik\n - infrastructure/cert-manager\n - infrastructure/vault-csi\n - infrastructure/vault-injector\n - services/vault\n - infrastructure/postgres\n - services/gitea\n - services/keycloak\n - services/oauth2-proxy\n#s' "${cfg}"
      echo "[install] refreshed peer local_bootstrap_paths for full fallback bootstrap parity"
      changed=1
    fi
  fi

  # Re-assert the config mode after any in-place edit.
  if [[ "${changed}" -eq 1 ]]; then
    chmod 0640 "${cfg}" || true
  fi
}
#######################################
# Repair known damage left in ananke.yaml by earlier migrations:
#   * root-level "- item" lines that follow an "ssh_managed_nodes:" header
#     are dropped until the next top-level key;
#   * forward_shutdown_config pointing at /etc/ananke/hecate.yaml is
#     repointed to /etc/ananke/ananke.yaml.
# The file mode is reset to 0640 when anything was changed.
# Globals:   CONF_DIR (read)
# Outputs:   one "[install] ..." line per repair on stdout
# Returns:   0 (also when the config file does not exist)
#######################################
sanitize_migrated_ananke_config() {
  local cfg="${CONF_DIR}/ananke.yaml"
  if [[ ! -f "${cfg}" ]]; then
    return 0
  fi

  local touched=0
  local filtered
  filtered="$(mktemp)"

  # Legacy migration bug guard: while inside the ssh_managed_nodes block,
  # discard orphan "- node" lines that sit at the root level.
  awk '
    /^ssh_managed_nodes:[[:space:]]*$/ { guarding = 1; print; next }
    guarding && /^- / { next }
    guarding && /^[A-Za-z0-9_]+:[[:space:]]*/ { guarding = 0 }
    { print }
  ' "${cfg}" > "${filtered}"

  if cmp -s "${cfg}" "${filtered}"; then
    rm -f "${filtered}"
  else
    mv "${filtered}" "${cfg}"
    touched=1
    echo "[install] sanitized malformed ssh_managed_nodes block in ${cfg}"
  fi

  if grep -Eq '^[[:space:]]*forward_shutdown_config:[[:space:]]*/etc/ananke/hecate.yaml[[:space:]]*$' "${cfg}"; then
    sed -Ei 's#(^[[:space:]]*forward_shutdown_config:[[:space:]]*)/etc/ananke/hecate.yaml#\1/etc/ananke/ananke.yaml#' "${cfg}"
    touched=1
    echo "[install] migrated coordination.forward_shutdown_config to /etc/ananke/ananke.yaml"
  fi

  if [[ "${touched}" -eq 1 ]]; then
    chmod 0640 "${cfg}" || true
  fi
}
#######################################
# Install the given Debian packages unless `dpkg -s` already reports them.
# Runs `apt-get update` followed by one `apt-get install` for the missing
# packages only; does nothing when all of them are present.
# Globals:   DEBIAN_FRONTEND (exported as "noninteractive" when installing)
# Arguments: package names
# Outputs:   the list of packages being installed, plus apt-get output
# Returns:   0 on success, apt-get's status on failure
#######################################
ensure_apt_packages() {
  # `pkg` must be local: a bare loop variable would leak into the caller's scope.
  local pkg
  local missing=()

  for pkg in "$@"; do
    if ! dpkg -s "${pkg}" >/dev/null 2>&1; then
      missing+=("${pkg}")
    fi
  done

  if [[ ${#missing[@]} -eq 0 ]]; then
    return 0
  fi

  echo "[install] apt install: ${missing[*]}"
  export DEBIAN_FRONTEND=noninteractive
  apt-get update -y
  apt-get install -y "${missing[@]}"
}
#######################################
# Make sure a `kubectl` command is available.
# Order of attempts: already on PATH, the `kubernetes-client` apt package,
# then the upstream release binary from dl.k8s.io for this architecture.
# Outputs:   progress on stdout, errors on stderr
# Returns:   0 when kubectl is available, 1 on unsupported architecture or
#            download failure
#######################################
install_kubectl_if_missing() {
  if command -v kubectl >/dev/null 2>&1; then
    return 0
  fi

  # The distro package is optional; fall through to the upstream binary if it
  # is unavailable.
  ensure_apt_packages kubernetes-client || true
  if command -v kubectl >/dev/null 2>&1; then
    return 0
  fi

  echo "[install] installing kubectl via upstream binary"
  local arch
  arch="$(uname -m)"
  case "${arch}" in
    x86_64) arch="amd64" ;;
    aarch64 | arm64) arch="arm64" ;;
    *)
      echo "Unsupported arch for kubectl install: ${arch}" >&2
      return 1
      ;;
  esac

  local version
  version="$(curl -fsSL https://dl.k8s.io/release/stable.txt)" || version=""
  if [[ -z "${version}" ]]; then
    echo "[install] unable to determine the stable kubectl release" >&2
    return 1
  fi

  # Download to a scratch file first. Writing straight to the final path would
  # leave a truncated binary behind on a failed transfer, and the next run
  # would then treat that broken file as an installed kubectl.
  local staged
  staged="$(mktemp)"
  if ! curl -fsSL -o "${staged}" "https://dl.k8s.io/release/${version}/bin/linux/${arch}/kubectl"; then
    rm -f "${staged}"
    echo "[install] failed to download kubectl ${version} for ${arch}" >&2
    return 1
  fi
  install -m 0755 "${staged}" /usr/local/bin/kubectl
  rm -f "${staged}"
}
#######################################
# Install the packages the installer and ananke rely on, then kubectl.
# Skipped entirely when INSTALL_DEPS is 0. Exits the script when apt-get is
# not available.
# Globals:   INSTALL_DEPS (read)
# Outputs:   progress on stdout, the unsupported-host error on stderr
#######################################
ensure_dependencies() {
  if [[ "${INSTALL_DEPS}" -eq 0 ]]; then
    echo "[install] skipping dependency installation"
    return 0
  fi

  command -v apt-get >/dev/null 2>&1 || {
    echo "This installer currently supports apt-based hosts only." >&2
    exit 1
  }

  local -a required=(
    ca-certificates
    curl
    git
    openssh-client
    jq
    nut-client
    nut-server
    nut-monitor
    golang-go
  )
  ensure_apt_packages "${required[@]}"
  install_kubectl_if_missing
}
#######################################
# Copy a legacy "hecate" file to a new location, renaming every "hecate" to
# "ananke" and every "Hecate" to "Ananke" on the way.
# This covers all legacy paths (/etc/hecate, /var/lib/hecate, /opt/hecate,
# /usr/local/bin/hecate, /usr/local/lib/hecate, hecate.yaml, hecate.lock)
# without touching any character around the name. Upper-case "HECATE" is left
# unchanged.
# Arguments: $1 - source file, $2 - destination file (overwritten)
# Returns:   sed's exit status
#######################################
legacy_path_rewrite() {
  local src="$1"
  local dst="$2"

  # Two literal substitutions are enough. The earlier per-path rules were all
  # special cases of these, and their unescaped "." (hecate.yaml, hecate.lock)
  # matched any character and replaced it with a dot.
  sed \
    -e 's/hecate/ananke/g' \
    -e 's/Hecate/Ananke/g' \
    "${src}" > "${dst}"
}
#######################################
# Carry configuration and state from a legacy hecate install over to the
# ananke locations. A file is copied only when the legacy file exists and the
# ananke counterpart does not. The legacy config is rewritten through
# legacy_path_rewrite; the other files are copied as they are. Legacy systemd
# units are only reported here.
# Globals:   CONF_DIR, STATE_DIR (read)
# Outputs:   one "[install] ..." line per migrated file / detected unit set
# Returns:   0 unless a copy fails
#######################################
migrate_legacy_hecate_install() {
  local old_conf="/etc/hecate"
  local old_state="/var/lib/hecate"
  local old_units="/etc/systemd/system"

  install -d -m 0750 "${CONF_DIR}"
  install -d -m 0750 "${STATE_DIR}"

  # Main config: needs the hecate -> ananke rewrite, not a plain copy.
  if [[ -f "${old_conf}/hecate.yaml" && ! -f "${CONF_DIR}/ananke.yaml" ]]; then
    echo "[install] migrating legacy config ${old_conf}/hecate.yaml -> ${CONF_DIR}/ananke.yaml"
    legacy_path_rewrite "${old_conf}/hecate.yaml" "${CONF_DIR}/ananke.yaml"
    chmod 0640 "${CONF_DIR}/ananke.yaml"
  fi

  # Plain copies, described by four parallel arrays (label, source, target, mode).
  local -a labels=("kubeconfig" "vault key" "run history" "intent state" "lock")
  local -a sources=(
    "${old_conf}/kubeconfig"
    "${old_state}/vault-unseal.key"
    "${old_state}/runs.json"
    "${old_state}/intent.json"
    "${old_state}/hecate.lock"
  )
  local -a targets=(
    "${CONF_DIR}/kubeconfig"
    "${STATE_DIR}/vault-unseal.key"
    "${STATE_DIR}/runs.json"
    "${STATE_DIR}/intent.json"
    "${STATE_DIR}/ananke.lock"
  )
  local -a modes=(0600 0600 0640 0640 0640)

  local i
  for i in "${!labels[@]}"; do
    if [[ ! -f "${targets[i]}" && -f "${sources[i]}" ]]; then
      echo "[install] migrating legacy ${labels[i]} ${sources[i]} -> ${targets[i]}"
      install -m "${modes[i]}" "${sources[i]}" "${targets[i]}"
    fi
  done

  if [[ -d "${old_units}" ]] && {
    compgen -G "${old_units}/hecate*.service" >/dev/null \
      || compgen -G "${old_units}/hecate*.timer" >/dev/null
  }; then
    echo "[install] detected legacy hecate systemd unit files; will retire after ananke install"
  fi
}
#######################################
# Disable the legacy hecate services, back up the legacy install under
# /var/backups/ananke-legacy-hecate-<timestamp>, and remove it.
# The legacy unit files are always removed. The legacy binary and directories
# are removed only when every backup step succeeded, so a failed backup never
# leads to loss of legacy state.
# Outputs:   progress on stdout, backup warnings on stderr
# Returns:   0
#######################################
retire_legacy_hecate_install() {
  local ts backup_dir
  ts="$(date +%Y%m%d%H%M%S)"
  backup_dir="/var/backups/ananke-legacy-hecate-${ts}"

  # Best effort: the units may already be gone.
  systemctl disable --now hecate.service hecate-bootstrap.service hecate-update.timer >/dev/null 2>&1 || true
  systemctl stop hecate-update.service >/dev/null 2>&1 || true

  local -a legacy_dirs=(/etc/hecate /var/lib/hecate /usr/local/lib/hecate /opt/hecate)
  local dir
  local have_legacy=0
  local backup_ok=1

  for dir in "${legacy_dirs[@]}"; do
    if [[ -d "${dir}" ]]; then
      have_legacy=1
    fi
  done

  if (( have_legacy )); then
    if install -d -m 0750 "${backup_dir}"; then
      for dir in "${legacy_dirs[@]}"; do
        if [[ -d "${dir}" ]] && ! cp -a "${dir}" "${backup_dir}/"; then
          echo "[install] warning: failed to back up ${dir} to ${backup_dir}" >&2
          backup_ok=0
        fi
      done
      if [[ -f /usr/local/bin/hecate ]] \
        && ! install -m 0755 /usr/local/bin/hecate "${backup_dir}/hecate.bin"; then
        echo "[install] warning: failed to back up /usr/local/bin/hecate to ${backup_dir}" >&2
        backup_ok=0
      fi
    else
      echo "[install] warning: cannot create backup directory ${backup_dir}" >&2
      backup_ok=0
    fi
    if (( backup_ok )); then
      echo "[install] backed up legacy hecate assets to ${backup_dir}"
    fi
  fi

  # The unit files are not part of the backup set and are retired regardless.
  rm -f \
    /etc/systemd/system/hecate.service \
    /etc/systemd/system/hecate-bootstrap.service \
    /etc/systemd/system/hecate-update.service \
    /etc/systemd/system/hecate-update.timer

  if (( ! backup_ok )); then
    # Never delete data that could not be backed up.
    echo "[install] warning: legacy hecate files left in place because the backup was incomplete" >&2
    return 0
  fi

  rm -f /usr/local/bin/hecate
  for dir in /usr/local/lib/hecate /opt/hecate /etc/hecate /var/lib/hecate; do
    rm -rf "${dir}"
  done
}
#######################################
# Print the Go package path to build, relative to the repository root.
# Globals:   REPO_DIR (read)
# Outputs:   "./cmd/ananke" on stdout when that directory exists
# Returns:   0 when the target exists, 1 otherwise
#######################################
resolve_build_target() {
  [[ -d "${REPO_DIR}/cmd/ananke" ]] || return 1
  echo "./cmd/ananke"
}
#######################################
# Install a config template (mode 0640) to the given destination.
# For each template key the current "ananke" template files are tried first,
# in order; if none exists, the legacy "hecate" template is rewritten through
# legacy_path_rewrite and installed instead. Template paths are relative to
# the current working directory.
# Arguments: $1 - template key: coordinator | peer | example
#            $2 - destination file
# Outputs:   errors on stderr
# Returns:   0 when a template was installed, 1 for an unknown key or when
#            no template source exists
#######################################
install_config_template() {
  local template="$1"
  local dest="$2"
  local candidate staged
  local -a modern_candidates=()
  local -a legacy_candidates=()

  case "${template}" in
    coordinator)
      modern_candidates=("configs/ananke.coordinator.yaml" "configs/ananke.titan-db.yaml")
      legacy_candidates=("configs/hecate.titan-db.yaml")
      ;;
    peer)
      modern_candidates=("configs/ananke.peer.yaml" "configs/ananke.tethys.yaml")
      legacy_candidates=("configs/hecate.tethys.yaml")
      ;;
    example)
      modern_candidates=("configs/ananke.example.yaml")
      legacy_candidates=("configs/hecate.example.yaml")
      ;;
    *)
      echo "[install] unknown config template key: ${template}" >&2
      return 1
      ;;
  esac

  # Preferred: a current template, installed verbatim.
  for candidate in "${modern_candidates[@]}"; do
    [[ -f "${candidate}" ]] || continue
    install -m 0640 "${candidate}" "${dest}"
    return 0
  done

  # Fallback: a legacy template, rewritten into a scratch file first.
  for candidate in "${legacy_candidates[@]}"; do
    [[ -f "${candidate}" ]] || continue
    staged="$(mktemp)"
    legacy_path_rewrite "${candidate}" "${staged}"
    install -m 0640 "${staged}" "${dest}"
    rm -f "${staged}"
    return 0
  done

  echo "[install] missing config template sources for '${template}'. modern=[${modern_candidates[*]}] legacy=[${legacy_candidates[*]}]" >&2
  return 1
}
#######################################
# Install the ananke systemd units into SYSTEMD_DIR (mode 0644).
# For each unit the current file under deploy/systemd/ is preferred; if it is
# absent, the legacy hecate unit is rewritten through legacy_path_rewrite and
# installed under the ananke name. Source paths are relative to the current
# working directory.
# Globals:   SYSTEMD_DIR (read)
# Outputs:   errors on stderr
# Returns:   0 when all units were installed, 1 as soon as a unit has neither
#            a current nor a legacy source
#######################################
install_systemd_units() {
  # Everything `read` assigns must be declared local, otherwise the field
  # variables end up in the global scope of the installer.
  local target_name modern_name legacy_name
  local modern_src legacy_src target tmp

  # Table format: <installed name>|<current source>|<legacy source>
  while IFS='|' read -r target_name modern_name legacy_name; do
    modern_src="deploy/systemd/${modern_name}"
    legacy_src="deploy/systemd/${legacy_name}"
    target="${SYSTEMD_DIR}/${target_name}"

    if [[ -f "${modern_src}" ]]; then
      install -m 0644 "${modern_src}" "${target}"
      continue
    fi

    if [[ -f "${legacy_src}" ]]; then
      tmp="$(mktemp)"
      legacy_path_rewrite "${legacy_src}" "${tmp}"
      install -m 0644 "${tmp}" "${target}"
      rm -f "${tmp}"
      continue
    fi

    echo "[install] missing both modern and legacy systemd unit sources for ${target_name}" >&2
    return 1
  done <<'EOF_UNITS'
ananke.service|ananke.service|hecate.service
ananke-bootstrap.service|ananke-bootstrap.service|hecate-bootstrap.service
ananke-update.service|ananke-update.service|hecate-update.service
ananke-update.timer|ananke-update.timer|hecate-update.timer
EOF_UNITS
}
#######################################
# Install the self-update helper as LIB_DIR/ananke-self-update.sh (mode 0755).
# The current script is installed verbatim. If only the legacy hecate script
# exists, it is passed through legacy_path_rewrite and a further sed pass
# (HECATE_ -> ANANKE_, script name, /opt path, git remote) before install.
# Source paths are relative to the current working directory.
# Globals:   LIB_DIR (read)
# Outputs:   errors on stderr
# Returns:   0 when a script was installed, 1 when neither source exists
#######################################
install_self_update_script() {
  local current="scripts/ananke-self-update.sh"
  local legacy="scripts/hecate-self-update.sh"
  local dest="${LIB_DIR}/ananke-self-update.sh"
  local staged

  if [[ -f "${current}" ]]; then
    install -m 0755 "${current}" "${dest}"
    return 0
  fi

  if [[ ! -f "${legacy}" ]]; then
    echo "[install] missing both modern and legacy self-update scripts." >&2
    return 1
  fi

  staged="$(mktemp)"
  legacy_path_rewrite "${legacy}" "${staged}"
  sed -Ei \
    -e 's/HECATE_/ANANKE_/g' \
    -e 's/hecate-self-update/ananke-self-update/g' \
    -e 's#/opt/hecate#/opt/ananke#g' \
    -e 's#bstein/hecate\.git#bstein/ananke.git#g' \
    "${staged}"
  install -m 0755 "${staged}" "${dest}"
  rm -f "${staged}"
  return 0
}
#######################################
# Write a standalone NUT configuration for one USB UPS, add a udev rule that
# gives the "nut" group access to the device, and (re)start the NUT services.
# Skipped when MANAGE_NUT is not "1".
# Files written: /etc/nut/nut.conf, ups.conf, upsd.users, upsmon.conf and
# /etc/udev/rules.d/99-ananke-ups.rules. upsd.users and upsmon.conf both
# contain the monitor password and are restricted to 0640 root:<nut group>
# before the password is written into them.
# Globals:   MANAGE_NUT, NUT_UPS_NAME, NUT_VENDOR_ID, NUT_PRODUCT_ID,
#            NUT_MONITOR_USER, NUT_MONITOR_PASSWORD (read)
# Outputs:   progress on stdout
# Returns:   0 unless writing a file fails
#######################################
configure_nut() {
  if [[ "${MANAGE_NUT}" != "1" ]]; then
    echo "[install] skipping NUT configuration (ANANKE_MANAGE_NUT=${MANAGE_NUT})"
    return 0
  fi
  echo "[install] configuring NUT + udev for UPS ${NUT_UPS_NAME} (${NUT_VENDOR_ID}:${NUT_PRODUCT_ID})"
  install -d -m 0755 /etc/nut /etc/udev/rules.d

  # Group that may read the credential files; root when no nut group exists.
  local nut_group="root"
  if getent group nut >/dev/null 2>&1; then
    nut_group="nut"
  fi

  # Lock down the credential-bearing files first. Both are fully rewritten
  # below, so truncating them here loses nothing, and the password is never
  # on disk in a world-readable file.
  local secret_file
  for secret_file in /etc/nut/upsd.users /etc/nut/upsmon.conf; do
    (
      umask 077
      : > "${secret_file}"
    )
    chown "root:${nut_group}" "${secret_file}"
    chmod 0640 "${secret_file}"
  done

  cat > /etc/nut/nut.conf <<EOF
MODE=standalone
EOF
  cat > /etc/nut/ups.conf <<EOF
[${NUT_UPS_NAME}]
driver = usbhid-ups
port = auto
vendorid = ${NUT_VENDOR_ID}
productid = ${NUT_PRODUCT_ID}
pollinterval = 5
EOF
  cat > /etc/nut/upsd.users <<EOF
[${NUT_MONITOR_USER}]
password = ${NUT_MONITOR_PASSWORD}
upsmon primary
EOF
  cat > /etc/nut/upsmon.conf <<EOF
RUN_AS_USER nut
MONITOR ${NUT_UPS_NAME}@localhost 1 ${NUT_MONITOR_USER} ${NUT_MONITOR_PASSWORD} primary
MINSUPPLIES 1
SHUTDOWNCMD "/sbin/shutdown -h +0"
POLLFREQ 5
POLLFREQALERT 5
HOSTSYNC 15
DEADTIME 15
POWERDOWNFLAG /etc/killpower
EOF
  cat > /etc/udev/rules.d/99-ananke-ups.rules <<EOF
# Managed by ananke install.sh: ensure UPS USB HID devices are readable by NUT
ACTION=="add|change", SUBSYSTEM=="usb", ATTR{idVendor}=="${NUT_VENDOR_ID}", ATTR{idProduct}=="${NUT_PRODUCT_ID}", MODE:="0660", GROUP:="nut"
EOF

  # Everything below is best effort: the UPS may not be plugged in and the
  # unit names can differ between NUT packages.
  udevadm control --reload-rules || true
  udevadm trigger --subsystem-match=usb --attr-match=idVendor="${NUT_VENDOR_ID}" --attr-match=idProduct="${NUT_PRODUCT_ID}" || true
  systemctl enable nut-driver-enumerator.service nut-server.service nut-monitor.service >/dev/null 2>&1 || true
  systemctl restart nut-driver-enumerator.service >/dev/null 2>&1 || true
  systemctl restart "nut-driver@${NUT_UPS_NAME}.service" >/dev/null 2>&1 || true
  systemctl restart nut-server.service nut-monitor.service >/dev/null 2>&1 || true
}
# ---------------------------------------------------------------------------
# Install flow. Order matters: dependencies and legacy migration first, then
# build and install, then config migration, services, and finally removal of
# the legacy install.
# ---------------------------------------------------------------------------
ensure_dependencies
migrate_legacy_hecate_install

# Quality gate (on unless ANANKE_ENFORCE_QUALITY_GATE is set to something else).
if [[ "${ENFORCE_QUALITY_GATE}" != "1" ]]; then
  echo "[install] skipping quality gate (ANANKE_ENFORCE_QUALITY_GATE=${ENFORCE_QUALITY_GATE})"
else
  echo "[install] running quality gate"
  "${REPO_DIR}/scripts/quality_gate.sh"
fi

# Build from the repository root; later steps use paths relative to it.
echo "[install] building ananke"
cd "${REPO_DIR}"
mkdir -p dist
BUILD_TARGET="$(resolve_build_target || true)"
if [[ -z "${BUILD_TARGET}" ]]; then
  echo "[install] unable to find build target (expected cmd/ananke)." >&2
  exit 1
fi
go build -o dist/ananke "${BUILD_TARGET}"

echo "[install] installing binary"
install -d -m 0755 "${BIN_DIR}"
install -m 0755 dist/ananke "${BIN_DIR}/ananke"

echo "[install] installing config + state dirs"
install -d -m 0750 "${CONF_DIR}"
install -d -m 0750 "${STATE_DIR}"
install -d -m 0750 "${STATE_DIR}/reports"
install -d -m 0755 "${LIB_DIR}"

# Config: a forced template always overwrites; otherwise an existing config
# is kept and only a missing one is seeded from the example template.
case "${FORCE_CONFIG_TEMPLATE}" in
  "")
    if [[ -f "${CONF_DIR}/ananke.yaml" ]]; then
      echo "[install] keeping existing config at ${CONF_DIR}/ananke.yaml"
    else
      install_config_template example "${CONF_DIR}/ananke.yaml"
      echo "[install] wrote default config to ${CONF_DIR}/ananke.yaml"
    fi
    ;;
  coordinator | peer | example)
    install_config_template "${FORCE_CONFIG_TEMPLATE}" "${CONF_DIR}/ananke.yaml"
    echo "[install] forced config template: ${FORCE_CONFIG_TEMPLATE}"
    ;;
  *)
    echo "[install] unknown ANANKE_FORCE_CONFIG_TEMPLATE value: ${FORCE_CONFIG_TEMPLATE}" >&2
    exit 1
    ;;
esac

migrate_ananke_config
sanitize_migrated_ananke_config
ensure_ananke_ssh_identity
ensure_ananke_kubeconfig

echo "[install] installing systemd units"
install_systemd_units
install_self_update_script
resolve_nut_ups_name
configure_nut

systemctl daemon-reload
systemctl enable ananke.service ananke-update.timer

# Bootstrap unit: "1" enables, "0" disables, anything else ("auto") enables
# and reports the detected role.
case "${ENABLE_BOOTSTRAP}" in
  1)
    systemctl enable ananke-bootstrap.service
    ;;
  0)
    systemctl disable ananke-bootstrap.service >/dev/null 2>&1 || true
    ;;
  *)
    role="$(read_ananke_role)"
    systemctl enable ananke-bootstrap.service
    echo "[install] auto-enabled ananke-bootstrap.service for role=${role}"
    ;;
esac

if [[ "${START_NOW}" -eq 1 ]]; then
  systemctl restart ananke.service
  systemctl restart ananke-update.timer
  echo "[install] ananke.service restarted"
fi

# The legacy install is retired only after ananke is installed and started.
retire_legacy_hecate_install
systemctl daemon-reload

printf '%s\n' \
  "[install] done" \
  "Next steps:" \
  " 1. Edit /etc/ananke/ananke.yaml" \
  " 2. Run: ananke status --config /etc/ananke/ananke.yaml" \
  " 3. Test dry run: ananke startup --config /etc/ananke/ananke.yaml" \
  " 4. Trigger bootstrap now (db host): systemctl start ananke-bootstrap.service" \
  " 5. Trigger self-update now: systemctl start ananke-update.service"