ananke/scripts/install-config-migration.sh

335 lines
16 KiB
Bash
Raw Permalink Normal View History

# Config migration helpers for the Ananke host installer.
#######################################
# Print the role configured in ${CONF_DIR}/ananke.yaml.
# Globals:   CONF_DIR (read)
# Arguments: none
# Outputs:   the second whitespace-separated field of the first line that
#            matches "role:", with a trailing carriage return and one pair
#            of surrounding quotes removed; "coordinator" when the file is
#            missing or no usable value is found.
# Returns:   0
#######################################
read_ananke_role() {
  local cfg="${CONF_DIR}/ananke.yaml"
  local role=""
  if [[ -f "${cfg}" ]]; then
    # A failing awk (e.g. unreadable file) is treated like "no role set".
    role="$(awk '/^[[:space:]]*role:[[:space:]]*/ {print $2; exit}' "${cfg}" 2>/dev/null || true)"
    # Files edited on Windows leave a CR glued to the value.
    role="${role%$'\r'}"
    case "${role}" in
      # "role: # comment" carries no value at all.
      \#*) role="" ;;
      # role: "peer" and role: 'peer' are the same scalar as role: peer.
      \"*\" | \'*\') role="${role:1:${#role}-2}" ;;
    esac
  fi
  printf '%s\n' "${role:-coordinator}"
}
#######################################
# Print the value of the first "key: value" line in ${CONF_DIR}/ananke.yaml
# whose key field equals the argument exactly (leading whitespace included).
# Globals:   CONF_DIR (read)
# Arguments: $1 - key text to compare against the first field
# Outputs:   the second field (fields are split on ": *"), or nothing
# Returns:   0, even when the file cannot be read
#######################################
migration_yaml_lookup() {
  local wanted="$1"
  local cfg="${CONF_DIR}/ananke.yaml"
  awk -v wanted="${wanted}" '
    BEGIN { FS = ": *" }
    $1 == wanted { print $2; exit }
  ' "${cfg}" 2>/dev/null || true
}
#######################################
# Print the first list entry found under the top-level "control_planes:" key
# of ${CONF_DIR}/ananke.yaml, with the leading "- " marker removed.
# Globals:   CONF_DIR (read)
# Arguments: none
# Outputs:   the entry text, or nothing when the list is absent or empty
# Returns:   0, even when the file cannot be read
#######################################
first_control_plane_name() {
  local cfg="${CONF_DIR}/ananke.yaml"
  awk '
    /^control_planes:[[:space:]]*$/ { inside = 1; next }
    inside && /^[[:space:]]*-[[:space:]]*/ {
      # The pattern is anchored, so a single substitution strips the marker.
      sub(/^[[:space:]]*-[[:space:]]*/, "")
      print
      exit
    }
    # Any line starting in column 0 ends the control_planes section.
    inside && /^[^[:space:]]/ { inside = 0 }
  ' "${cfg}" 2>/dev/null || true
}
#######################################
# Print the value mapped to a node name in ${CONF_DIR}/ananke.yaml. The match
# is made against lines whose key field is the node name preceded by the
# indentation prefix hard-coded below.
# Globals:   CONF_DIR (read)
# Arguments: $1 - node name
# Outputs:   the second field (fields are split on ": *"), or nothing
# Returns:   0, even when the file cannot be read
#######################################
lookup_node_host() {
  local wanted="$1"
  local cfg="${CONF_DIR}/ananke.yaml"
  awk -v wanted="${wanted}" '
    BEGIN { FS = ": *"; key = " " wanted }
    $1 == key { print $2; exit }
  ' "${cfg}" 2>/dev/null || true
}
#######################################
# Apply in-place migrations to ${CONF_DIR}/ananke.yaml.
#
# Every step has the same shape: grep for an outdated value or a missing key,
# and when the condition holds, edit the file with sed, print an "[install]"
# line and set changed=1. Steps run in the order written; several later steps
# anchor on keys that earlier steps insert.
#
# Globals:   CONF_DIR (read)
# Calls:     read_ananke_role, first_control_plane_name,
#            install_cluster_inventory_defaults
# Outputs:   one "[install] ..." line on stdout per migration applied
# Returns:   0 immediately when the config file does not exist
#
# NOTE(review): the patterns anchor on an exact leading-whitespace prefix
# (e.g. '^ titan-24:'); confirm that this matches the indentation actually
# used in deployed ananke.yaml files.
#######################################
migrate_ananke_config() {
if [[ ! -f "${CONF_DIR}/ananke.yaml" ]]; then
return 0
fi
local changed=0
local role_hint
role_hint="$(read_ananke_role)"
# Raise default_budget_seconds from 300 to 1380.
if grep -Eq 'default_budget_seconds:[[:space:]]*300' "${CONF_DIR}/ananke.yaml"; then
sed -Ei 's/(default_budget_seconds:[[:space:]]*)300/\11380/' "${CONF_DIR}/ananke.yaml"
echo "[install] migrated default_budget_seconds 300 -> 1380 in ${CONF_DIR}/ananke.yaml"
changed=1
fi
# Raise runtime_safety_factor from 1.10 to 1.25.
if grep -Eq 'runtime_safety_factor:[[:space:]]*1\.10' "${CONF_DIR}/ananke.yaml"; then
sed -Ei 's/(runtime_safety_factor:[[:space:]]*)1\.10/\11.25/' "${CONF_DIR}/ananke.yaml"
echo "[install] migrated runtime_safety_factor 1.10 -> 1.25 in ${CONF_DIR}/ananke.yaml"
changed=1
fi
# When an ssh_node_users section exists, rewrite the titan-24 entry from
# "tethys" to "atlas".
if grep -Eq '^ssh_node_users:[[:space:]]*$' "${CONF_DIR}/ananke.yaml" \
&& grep -Eq '^ titan-24:[[:space:]]*tethys[[:space:]]*$' "${CONF_DIR}/ananke.yaml"; then
sed -Ei 's/^ titan-24:[[:space:]]*tethys[[:space:]]*$/ titan-24: atlas/' "${CONF_DIR}/ananke.yaml"
echo "[install] migrated ssh_node_users titan-24 override to atlas"
changed=1
fi
# Append startup_guard_max_age_seconds after command_timeout_seconds when the
# former is missing.
if grep -Eq '^ command_timeout_seconds:[[:space:]]*[0-9]+' "${CONF_DIR}/ananke.yaml" \
&& ! grep -Eq '^ startup_guard_max_age_seconds:[[:space:]]*[0-9]+' "${CONF_DIR}/ananke.yaml"; then
sed -Ei '/^ command_timeout_seconds:[[:space:]]*[0-9]+/a\ startup_guard_max_age_seconds: 900' "${CONF_DIR}/ananke.yaml"
echo "[install] added coordination.startup_guard_max_age_seconds=900"
changed=1
fi
# The presence of poweroff_enabled triggers deletion of all four poweroff
# keys; extra_poweroff_hosts is only deleted when it is empty or "[]".
if grep -Eq '^[[:space:]]*poweroff_enabled:[[:space:]]*(true|false)' "${CONF_DIR}/ananke.yaml"; then
sed -Ei \
-e '/^[[:space:]]*poweroff_enabled:[[:space:]]*(true|false)/d' \
-e '/^[[:space:]]*poweroff_delay_seconds:[[:space:]]*[0-9]+/d' \
-e '/^[[:space:]]*poweroff_local_host:[[:space:]]*(true|false)/d' \
-e '/^[[:space:]]*extra_poweroff_hosts:[[:space:]]*(\[\])?[[:space:]]*$/d' \
"${CONF_DIR}/ananke.yaml"
echo "[install] removed deprecated host-poweroff shutdown config keys"
changed=1
fi
# Append the three node-inventory reachability keys after
# minimum_battery_percent when require_node_inventory_reachability is missing.
if grep -Eq '^ minimum_battery_percent:[[:space:]]*[0-9.]+' "${CONF_DIR}/ananke.yaml" \
&& ! grep -Eq '^ require_node_inventory_reachability:[[:space:]]*(true|false)' "${CONF_DIR}/ananke.yaml"; then
sed -Ei '/^ minimum_battery_percent:[[:space:]]*[0-9.]+/a\ require_node_inventory_reachability: true\n node_inventory_reachability_wait_seconds: 300\n node_inventory_reachability_poll_seconds: 5' "${CONF_DIR}/ananke.yaml"
echo "[install] added startup node inventory reachability gate defaults"
changed=1
fi
# Append reports_dir after the "dir: /var/lib/ananke" line when a state
# section exists and reports_dir is not already set to that default.
if grep -Eq '^state:[[:space:]]*$' "${CONF_DIR}/ananke.yaml" \
&& ! grep -Eq '^ reports_dir:[[:space:]]*/var/lib/ananke/reports' "${CONF_DIR}/ananke.yaml"; then
sed -Ei '/^ dir:[[:space:]]*\/var\/lib\/ananke$/a\ reports_dir: /var/lib/ananke/reports' "${CONF_DIR}/ananke.yaml"
echo "[install] added state.reports_dir default"
changed=1
fi
# Add peer_hosts after forward_shutdown_config when it is missing. The value
# depends on the role: a peer copies forward_shutdown_host, a coordinator
# with an IPv4-looking titan-24 entry gets titan-24, anything else gets [].
if ! grep -Eq '^ peer_hosts:' "${CONF_DIR}/ananke.yaml"; then
if [[ "${role_hint}" == "peer" ]] && grep -Eq '^ forward_shutdown_host:[[:space:]]*[A-Za-z0-9._-]+' "${CONF_DIR}/ananke.yaml"; then
local peer_host
peer_host="$(awk -F': *' '/^ forward_shutdown_host:[[:space:]]*/ {print $2; exit}' "${CONF_DIR}/ananke.yaml" 2>/dev/null || true)"
if [[ -n "${peer_host}" ]]; then
sed -Ei '/^ forward_shutdown_config:[[:space:]]*.*$/a\ peer_hosts:\n - '"${peer_host}"'' "${CONF_DIR}/ananke.yaml"
echo "[install] added coordination.peer_hosts from forward_shutdown_host (${peer_host})"
changed=1
fi
elif [[ "${role_hint}" == "coordinator" ]] && grep -Eq '^ titan-24:[[:space:]]*[0-9]+\.[0-9]+\.[0-9]+\.[0-9]+' "${CONF_DIR}/ananke.yaml"; then
sed -Ei '/^ forward_shutdown_config:[[:space:]]*.*$/a\ peer_hosts:\n - titan-24' "${CONF_DIR}/ananke.yaml"
echo "[install] added coordination.peer_hosts default (titan-24) for coordinator role"
changed=1
else
sed -Ei '/^ forward_shutdown_config:[[:space:]]*.*$/a\ peer_hosts: []' "${CONF_DIR}/ananke.yaml"
echo "[install] added coordination.peer_hosts empty default"
changed=1
fi
fi
# Control plane written into etcd_restore_control_plane below: the first
# control_planes entry, or titan-0a when none is found.
local default_restore_cp
default_restore_cp="$(first_control_plane_name)"
if [[ -z "${default_restore_cp}" ]]; then
default_restore_cp="titan-0a"
fi
# When auto_etcd_restore_on_api_failure is missing, append the time-sync,
# access-reconciliation and etcd-restore keys after api_poll_seconds.
if grep -Eq '^ api_poll_seconds:[[:space:]]*[0-9]+' "${CONF_DIR}/ananke.yaml" \
&& ! grep -Eq '^ auto_etcd_restore_on_api_failure:[[:space:]]*(true|false)' "${CONF_DIR}/ananke.yaml"; then
sed -Ei '/^ api_poll_seconds:[[:space:]]*[0-9]+/a\ require_time_sync: true\n time_sync_wait_seconds: 240\n time_sync_poll_seconds: 5\n reconcile_access_on_boot: true\n auto_etcd_restore_on_api_failure: true\n etcd_restore_control_plane: '"${default_restore_cp}"'' "${CONF_DIR}/ananke.yaml"
echo "[install] added startup.auto_etcd_restore_on_api_failure + startup.etcd_restore_control_plane defaults"
changed=1
fi
# Covers files that still lack require_time_sync after the step above (i.e.
# the etcd-restore key was already present so that step did not run).
if grep -Eq '^ api_poll_seconds:[[:space:]]*[0-9]+' "${CONF_DIR}/ananke.yaml" \
&& ! grep -Eq '^ require_time_sync:[[:space:]]*(true|false)' "${CONF_DIR}/ananke.yaml"; then
sed -Ei '/^ api_poll_seconds:[[:space:]]*[0-9]+/a\ require_time_sync: true\n time_sync_wait_seconds: 240\n time_sync_poll_seconds: 5\n reconcile_access_on_boot: true' "${CONF_DIR}/ananke.yaml"
echo "[install] added startup time sync + access reconciliation defaults"
changed=1
fi
# Append time_sync_mode/time_sync_quorum after time_sync_poll_seconds when no
# strict|quorum mode is configured.
if grep -Eq '^ time_sync_poll_seconds:[[:space:]]*[0-9]+' "${CONF_DIR}/ananke.yaml" \
&& ! grep -Eq '^ time_sync_mode:[[:space:]]*(strict|quorum)' "${CONF_DIR}/ananke.yaml"; then
sed -Ei '/^ time_sync_poll_seconds:[[:space:]]*[0-9]+/a\ time_sync_mode: quorum\n time_sync_quorum: 2' "${CONF_DIR}/ananke.yaml"
echo "[install] added startup time sync quorum defaults"
changed=1
fi
# Append the storage readiness keys and the storage_critical_pvcs list after
# etcd_restore_control_plane when require_storage_ready is missing.
if grep -Eq '^ etcd_restore_control_plane:[[:space:]]*[A-Za-z0-9._-]+' "${CONF_DIR}/ananke.yaml" \
&& ! grep -Eq '^ require_storage_ready:[[:space:]]*(true|false)' "${CONF_DIR}/ananke.yaml"; then
sed -Ei '/^ etcd_restore_control_plane:[[:space:]]*[A-Za-z0-9._-]+/a\ require_storage_ready: true\n storage_ready_wait_seconds: 420\n storage_ready_poll_seconds: 5\n storage_min_ready_nodes: 2\n storage_critical_pvcs:\n - vault/data-vault-0\n - postgres/postgres-data-postgres-0\n - gitea/gitea-data\n - sso/keycloak-data' "${CONF_DIR}/ananke.yaml"
echo "[install] added startup storage readiness defaults"
changed=1
fi
# Append the post-start probe, service checklist and vault key settings. The
# insertion point is the "- sso/keycloak-data" list entry, not the
# storage_critical_pvcs key that the guard tests for.
if grep -Eq '^ storage_critical_pvcs:[[:space:]]*$' "${CONF_DIR}/ananke.yaml" \
&& ! grep -Eq '^ require_post_start_probes:[[:space:]]*(true|false)' "${CONF_DIR}/ananke.yaml"; then
sed -Ei '/^ - sso\/keycloak-data$/a\ require_post_start_probes: true\n post_start_probe_wait_seconds: 240\n post_start_probe_poll_seconds: 5\n post_start_probes:\n - https://scm.bstein.dev/api/healthz\n - https://metrics.bstein.dev/api/health\n require_service_checklist: true\n service_checklist_wait_seconds: 420\n service_checklist_poll_seconds: 5\n service_checklist_stability_seconds: 120\n service_checklist:\n - name: gitea-api\n url: https://scm.bstein.dev/api/healthz\n accepted_statuses: [200]\n body_contains: pass\n timeout_seconds: 12\n - name: grafana-api\n url: https://metrics.bstein.dev/api/health\n accepted_statuses: [200]\n body_contains: '\''\"database\":\"ok\"'\''\n timeout_seconds: 12\n vault_unseal_key_file: /var/lib/ananke/vault-unseal.key' "${CONF_DIR}/ananke.yaml"
echo "[install] added startup post-start probe + vault key fallback defaults"
changed=1
fi
# Delete the sso OIDC discovery URL list entry if present.
if grep -Eq '^ - https://sso.bstein.dev/realms/atlas/.well-known/openid-configuration$' "${CONF_DIR}/ananke.yaml"; then
sed -Ei '/^ - https:\/\/sso\.bstein\.dev\/realms\/atlas\/\.well-known\/openid-configuration$/d' "${CONF_DIR}/ananke.yaml"
echo "[install] removed sso OIDC probe from startup.post_start_probes (returns 404 in current deployment)"
changed=1
fi
# Add vault_unseal_key_file after the metrics health probe entry when the key
# is missing and both startup and post_start_probes sections exist.
if ! grep -Eq '^ vault_unseal_key_file:[[:space:]]*/var/lib/ananke/vault-unseal.key' "${CONF_DIR}/ananke.yaml"; then
if grep -Eq '^startup:[[:space:]]*$' "${CONF_DIR}/ananke.yaml" && grep -Eq '^ post_start_probes:[[:space:]]*$' "${CONF_DIR}/ananke.yaml"; then
sed -Ei '/^ - https:\/\/metrics\.bstein\.dev\/api\/health$/a\ vault_unseal_key_file: /var/lib/ananke/vault-unseal.key' "${CONF_DIR}/ananke.yaml"
echo "[install] added startup.vault_unseal_key_file default"
changed=1
fi
fi
# Add the break-glass command/timeout keys after vault_unseal_key_file when
# the timeout key is missing.
if ! grep -Eq '^ vault_unseal_breakglass_timeout_seconds:[[:space:]]*[0-9]+' "${CONF_DIR}/ananke.yaml"; then
if grep -Eq '^ vault_unseal_key_file:[[:space:]]*/var/lib/ananke/vault-unseal.key' "${CONF_DIR}/ananke.yaml"; then
sed -Ei '/^ vault_unseal_key_file:[[:space:]]*\/var\/lib\/ananke\/vault-unseal.key$/a\ vault_unseal_breakglass_command: ""\n vault_unseal_breakglass_timeout_seconds: 15' "${CONF_DIR}/ananke.yaml"
echo "[install] added startup break-glass fallback defaults"
changed=1
fi
fi
# The helper exits 0 only when it modified the file, so "&&" both records the
# change and keeps a "nothing to do" status from being treated as a failure.
install_cluster_inventory_defaults "${role_hint}" && changed=1
# Reset the file mode after any edit; a chmod failure is ignored.
if [[ "${changed}" -eq 1 ]]; then
chmod 0640 "${CONF_DIR}/ananke.yaml" || true
fi
}
#######################################
# Replace empty inventory placeholders in ${CONF_DIR}/ananke.yaml with the
# built-in node lists.
#   - "workers: []" is expanded for every role.
#   - "ssh_node_hosts: {}" and "ssh_managed_nodes: []" are expanded only for
#     the coordinator and peer roles; for those roles the ssh_managed_nodes
#     list is also rewritten when the file has no "- titan-04" or no
#     "- titan-21" entry.
#   - The peer role additionally runs install_peer_inventory_defaults.
# Globals:   CONF_DIR (read)
# Arguments: $1 - role name
# Outputs:   one "[install] ..." line on stdout per change
# Returns:   0 when at least one change was recorded, 1 otherwise
#######################################
install_cluster_inventory_defaults() {
  local role="$1"
  local cfg="${CONF_DIR}/ananke.yaml"
  local touched=0
  local hosts_yaml=""
  local managed_yaml=""
  local workers_yaml
  # Worker node names; the managed-node list is this list plus four extra
  # hosts in front.
  local -a worker_nodes=(
    titan-04 titan-05 titan-06 titan-07 titan-08 titan-09
    titan-10 titan-11 titan-12 titan-13 titan-14 titan-15
    titan-17 titan-18 titan-19
    titan-20 titan-21 titan-22 titan-24
  )

  # Each entry is emitted as "<newline> - <name>", so the block has no
  # trailing newline; the perl replacements below add it.
  workers_yaml="workers:$(printf '\n - %s' "${worker_nodes[@]}")"

  case "${role}" in
    coordinator | peer)
      hosts_yaml='ssh_node_hosts:
 titan-db: 192.168.22.10
 titan-0a: 192.168.22.11
 titan-0b: 192.168.22.12
 titan-0c: 192.168.22.13
 titan-04: 192.168.22.30
 titan-05: 192.168.22.31
 titan-06: 192.168.22.32
 titan-07: 192.168.22.33
 titan-08: 192.168.22.34
 titan-09: 192.168.22.35
 titan-10: 192.168.22.36
 titan-11: 192.168.22.37
 titan-12: 192.168.22.40
 titan-13: 192.168.22.41
 titan-14: 192.168.22.42
 titan-15: 192.168.22.43
 titan-17: 192.168.22.45
 titan-18: 192.168.22.46
 titan-19: 192.168.22.47
 titan-20: 192.168.22.20
 titan-21: 192.168.22.21
 titan-22: 192.168.22.22
 titan-24: 192.168.22.26'
      managed_yaml="ssh_managed_nodes:$(printf '\n - %s' titan-db titan-0a titan-0b titan-0c "${worker_nodes[@]}")"
      ;;
  esac

  # perl -0 reads the whole file as one record so the patterns can span lines.
  if [[ -n "${hosts_yaml}" ]] && grep -Eq '^ssh_node_hosts:[[:space:]]*\{\}[[:space:]]*$' "${cfg}"; then
    perl -0pi -e 's#ssh_node_hosts:\s*\{\}\n#'"${hosts_yaml}"'\n#s' "${cfg}"
    echo "[install] hydrated ssh_node_hosts inventory for role=${role}"
    touched=1
  fi

  if grep -Eq '^workers:[[:space:]]*\[\][[:space:]]*$' "${cfg}"; then
    perl -0pi -e 's#workers:\s*\[\]\n#'"${workers_yaml}"'\n#s' "${cfg}"
    echo "[install] hydrated workers inventory for startup/shutdown orchestration"
    touched=1
  fi

  if [[ -n "${managed_yaml}" ]]; then
    if grep -Eq '^ssh_managed_nodes:[[:space:]]*\[\][[:space:]]*$' "${cfg}"; then
      perl -0pi -e 's#ssh_managed_nodes:\s*\[\]\n#'"${managed_yaml}"'\n#s' "${cfg}"
      echo "[install] hydrated ssh_managed_nodes inventory for role=${role}"
      touched=1
    fi
    # These two greps look at the whole file, not only the ssh_managed_nodes
    # section.
    if ! grep -Eq '^ - titan-04$' "${cfg}" || ! grep -Eq '^ - titan-21$' "${cfg}"; then
      perl -0pi -e 's#ssh_managed_nodes:\n(?: - [^\n]*\n)*#'"${managed_yaml}"'\n#s' "${cfg}"
      echo "[install] refreshed ssh_managed_nodes coverage for role=${role}"
      touched=1
    fi
  fi

  if [[ "${role}" == "peer" ]]; then
    install_peer_inventory_defaults && touched=1
  fi

  # Exit status doubles as the "did anything change" signal for the caller.
  ((touched))
}
#######################################
# Peer-only inventory fix-ups for ${CONF_DIR}/ananke.yaml:
#   1. A two-entry ssh_managed_nodes list (titan-db, titan-24) is expanded to
#      the full node list when the file has no "- titan-0a" entry.
#   2. local_bootstrap_paths is rewritten to the full built-in list when any
#      of three required entries is absent from the file.
# Globals:   CONF_DIR (read)
# Arguments: none
# Outputs:   one "[install] ..." line on stdout per change
# Returns:   0 when at least one change was recorded, 1 otherwise
#######################################
install_peer_inventory_defaults() {
  local cfg="${CONF_DIR}/ananke.yaml"
  local touched=0
  local bootstrap_path
  local bootstrap_stale=0

  if grep -Eq '^ssh_managed_nodes:[[:space:]]*$' "${cfg}" \
    && grep -Eq '^ - titan-db$' "${cfg}" \
    && grep -Eq '^ - titan-24$' "${cfg}"; then
    if ! grep -Eq '^ - titan-0a$' "${cfg}"; then
      # perl -0 reads the whole file as one record so the pattern can span
      # the three consecutive lines.
      perl -0pi -e 's#ssh_managed_nodes:\n - titan-db\n - titan-24\n#ssh_managed_nodes:\n - titan-db\n - titan-0a\n - titan-0b\n - titan-0c\n - titan-04\n - titan-05\n - titan-06\n - titan-07\n - titan-08\n - titan-09\n - titan-10\n - titan-11\n - titan-12\n - titan-13\n - titan-14\n - titan-15\n - titan-17\n - titan-18\n - titan-19\n - titan-20\n - titan-21\n - titan-22\n - titan-24\n#s' "${cfg}"
      echo "[install] expanded peer ssh_managed_nodes for bootstrap fallback coverage"
      touched=1
    fi
  fi

  # Stop at the first missing entry; one gap is enough to rewrite the list.
  for bootstrap_path in 'services/keycloak' 'infrastructure/cert-manager' 'services/oauth2-proxy'; do
    if ! grep -Eq "^ - ${bootstrap_path}\$" "${cfg}"; then
      bootstrap_stale=1
      break
    fi
  done
  if ((bootstrap_stale)); then
    perl -0pi -e 's#local_bootstrap_paths:\n(?: - [^\n]*\n)*#local_bootstrap_paths:\n - infrastructure/core\n - clusters/atlas/flux-system\n - infrastructure/sources/helm\n - infrastructure/metallb\n - infrastructure/traefik\n - infrastructure/cert-manager\n - infrastructure/vault-csi\n - infrastructure/vault-injector\n - services/vault\n - infrastructure/postgres\n - services/gitea\n - services/keycloak\n - services/oauth2-proxy\n#s' "${cfg}"
    echo "[install] refreshed peer local_bootstrap_paths for full fallback bootstrap parity"
    touched=1
  fi

  # Exit status doubles as the "did anything change" signal for the caller.
  ((touched))
}
#######################################
# Repair known damage in ${CONF_DIR}/ananke.yaml:
#   1. Drop column-0 "- ..." lines that follow an ssh_managed_nodes list, up
#      to the next top-level key.
#   2. Point forward_shutdown_config at /etc/ananke/ananke.yaml when it still
#      names /etc/ananke/hecate.yaml.
# Globals:   CONF_DIR (read)
# Arguments: none
# Outputs:   one "[install] ..." line on stdout per change; diagnostics on
#            stderr when the sanitizing pass cannot be completed
# Returns:   0 when the file is absent or was processed; 1 when the temp
#            file could not be created, filtered, or copied back (the config
#            is left untouched in the first two cases)
#######################################
sanitize_migrated_ananke_config() {
  local cfg="${CONF_DIR}/ananke.yaml"
  [[ -f "${cfg}" ]] || return 0
  local tmp changed=0
  if ! tmp="$(mktemp)"; then
    echo "[install] could not create a temp file to sanitize ${cfg}" >&2
    return 1
  fi
  # If a legacy migration bug appended root-level node entries after
  # ssh_managed_nodes, drop those orphan entries until the next top-level key.
  # A failed filter may leave partial output, so it must never reach the
  # config file.
  if ! awk '
    BEGIN {in_managed=0}
    /^ssh_managed_nodes:[[:space:]]*$/ {in_managed=1; print; next}
    {
      if (in_managed) {
        if ($0 ~ /^ - /) {print; next}
        if ($0 ~ /^- /) {next}
        if ($0 ~ /^[A-Za-z0-9_]+:[[:space:]]*/) {in_managed=0}
      }
      print
    }
  ' "${cfg}" > "${tmp}"; then
    rm -f -- "${tmp}"
    echo "[install] could not filter ${cfg}; leaving it unchanged" >&2
    return 1
  fi
  if ! cmp -s "${cfg}" "${tmp}"; then
    # Write through the existing file instead of renaming the temp file over
    # it: a rename would swap in a new inode carrying the temp file's owner
    # and group and would detach any hard links to the config.
    if ! cat "${tmp}" > "${cfg}"; then
      rm -f -- "${tmp}"
      echo "[install] could not write sanitized content to ${cfg}" >&2
      return 1
    fi
    changed=1
    echo "[install] sanitized malformed ssh_managed_nodes block in ${cfg}"
  fi
  rm -f -- "${tmp}"
  if grep -Eq '^[[:space:]]*forward_shutdown_config:[[:space:]]*/etc/ananke/hecate.yaml[[:space:]]*$' "${cfg}"; then
    sed -Ei 's#(^[[:space:]]*forward_shutdown_config:[[:space:]]*)/etc/ananke/hecate.yaml#\1/etc/ananke/ananke.yaml#' "${cfg}"
    changed=1
    echo "[install] migrated coordination.forward_shutdown_config to /etc/ananke/ananke.yaml"
  fi
  # Reset the file mode after any edit; a chmod failure is ignored.
  if [[ "${changed}" -eq 1 ]]; then
    chmod 0640 "${cfg}" || true
  fi
}