#!/usr/bin/env bash
#
# ananke-drills.sh - disruptive recovery drills for the ananke coordinator.
#
# Each drill injects a failure with kubectl (or on the coordinator over ssh),
# asks ananke to recover, and then verifies the affected workloads are ready.
# Without --execute every mutating step is only logged as "plan: ...".
#
# Environment overrides (defaults are in the assignments below):
#   KUBECTL, ANANKE_COORDINATOR_HOST, ANANKE_BIN, ANANKE_CONFIG,
#   ANANKE_COORDINATOR_RELAY, ANANKE_DRILL_LOG_DIR,
#   ANANKE_DRILL_STARTUP_TIMEOUT_SECONDS, ANANKE_DRILL_SHUTDOWN_TIMEOUT_SECONDS,
#   ANANKE_DRILL_SHUTDOWN_CONFIG, ANANKE_DRILL_STARTUP_RETRY_DELAY_SECONDS,
#   ANANKE_DRILL_STARTUP_RETRY_MAX
set -Eeuo pipefail

KUBECTL="${KUBECTL:-kubectl}"
ANANKE_COORDINATOR_HOST="${ANANKE_COORDINATOR_HOST:-titan-db}"
ANANKE_BIN="${ANANKE_BIN:-/usr/local/bin/ananke}"
ANANKE_CONFIG="${ANANKE_CONFIG:-/etc/ananke/ananke.yaml}"
ANANKE_COORDINATOR_RELAY="${ANANKE_COORDINATOR_RELAY:-}"
LOG_DIR="${ANANKE_DRILL_LOG_DIR:-/tmp/ananke-drills}"
STARTUP_TIMEOUT_SECONDS="${ANANKE_DRILL_STARTUP_TIMEOUT_SECONDS:-1800}"
SHUTDOWN_TIMEOUT_SECONDS="${ANANKE_DRILL_SHUTDOWN_TIMEOUT_SECONDS:-1800}"
SHUTDOWN_CONFIG="${ANANKE_DRILL_SHUTDOWN_CONFIG:-/tmp/ananke-drill-cluster-only.yaml}"
STARTUP_RETRY_DELAY_SECONDS="${ANANKE_DRILL_STARTUP_RETRY_DELAY_SECONDS:-10}"
STARTUP_RETRY_MAX="${ANANKE_DRILL_STARTUP_RETRY_MAX:-12}"
EXECUTE=0

# Print the command-line help text to stdout.
usage() {
  cat <<'EOF'
Usage:
  scripts/ananke-drills.sh list
  scripts/ananke-drills.sh run <drill> [--execute]

Drills:
  flux-gitea-deadlock      Simulate flux-controller + gitea outage and require startup recovery.
  foundation-recovery      Simulate vault/postgres/gitea outage and require layered restore.
  reconciliation-resume    Simulate global Flux suspend + source-controller down and require resume.
  startup-intent-guard     Assert startup is blocked when shutdown intent is active.
  controlled-cycle         Run full shutdown->startup recovery cycle (uses cluster-only shutdown config).

Notes:
  - Drills are intentionally disruptive and are not part of regular `make test`.
  - Use --execute to run live changes. Without it, this script prints planned actions only.
  - Optional relay: set ANANKE_COORDINATOR_RELAY="ssh titan-db" to run coordinator commands via a jump host.
EOF
}

# Write one "[drill]"-prefixed message to stdout.
log() {
  printf '[drill] %s\n' "$*"
}

# Write an error to stderr and exit 1. `exit` does not fire the ERR trap;
# rollback for this path is handled by the EXIT trap installed in main().
die() {
  printf '[drill] ERROR: %s\n' "$*" >&2
  exit 1
}

# Abort unless command $1 is resolvable.
need_cmd() {
  command -v "$1" >/dev/null 2>&1 || die "missing required command: $1"
}

# Print a compact UTC timestamp (used in log file names).
now_ts() {
  date -u +%Y%m%dT%H%M%SZ
}

# Print the SNAPSHOT_REPLICAS key for namespace $1, kind $2, name $3.
resource_key() {
  local ns="$1" kind="$2" name="$3"
  printf '%s|%s|%s' "$ns" "$kind" "$name"
}

# Print .spec.replicas of ns/kind/name; returns non-zero when the lookup fails.
read_replicas() {
  local ns="$1" kind="$2" name="$3"
  "${KUBECTL}" -n "$ns" get "$kind" "$name" -o jsonpath='{.spec.replicas}' 2>/dev/null
}

# Print .spec.replicas of ns/kind/name, or "0" when the lookup fails.
get_replicas() {
  read_replicas "$@" || echo "0"
}

# Scale ns/kind/name ($1 $2 $3) to $4 replicas (logged only in plan mode).
scale_to() {
  local ns="$1" kind="$2" name="$3" replicas="$4"
  if [[ "${EXECUTE}" -eq 0 ]]; then
    log "plan: kubectl -n ${ns} scale ${kind} ${name} --replicas=${replicas}"
    return 0
  fi
  "${KUBECTL}" -n "$ns" scale "$kind" "$name" --replicas="${replicas}" >/dev/null
}

# Block until `kubectl rollout status` succeeds for ns/kind/name ($1 $2 $3)
# or the kubectl timeout $4 (e.g. 240s) expires.
wait_ready() {
  local ns="$1" kind="$2" name="$3" timeout="$4"
  if [[ "${EXECUTE}" -eq 0 ]]; then
    log "plan: kubectl -n ${ns} rollout status ${kind}/${name} --timeout=${timeout}"
    return 0
  fi
  "${KUBECTL}" -n "$ns" rollout status "${kind}/${name}" --timeout="${timeout}" >/dev/null
}

# Wait for keycloak in the sso namespace, whichever workload kind exists
# (deployment is checked first, then statefulset). $1 is the kubectl timeout.
wait_ready_keycloak() {
  local timeout="$1"
  if [[ "${EXECUTE}" -eq 0 ]]; then
    log "plan: wait for sso keycloak rollout (${timeout}) [deployment preferred, fallback to statefulset]"
    return 0
  fi
  if "${KUBECTL}" -n sso get deployment keycloak >/dev/null 2>&1; then
    wait_ready sso deployment keycloak "${timeout}"
  elif "${KUBECTL}" -n sso get statefulset keycloak >/dev/null 2>&1; then
    wait_ready sso statefulset keycloak "${timeout}"
  else
    die "keycloak workload not found in sso namespace (expected deployment/keycloak or statefulset/keycloak)"
  fi
}

# Run a command on the coordinator over ssh, bounded by `timeout`.
# Arguments: $1 - limit in seconds; remaining args - the remote command words.
# ssh joins the words with spaces for the remote shell, so words must not
# contain whitespace or shell metacharacters (all callers pass fixed strings).
run_coordinator_cmd() {
  local limit="$1"
  shift
  if [[ "${EXECUTE}" -eq 0 ]]; then
    if [[ -n "${ANANKE_COORDINATOR_RELAY}" ]]; then
      log "plan: ssh ${ANANKE_COORDINATOR_HOST} ${ANANKE_COORDINATOR_RELAY} '$*'"
    else
      log "plan: ssh ${ANANKE_COORDINATOR_HOST} '$*'"
    fi
    return 0
  fi
  if [[ -n "${ANANKE_COORDINATOR_RELAY}" ]]; then
    # The relay is deliberately word-split: it holds a command such as "ssh titan-db".
    # shellcheck disable=SC2086
    timeout "${limit}" ssh "${ANANKE_COORDINATOR_HOST}" ${ANANKE_COORDINATOR_RELAY} "$@"
  else
    timeout "${limit}" ssh "${ANANKE_COORDINATOR_HOST}" "$@"
  fi
}

# Run `ananke startup` on the coordinator with reason $1.
run_ananke_startup() {
  local reason="$1"
  run_coordinator_cmd "${STARTUP_TIMEOUT_SECONDS}" \
    sudo "${ANANKE_BIN}" startup --config "${ANANKE_CONFIG}" --execute \
    --force-flux-branch main --reason "${reason}"
}

# Run `ananke shutdown` on the coordinator with the drill shutdown config and reason $1.
run_ananke_shutdown() {
  local reason="$1"
  run_coordinator_cmd "${SHUTDOWN_TIMEOUT_SECONDS}" \
    sudo "${ANANKE_BIN}" shutdown --config "${SHUTDOWN_CONFIG}" --execute \
    --reason "${reason}"
}

# Run `ananke startup` (reason $1) up to STARTUP_RETRY_MAX times, sleeping
# STARTUP_RETRY_DELAY_SECONDS between attempts; dies when every attempt fails.
run_ananke_startup_with_retry() {
  local reason="$1"
  local startup_cmd="sudo ${ANANKE_BIN} startup --config ${ANANKE_CONFIG} --execute --force-flux-branch main --reason ${reason}"
  if [[ "${EXECUTE}" -eq 0 ]]; then
    log "plan: startup retry loop with max=${STARTUP_RETRY_MAX} delay=${STARTUP_RETRY_DELAY_SECONDS}s"
    return 0
  fi
  local attempt
  for ((attempt = 1; attempt <= STARTUP_RETRY_MAX; attempt++)); do
    log "startup attempt ${attempt}/${STARTUP_RETRY_MAX}"
    if run_coordinator_bash "${startup_cmd}"; then
      return 0
    fi
    if [[ "${attempt}" -lt "${STARTUP_RETRY_MAX}" ]]; then
      log "startup attempt ${attempt} failed; retrying in ${STARTUP_RETRY_DELAY_SECONDS}s"
      sleep "${STARTUP_RETRY_DELAY_SECONDS}"
    fi
  done
  die "startup failed after ${STARTUP_RETRY_MAX} attempts"
}

# Feed script text $1 to `bash -se` on the coordinator (through the relay when set).
# Returns the ssh exit status.
run_coordinator_bash() {
  local script="$1"
  if [[ -n "${ANANKE_COORDINATOR_RELAY}" ]]; then
    # shellcheck disable=SC2086
    printf '%s\n' "${script}" | ssh "${ANANKE_COORDINATOR_HOST}" ${ANANKE_COORDINATOR_RELAY} "bash -se"
  else
    printf '%s\n' "${script}" | ssh "${ANANKE_COORDINATOR_HOST}" "bash -se"
  fi
}

# ---- rollback state ---------------------------------------------------------
declare -A SNAPSHOT_REPLICAS=()   # resource_key -> replica count before the drill
SUSPENDED_KS_BEFORE=""            # newline-separated ns/name of Kustomizations suspended before the drill
SUSPENDED_HR_BEFORE=""            # same for HelmReleases
CURRENT_DRILL=""
CURRENT_RESOURCES=()              # "ns|kind|name|..." entries to restore on failure
ROLLBACK_FLUX_SUSPEND=0           # 1 while Flux suspend flags must be restored on failure
ROLLBACK_COORDINATOR_SCRIPT=""    # script to run on the coordinator on failure ("" = none)
ROLLBACK_DONE=0                   # guards against running the rollback twice

# Undo whatever the current drill registered. Runs at most once; every step is
# best effort so that one failing restore does not prevent the others.
# Arguments: $1 - exit status that triggered the rollback (for the log line).
run_rollback() {
  local code="$1"
  if [[ "${ROLLBACK_DONE}" -eq 1 ]]; then
    return 0
  fi
  ROLLBACK_DONE=1
  log "failure detected in drill '${CURRENT_DRILL}' (exit=${code}); starting rollback"
  if [[ -n "${ROLLBACK_COORDINATOR_SCRIPT}" ]]; then
    log "rollback coordinator state on ${ANANKE_COORDINATOR_HOST}"
    run_coordinator_bash "${ROLLBACK_COORDINATOR_SCRIPT}" \
      || log "rollback of coordinator state failed; inspect ${ANANKE_COORDINATOR_HOST} manually"
  fi
  if [[ "${ROLLBACK_FLUX_SUSPEND}" -eq 1 ]]; then
    restore_flux_suspended_before || true
  fi
  if [[ ${#CURRENT_RESOURCES[@]} -gt 0 ]]; then
    restore_resources "${CURRENT_RESOURCES[@]}" || true
  fi
}

# ERR trap handler: roll back and exit with the failing command's status.
on_err() {
  local code=$?
  # `set -E` makes command/process substitutions inherit this trap. A subshell
  # must only propagate its status; the rollback belongs to the main shell.
  if [[ "${BASHPID}" != "$$" ]]; then
    exit "${code}"
  fi
  trap - ERR
  set +e
  run_rollback "${code}"
  exit "${code}"
}

# EXIT trap handler: covers the paths the ERR trap never sees (die/exit and
# the INT/TERM traps) so an aborted drill is still rolled back.
on_exit() {
  local code=$?
  trap - ERR EXIT INT TERM
  set +e
  if [[ "${code}" -ne 0 && -n "${CURRENT_DRILL}" ]]; then
    run_rollback "${code}"
  fi
  exit "${code}"
}

# Record the current replica count of every "ns|kind|name|..." argument in
# SNAPSHOT_REPLICAS. In execute mode a failed lookup aborts the drill: the
# original recorded "0" there, which made a later rollback scale the workload
# to zero instead of restoring it.
snapshot_resources() {
  local resources=("$@")
  local res ns kind name replicas
  SNAPSHOT_REPLICAS=()
  for res in "${resources[@]}"; do
    IFS='|' read -r ns kind name _ <<<"${res}"
    if ! replicas="$(read_replicas "$ns" "$kind" "$name")"; then
      if [[ "${EXECUTE}" -eq 1 ]]; then
        die "cannot snapshot replicas for ${ns}/${kind}/${name}; refusing to inject an outage without a rollback baseline"
      fi
      replicas="0"
    fi
    SNAPSHOT_REPLICAS["$(resource_key "$ns" "$kind" "$name")"]="${replicas}"
  done
}

# Scale every "ns|kind|name|..." argument back to its snapshot value
# (1 when the snapshot is missing or empty). Best effort per resource.
restore_resources() {
  local resources=("$@")
  local res ns kind name key replicas
  for res in "${resources[@]}"; do
    IFS='|' read -r ns kind name _ <<<"${res}"
    key="$(resource_key "$ns" "$kind" "$name")"
    replicas="${SNAPSHOT_REPLICAS[${key}]:-1}"
    log "rollback replicas: ${ns}/${kind}/${name} -> ${replicas}"
    scale_to "$ns" "$kind" "$name" "$replicas" || true
  done
}

# Print "namespace/name" for every suspended object of Flux resource type $1
# across all namespaces. Returns kubectl's exit status.
list_flux_suspended() {
  local resource="$1"
  "${KUBECTL}" get "${resource}" -A \
    -o jsonpath='{range .items[?(@.spec.suspend==true)]}{.metadata.namespace}/{.metadata.name}{"\n"}{end}'
}

# Remember which Kustomizations/HelmReleases are suspended before the drill.
# In execute mode a failed query aborts: with an empty baseline the rollback
# would resume objects that had been suspended on purpose.
record_flux_suspended_before() {
  local ks="" hr=""
  if ! ks="$(list_flux_suspended kustomizations.kustomize.toolkit.fluxcd.io)" \
    || ! hr="$(list_flux_suspended helmreleases.helm.toolkit.fluxcd.io)"; then
    if [[ "${EXECUTE}" -eq 1 ]]; then
      die "cannot record pre-drill Flux suspend state; refusing to continue"
    fi
    log "plan: could not read current Flux suspend state (ignored in plan mode)"
  fi
  SUSPENDED_KS_BEFORE="${ks}"
  SUSPENDED_HR_BEFORE="${hr}"
}

# Patch spec.suspend=$1 (true|false) on Flux objects; individual patch
# failures are ignored.
# NOTE(review): Kustomizations are listed in flux-system only, while
# HelmReleases (and the record/verify helpers) use all namespaces - confirm
# that every Kustomization lives in flux-system.
set_flux_suspend_all() {
  local value="$1"
  if [[ "${EXECUTE}" -eq 0 ]]; then
    log "plan: patch all Flux kustomizations + helmreleases suspend=${value}"
    return 0
  fi
  local patch ks hr ns name
  patch="$(printf '{"spec":{"suspend":%s}}' "${value}")"
  while IFS= read -r ks; do
    [[ -z "${ks}" ]] && continue
    "${KUBECTL}" -n flux-system patch kustomization "${ks}" --type=merge -p "${patch}" >/dev/null || true
  done < <("${KUBECTL}" -n flux-system get kustomizations.kustomize.toolkit.fluxcd.io -o jsonpath='{range .items[*]}{.metadata.name}{"\n"}{end}')
  while IFS= read -r hr; do
    [[ -z "${hr}" ]] && continue
    ns="${hr%%/*}"
    name="${hr##*/}"
    "${KUBECTL}" -n "${ns}" patch helmrelease "${name}" --type=merge -p "${patch}" >/dev/null || true
  done < <("${KUBECTL}" get helmreleases.helm.toolkit.fluxcd.io -A -o jsonpath='{range .items[*]}{.metadata.namespace}{"/"}{.metadata.name}{"\n"}{end}')
}

# Resume everything, then re-suspend the objects recorded by
# record_flux_suspended_before. Individual patch failures are ignored.
restore_flux_suspended_before() {
  set_flux_suspend_all false
  if [[ "${EXECUTE}" -eq 0 ]]; then
    return 0
  fi
  local patch='{"spec":{"suspend":true}}'
  local ref ns name
  while IFS= read -r ref; do
    [[ -z "${ref}" ]] && continue
    ns="${ref%%/*}"
    name="${ref##*/}"
    "${KUBECTL}" -n "${ns}" patch kustomization "${name}" --type=merge -p "${patch}" >/dev/null || true
  done <<<"${SUSPENDED_KS_BEFORE}"
  while IFS= read -r ref; do
    [[ -z "${ref}" ]] && continue
    ns="${ref%%/*}"
    name="${ref##*/}"
    "${KUBECTL}" -n "${ns}" patch helmrelease "${name}" --type=merge -p "${patch}" >/dev/null || true
  done <<<"${SUSPENDED_HR_BEFORE}"
}

# stdin -> stdout: drop empty lines and sort, so two lists compare as sets.
normalize_lines() {
  sed '/^$/d' | sort
}

# Die unless the set of suspended Flux objects equals the pre-drill set.
# A failed query is an error too: treating it as an empty list would report a
# false pass whenever nothing was suspended before the drill.
verify_flux_suspend_state_restored() {
  if [[ "${EXECUTE}" -eq 0 ]]; then
    log "plan: verify Flux suspended objects match pre-drill state"
    return 0
  fi
  local current_ks current_hr
  current_ks="$(list_flux_suspended kustomizations.kustomize.toolkit.fluxcd.io)" \
    || die "cannot read kustomization suspend state for verification"
  current_hr="$(list_flux_suspended helmreleases.helm.toolkit.fluxcd.io)" \
    || die "cannot read helmrelease suspend state for verification"
  local expected_ks expected_hr got_ks got_hr
  expected_ks="$(printf '%s\n' "${SUSPENDED_KS_BEFORE}" | normalize_lines)"
  expected_hr="$(printf '%s\n' "${SUSPENDED_HR_BEFORE}" | normalize_lines)"
  got_ks="$(printf '%s\n' "${current_ks}" | normalize_lines)"
  got_hr="$(printf '%s\n' "${current_hr}" | normalize_lines)"
  [[ "${got_ks}" == "${expected_ks}" ]] || die "kustomization suspend-state drift detected"
  [[ "${got_hr}" == "${expected_hr}" ]] || die "helmrelease suspend-state drift detected"
}

# Tee all further stdout/stderr into LOG_DIR/<drill>-<timestamp>.log and log
# the run parameters. $1 is the (already validated) drill name.
write_log_header() {
  local drill="$1"
  local stamp logfile
  mkdir -p "${LOG_DIR}"
  stamp="$(now_ts)"
  logfile="${LOG_DIR}/${drill}-${stamp}.log"
  exec > >(tee -a "${logfile}") 2>&1
  log "drill=${drill} execute=${EXECUTE} coordinator=${ANANKE_COORDINATOR_HOST}"
}

# Scale every "ns|kind|name|..." argument to zero replicas.
scale_down_resources() {
  local res ns kind name
  for res in "$@"; do
    IFS='|' read -r ns kind name _ <<<"${res}"
    scale_to "$ns" "$kind" "$name" 0
  done
}

# Drill: stop the Flux controllers and gitea, run startup, expect all back.
run_drill_flux_gitea_deadlock() {
  CURRENT_RESOURCES=(
    "flux-system|deployment|source-controller|1"
    "flux-system|deployment|kustomize-controller|1"
    "flux-system|deployment|helm-controller|1"
    "flux-system|deployment|notification-controller|1"
    "gitea|deployment|gitea|1"
  )
  snapshot_resources "${CURRENT_RESOURCES[@]}"
  ROLLBACK_FLUX_SUSPEND=0

  log "injecting outage: flux controllers + gitea"
  scale_down_resources "${CURRENT_RESOURCES[@]}"

  run_ananke_startup "drill-flux-gitea-deadlock"

  log "verifying recovery"
  wait_ready flux-system deployment source-controller 240s
  wait_ready flux-system deployment kustomize-controller 240s
  wait_ready flux-system deployment helm-controller 240s
  wait_ready flux-system deployment notification-controller 240s
  wait_ready gitea deployment gitea 300s
  log "pass: flux-gitea-deadlock"
  CURRENT_RESOURCES=()
}

# Drill: stop vault, postgres and gitea, run startup, expect all back.
run_drill_foundation_recovery() {
  CURRENT_RESOURCES=(
    "vault|statefulset|vault|1"
    "postgres|statefulset|postgres|1"
    "gitea|deployment|gitea|1"
  )
  snapshot_resources "${CURRENT_RESOURCES[@]}"
  ROLLBACK_FLUX_SUSPEND=0

  log "injecting outage: vault + postgres + gitea"
  scale_down_resources "${CURRENT_RESOURCES[@]}"

  run_ananke_startup "drill-foundation-recovery"

  log "verifying layered recovery"
  wait_ready vault statefulset vault 420s
  wait_ready postgres statefulset postgres 420s
  wait_ready gitea deployment gitea 300s
  log "pass: foundation-recovery"
  CURRENT_RESOURCES=()
}

# Drill: suspend Flux objects and stop source-controller, run startup, expect
# source-controller ready and the suspend state equal to the pre-drill state.
run_drill_reconciliation_resume() {
  CURRENT_RESOURCES=("flux-system|deployment|source-controller|1")
  snapshot_resources "${CURRENT_RESOURCES[@]}"
  record_flux_suspended_before
  ROLLBACK_FLUX_SUSPEND=1

  log "injecting outage: suspend all Flux objects + stop source-controller"
  set_flux_suspend_all true
  scale_to flux-system deployment source-controller 0

  run_ananke_startup "drill-reconciliation-resume"

  log "verifying reconciliation resumed"
  wait_ready flux-system deployment source-controller 240s
  verify_flux_suspend_state_restored
  log "pass: reconciliation-resume"
  CURRENT_RESOURCES=()
  ROLLBACK_FLUX_SUSPEND=0
}

# Drill: write a "shutting_down" intent file on the coordinator and require
# `ananke startup` to fail while it is present; the previous intent file is
# restored afterwards.
run_drill_startup_intent_guard() {
  local intent_path="/var/lib/ananke/intent.json"
  local backup_path="/tmp/ananke-intent-pre-drill.json"
  # $(date ...) is expanded locally when the script text is built.
  local inject_cmd="
if [ -f '${intent_path}' ]; then
  sudo cp '${intent_path}' '${backup_path}'
else
  sudo rm -f '${backup_path}'
fi
cat <<'JSON' | sudo tee '${intent_path}' >/dev/null
{\"state\":\"shutting_down\",\"reason\":\"drill-intent-guard\",\"source\":\"drill\",\"updated_at\":\"$(date -u +%Y-%m-%dT%H:%M:%SZ)\"}
JSON
"
  local restore_cmd="
if [ -f '${backup_path}' ]; then
  sudo mv '${backup_path}' '${intent_path}'
else
  sudo rm -f '${intent_path}'
fi
"
  local startup_cmd="sudo ${ANANKE_BIN} startup --config ${ANANKE_CONFIG} --execute --force-flux-branch main --reason drill-startup-intent-guard"

  if [[ "${EXECUTE}" -eq 0 ]]; then
    log "plan: ssh ${ANANKE_COORDINATOR_HOST} '<inject shutdown intent>'"
    log "plan: ssh ${ANANKE_COORDINATOR_HOST} '${startup_cmd}' (expect failure)"
    log "plan: ssh ${ANANKE_COORDINATOR_HOST} '<restore previous intent>'"
    log "pass: startup-intent-guard (plan mode)"
    return 0
  fi

  run_coordinator_bash "${inject_cmd}"
  # Register the restore only after a successful inject: restore_cmd deletes
  # the intent file when no backup exists, which would be wrong if the inject
  # had failed before taking the backup.
  ROLLBACK_COORDINATOR_SCRIPT="${restore_cmd}"

  local rc=0
  run_coordinator_bash "${startup_cmd}" || rc=$?

  if [[ "${rc}" -eq 255 ]]; then
    # 255 is what ssh returns for its own (connection) errors, so the guard was
    # never exercised. NOTE(review): assumes ananke itself never exits 255.
    die "startup-intent-guard inconclusive: ssh to coordinator failed (exit=255)"
  fi

  # From here on the restore is run explicitly; unregister it first because
  # restore_cmd is not safe to run twice (a second run removes the intent file).
  ROLLBACK_COORDINATOR_SCRIPT=""
  if [[ "${rc}" -eq 0 ]]; then
    run_coordinator_bash "${restore_cmd}" || true
    die "startup-intent-guard failed: startup unexpectedly succeeded while shutdown intent was active"
  fi
  run_coordinator_bash "${restore_cmd}"
  log "pass: startup-intent-guard"
}

# Drill: full shutdown (cluster-only config) followed by startup with retries,
# then verify the critical workloads are ready.
run_drill_controlled_cycle() {
  CURRENT_RESOURCES=()
  ROLLBACK_FLUX_SUSPEND=0

  if [[ "${EXECUTE}" -eq 0 ]]; then
    log "plan: verify shutdown drill config exists on coordinator (${SHUTDOWN_CONFIG})"
  else
    run_coordinator_bash "[ -s '${SHUTDOWN_CONFIG}' ]" \
      || die "shutdown drill config missing on coordinator: ${SHUTDOWN_CONFIG}"
  fi

  log "running controlled shutdown cycle (cluster-only shutdown config)"
  run_ananke_shutdown "drill-controlled-cycle-shutdown"

  log "running startup recovery cycle"
  run_ananke_startup_with_retry "drill-controlled-cycle-startup"

  log "verifying critical stack readiness after cycle"
  wait_ready flux-system deployment source-controller 240s
  wait_ready flux-system deployment kustomize-controller 240s
  wait_ready flux-system deployment helm-controller 240s
  wait_ready flux-system deployment notification-controller 240s
  wait_ready vault statefulset vault 420s
  wait_ready postgres statefulset postgres 420s
  wait_ready gitea deployment gitea 300s
  wait_ready_keycloak 420s
  wait_ready maintenance deployment metis 300s
  log "pass: controlled-cycle"
}

# Entry point: `list` prints usage (exit 0); `run <drill> [--execute]` runs one
# drill; anything else prints usage and exits 2.
main() {
  local cmd="${1:-}"
  local drill=""
  case "${cmd}" in
    list)
      usage
      exit 0
      ;;
    run)
      shift
      drill="${1:-}"
      [[ -n "${drill}" ]] || die "missing drill name"
      shift
      while [[ $# -gt 0 ]]; do
        case "$1" in
          --execute) EXECUTE=1 ;;
          *) die "unknown option: $1" ;;
        esac
        shift
      done
      ;;
    *)
      usage
      exit 2
      ;;
  esac

  # Resolve the drill before anything touches the filesystem: the name becomes
  # part of the log file path, so it must be one of the known identifiers.
  local runner
  case "${drill}" in
    flux-gitea-deadlock) runner=run_drill_flux_gitea_deadlock ;;
    foundation-recovery) runner=run_drill_foundation_recovery ;;
    reconciliation-resume) runner=run_drill_reconciliation_resume ;;
    startup-intent-guard) runner=run_drill_startup_intent_guard ;;
    controlled-cycle) runner=run_drill_controlled_cycle ;;
    *) die "unknown drill: ${drill}" ;;
  esac

  # Tools are only required when a drill actually runs, not for `list`/usage.
  need_cmd "${KUBECTL}"
  need_cmd ssh
  need_cmd timeout

  trap on_err ERR
  trap on_exit EXIT
  # Turn signals into a normal exit so the EXIT trap can roll back.
  trap 'exit 130' INT
  trap 'exit 143' TERM

  write_log_header "${drill}"
  CURRENT_DRILL="${drill}"
  "${runner}"
}

main "$@"