#!/usr/bin/env bash
#
# hecate-drills.sh - disruptive recovery drills around `hecate startup`.
#
# Each drill injects an outage with kubectl (scaling workloads to zero and/or
# suspending Flux objects), runs `hecate startup` on the coordinator host over
# ssh, and then waits for the affected workloads to become ready again.
#
# Without --execute every mutating step is only logged as a "plan:" line.
#
# Environment overrides:
#   KUBECTL                               kubectl binary            (default: kubectl)
#   HECATE_COORDINATOR_HOST               ssh target                (default: titan-db)
#   HECATE_BIN                            hecate binary on the host (default: /usr/local/bin/hecate)
#   HECATE_CONFIG                         hecate config on the host (default: /etc/hecate/hecate.yaml)
#   HECATE_DRILL_LOG_DIR                  local log directory       (default: /tmp/hecate-drills)
#   HECATE_DRILL_STARTUP_TIMEOUT_SECONDS  limit for hecate startup  (default: 1800)

set -euo pipefail

KUBECTL="${KUBECTL:-kubectl}"
HECATE_COORDINATOR_HOST="${HECATE_COORDINATOR_HOST:-titan-db}"
HECATE_BIN="${HECATE_BIN:-/usr/local/bin/hecate}"
HECATE_CONFIG="${HECATE_CONFIG:-/etc/hecate/hecate.yaml}"
LOG_DIR="${HECATE_DRILL_LOG_DIR:-/tmp/hecate-drills}"
STARTUP_TIMEOUT_SECONDS="${HECATE_DRILL_STARTUP_TIMEOUT_SECONDS:-1800}"
EXECUTE=0

# Pre-drill state captured for rollback.
declare -A SNAPSHOT_REPLICAS=()
SUSPENDED_KS_BEFORE=""
SUSPENDED_HR_BEFORE=""

# Rollback bookkeeping consumed by on_exit.
declare -a ROLLBACK_RESOURCES=()
ROLLBACK_FLUX=0
ROLLBACK_ARMED=0

# Print the command-line help to stdout.
usage() {
  cat <<'EOF'
Usage:
  scripts/hecate-drills.sh list
  scripts/hecate-drills.sh run <drill> [--execute]

Drills:
  flux-gitea-deadlock     Simulate flux-controller + gitea outage and require startup recovery.
  foundation-recovery     Simulate vault/postgres/gitea outage and require layered restore.
  reconciliation-resume   Simulate global Flux suspend + source-controller down and require resume.

Notes:
  - Drills are intentionally disruptive and are not part of regular `make test`.
  - Use --execute to run live changes. Without it, this script prints planned actions only.
EOF
}

# Print a prefixed progress message to stdout.
log() {
  printf '[drill] %s\n' "$*"
}

# Print a prefixed error to stderr and exit with status 1.
die() {
  printf '[drill] ERROR: %s\n' "$*" >&2
  exit 1
}

# Abort unless the command named by $1 can be resolved.
need_cmd() {
  command -v "$1" >/dev/null 2>&1 || die "missing required command: $1"
}

# Print the current UTC time as a compact timestamp (used in log file names).
now_ts() {
  date -u +%Y%m%dT%H%M%SZ
}

# Print the SNAPSHOT_REPLICAS key for a resource.
# Arguments: $1 namespace, $2 kind, $3 name
resource_key() {
  local ns="$1" kind="$2" name="$3"
  printf '%s|%s|%s' "$ns" "$kind" "$name"
}

# Print .spec.replicas of a resource, or nothing when the lookup fails.
# Always returns 0; callers decide what an empty answer means.
# Arguments: $1 namespace, $2 kind, $3 name
get_replicas() {
  local ns="$1" kind="$2" name="$3"
  # Best-effort read: a failed lookup must not abort the drill, and it must
  # not be reported as "0" either, or rollback would scale the workload to 0.
  "${KUBECTL}" -n "$ns" get "$kind" "$name" \
    -o jsonpath='{.spec.replicas}' 2>/dev/null || true
}

# Scale a resource to the given replica count (logged only in plan mode).
# Arguments: $1 namespace, $2 kind, $3 name, $4 replicas
scale_to() {
  local ns="$1" kind="$2" name="$3" replicas="$4"
  if [[ "${EXECUTE}" -eq 0 ]]; then
    log "plan: kubectl -n ${ns} scale ${kind} ${name} --replicas=${replicas}"
    return 0
  fi
  "${KUBECTL}" -n "$ns" scale "$kind" "$name" --replicas="${replicas}" >/dev/null
}

# Block until a rollout completes or the timeout expires (logged only in plan mode).
# Arguments: $1 namespace, $2 kind, $3 name, $4 timeout (kubectl duration, e.g. 240s)
wait_ready() {
  local ns="$1" kind="$2" name="$3" timeout="$4"
  if [[ "${EXECUTE}" -eq 0 ]]; then
    log "plan: kubectl -n ${ns} rollout status ${kind}/${name} --timeout=${timeout}"
    return 0
  fi
  "${KUBECTL}" -n "$ns" rollout status "${kind}/${name}" --timeout="${timeout}" >/dev/null
}

# Run `hecate startup` on the coordinator host over ssh, bounded by
# STARTUP_TIMEOUT_SECONDS (logged only in plan mode).
# Arguments: $1 free-form reason, recorded in the drill log
run_hecate_startup() {
  local reason="$1"
  local cmd=(sudo "${HECATE_BIN}" startup --config "${HECATE_CONFIG}" --execute --force-flux-branch main)
  log "hecate startup reason: ${reason}"
  if [[ "${EXECUTE}" -eq 0 ]]; then
    log "plan: ssh ${HECATE_COORDINATOR_HOST} '${cmd[*]}'"
    return 0
  fi
  # ssh joins the remaining arguments with spaces for the remote shell, so
  # HECATE_BIN / HECATE_CONFIG must not contain whitespace or shell syntax.
  timeout "${STARTUP_TIMEOUT_SECONDS}" ssh "${HECATE_COORDINATOR_HOST}" "${cmd[@]}"
}

# Record the current replica count of every resource in SNAPSHOT_REPLICAS.
# Arguments: resource tuples "namespace|kind|name|default_replicas"
# When the live value cannot be read, the tuple's declared default is stored
# (or 1 if the tuple has none) so that a later rollback brings the workload up.
snapshot_resources() {
  local res ns kind name declared replicas
  SNAPSHOT_REPLICAS=()
  for res in "$@"; do
    IFS='|' read -r ns kind name declared _ <<<"${res}"
    replicas="$(get_replicas "$ns" "$kind" "$name")"
    if [[ ! "${replicas}" =~ ^[0-9]+$ ]]; then
      replicas="${declared:-1}"
    fi
    SNAPSHOT_REPLICAS["$(resource_key "$ns" "$kind" "$name")"]="${replicas}"
  done
}

# Scale every resource back to its snapshotted replica count (1 if unknown).
# Arguments: resource tuples "namespace|kind|name|default_replicas"
# Individual scale failures are ignored so the remaining resources are still restored.
restore_resources() {
  local res ns kind name key replicas
  for res in "$@"; do
    IFS='|' read -r ns kind name _ <<<"${res}"
    key="$(resource_key "$ns" "$kind" "$name")"
    replicas="${SNAPSHOT_REPLICAS[${key}]:-1}"
    log "rollback replicas: ${ns}/${kind}/${name} -> ${replicas}"
    scale_to "$ns" "$kind" "$name" "$replicas" || true
  done
}

# Remember which Flux kustomizations / helmreleases (all namespaces) are
# suspended right now, as newline-separated "namespace/name" references.
# Lookup failures are tolerated and leave the corresponding list empty.
record_flux_suspended_before() {
  SUSPENDED_KS_BEFORE="$("${KUBECTL}" get kustomizations.kustomize.toolkit.fluxcd.io -A -o jsonpath='{range .items[?(@.spec.suspend==true)]}{.metadata.namespace}/{.metadata.name}{"\n"}{end}' || true)"
  SUSPENDED_HR_BEFORE="$("${KUBECTL}" get helmreleases.helm.toolkit.fluxcd.io -A -o jsonpath='{range .items[?(@.spec.suspend==true)]}{.metadata.namespace}/{.metadata.name}{"\n"}{end}' || true)"
}

# Patch spec.suspend on Flux objects (logged only in plan mode).
# Arguments: $1 "true" or "false"
# Individual patch failures are ignored (best effort).
set_flux_suspend_all() {
  local value="$1"
  if [[ "${EXECUTE}" -eq 0 ]]; then
    log "plan: patch all Flux kustomizations + helmreleases suspend=${value}"
    return 0
  fi

  local patch ks hr ns name
  patch="$(printf '{"spec":{"suspend":%s}}' "${value}")"

  # NOTE(review): kustomizations are enumerated in flux-system only, while
  # helmreleases (and the record/verify helpers) cover all namespaces.
  # Confirm whether that asymmetry is intended.
  while read -r ks; do
    [[ -z "${ks}" ]] && continue
    "${KUBECTL}" -n flux-system patch kustomization "${ks}" --type=merge -p "${patch}" >/dev/null || true
  done < <("${KUBECTL}" -n flux-system get kustomizations.kustomize.toolkit.fluxcd.io -o jsonpath='{range .items[*]}{.metadata.name}{"\n"}{end}')

  while read -r hr; do
    [[ -z "${hr}" ]] && continue
    ns="${hr%%/*}"
    name="${hr##*/}"
    "${KUBECTL}" -n "${ns}" patch helmrelease "${name}" --type=merge -p "${patch}" >/dev/null || true
  done < <("${KUBECTL}" get helmreleases.helm.toolkit.fluxcd.io -A -o jsonpath='{range .items[*]}{.metadata.namespace}{"/"}{.metadata.name}{"\n"}{end}')
}

# Undo a drill-induced suspend: resume the Flux objects, then re-suspend the
# ones that record_flux_suspended_before saw suspended before the drill.
restore_flux_suspended_before() {
  set_flux_suspend_all false
  if [[ "${EXECUTE}" -eq 0 ]]; then
    return 0
  fi

  local patch='{"spec":{"suspend":true}}'
  local ref ns name

  while read -r ref; do
    [[ -z "${ref}" ]] && continue
    ns="${ref%%/*}"
    name="${ref##*/}"
    "${KUBECTL}" -n "${ns}" patch kustomization "${name}" --type=merge -p "${patch}" >/dev/null || true
  done <<<"${SUSPENDED_KS_BEFORE}"

  while read -r ref; do
    [[ -z "${ref}" ]] && continue
    ns="${ref%%/*}"
    name="${ref##*/}"
    "${KUBECTL}" -n "${ns}" patch helmrelease "${name}" --type=merge -p "${patch}" >/dev/null || true
  done <<<"${SUSPENDED_HR_BEFORE}"
}

# Print how many objects of the given Flux resource type are suspended
# across all namespaces. Fails if kubectl fails (pipefail).
# Arguments: $1 fully-qualified resource type
count_suspended() {
  local resource="$1"
  "${KUBECTL}" get "${resource}" -A \
    -o jsonpath='{range .items[?(@.spec.suspend==true)]}{.metadata.name}{"\n"}{end}' \
    | sed '/^$/d' | wc -l | tr -d ' '
}

# Abort unless no Flux kustomization or helmrelease is suspended
# (logged only in plan mode).
verify_flux_unsuspended() {
  if [[ "${EXECUTE}" -eq 0 ]]; then
    log "plan: verify no Flux kustomizations/helmreleases remain suspended"
    return 0
  fi
  local ks_count hr_count
  ks_count="$(count_suspended kustomizations.kustomize.toolkit.fluxcd.io)"
  hr_count="$(count_suspended helmreleases.helm.toolkit.fluxcd.io)"
  [[ "${ks_count}" == "0" ]] || die "kustomizations still suspended: ${ks_count}"
  [[ "${hr_count}" == "0" ]] || die "helmreleases still suspended: ${hr_count}"
}

# Create the log directory, tee all further stdout/stderr into a timestamped
# per-drill log file, and write the header line.
# Arguments: $1 drill name (already validated by the caller)
write_log_header() {
  local drill="$1" log_file
  mkdir -p "${LOG_DIR}"
  log_file="${LOG_DIR}/${drill}-$(now_ts).log"
  exec > >(tee -a "${log_file}") 2>&1
  log "drill=${drill} execute=${EXECUTE} coordinator=${HECATE_COORDINATOR_HOST}"
}

# Arm the exit-time rollback for the running drill.
# Arguments: $1 1 to also restore Flux suspend state, else 0
#            $2.. resource tuples to scale back
arm_rollback() {
  ROLLBACK_FLUX="$1"
  shift
  ROLLBACK_RESOURCES=("$@")
  ROLLBACK_ARMED=1
}

# Disarm the exit-time rollback (the drill passed).
disarm_rollback() {
  ROLLBACK_ARMED=0
}

# EXIT handler: roll back the injected outage if a drill is still armed, then
# exit with the original status.
#
# This runs on every shell exit, so it also covers errexit aborts raised
# inside helper functions and explicit `die` calls. An ERR trap would miss
# both: it is not inherited by functions unless errtrace is set, and it is
# not run for `exit`.
on_exit() {
  local rc=$?
  if [[ "${ROLLBACK_ARMED}" -eq 1 ]]; then
    ROLLBACK_ARMED=0
    set +e # rollback is best effort; never abort it half-way
    log "drill did not complete (exit ${rc}); rolling back"
    if [[ "${ROLLBACK_FLUX}" -eq 1 ]]; then
      restore_flux_suspended_before
    fi
    restore_resources "${ROLLBACK_RESOURCES[@]}"
  fi
  exit "${rc}"
}

# Scale every listed resource to zero replicas.
# Arguments: resource tuples "namespace|kind|name|default_replicas"
scale_down_all() {
  local res ns kind name
  for res in "$@"; do
    IFS='|' read -r ns kind name _ <<<"${res}"
    scale_to "$ns" "$kind" "$name" 0
  done
}

# Drill: stop the Flux controllers and gitea, run hecate startup, and require
# all of them to roll out again.
run_drill_flux_gitea_deadlock() {
  local resources=(
    "flux-system|deployment|source-controller|1"
    "flux-system|deployment|kustomize-controller|1"
    "flux-system|deployment|helm-controller|1"
    "flux-system|deployment|notification-controller|1"
    "gitea|deployment|gitea|1"
  )
  snapshot_resources "${resources[@]}"
  arm_rollback 0 "${resources[@]}"

  log "injecting outage: flux controllers + gitea"
  scale_down_all "${resources[@]}"

  run_hecate_startup "drill-flux-gitea-deadlock"

  log "verifying recovery"
  wait_ready flux-system deployment source-controller 240s
  wait_ready flux-system deployment kustomize-controller 240s
  wait_ready flux-system deployment helm-controller 240s
  wait_ready flux-system deployment notification-controller 240s
  wait_ready gitea deployment gitea 300s

  log "pass: flux-gitea-deadlock"
  disarm_rollback
}

# Drill: stop vault, postgres and gitea, run hecate startup, and require all
# three to roll out again.
run_drill_foundation_recovery() {
  local resources=(
    "vault|statefulset|vault|1"
    "postgres|statefulset|postgres|1"
    "gitea|deployment|gitea|1"
  )
  snapshot_resources "${resources[@]}"
  arm_rollback 0 "${resources[@]}"

  log "injecting outage: vault + postgres + gitea"
  scale_down_all "${resources[@]}"

  run_hecate_startup "drill-foundation-recovery"

  log "verifying layered recovery"
  wait_ready vault statefulset vault 420s
  wait_ready postgres statefulset postgres 420s
  wait_ready gitea deployment gitea 300s

  log "pass: foundation-recovery"
  disarm_rollback
}

# Drill: suspend Flux objects and stop source-controller, run hecate startup,
# and require source-controller to roll out with nothing left suspended.
run_drill_reconciliation_resume() {
  local resources=("flux-system|deployment|source-controller|1")
  snapshot_resources "${resources[@]}"
  record_flux_suspended_before
  arm_rollback 1 "${resources[@]}"

  log "injecting outage: suspend all Flux objects + stop source-controller"
  set_flux_suspend_all true
  scale_to flux-system deployment source-controller 0

  run_hecate_startup "drill-reconciliation-resume"

  log "verifying reconciliation resumed"
  wait_ready flux-system deployment source-controller 240s
  verify_flux_unsuspended

  log "pass: reconciliation-resume"
  disarm_rollback
}

# Entry point: parse the command line and dispatch.
#   list                      print usage and exit 0
#   run <drill> [--execute]   run one drill (--execute may precede the name)
# Anything else prints usage and exits 2.
main() {
  need_cmd "${KUBECTL}"
  need_cmd ssh
  need_cmd timeout

  local cmd="${1:-}"
  case "${cmd}" in
    list)
      usage
      exit 0
      ;;
    run)
      shift
      ;;
    *)
      usage
      exit 2
      ;;
  esac

  local drill=""
  while [[ $# -gt 0 ]]; do
    case "$1" in
      --execute)
        EXECUTE=1
        ;;
      -*)
        die "unknown option: $1"
        ;;
      *)
        # Only one positional argument (the drill name) is accepted.
        [[ -z "${drill}" ]] || die "unknown option: $1"
        drill="$1"
        ;;
    esac
    shift
  done
  [[ -n "${drill}" ]] || die "missing drill name"

  # Resolve the drill before touching the filesystem so that an unknown (or
  # path-like) name never ends up in a log file path.
  local runner
  case "${drill}" in
    flux-gitea-deadlock)
      runner=run_drill_flux_gitea_deadlock
      ;;
    foundation-recovery)
      runner=run_drill_foundation_recovery
      ;;
    reconciliation-resume)
      runner=run_drill_reconciliation_resume
      ;;
    *)
      die "unknown drill: ${drill}"
      ;;
  esac

  write_log_header "${drill}"
  trap on_exit EXIT
  "${runner}"
}

main "$@"