ananke/scripts/hecate-drills.sh

296 lines
9.0 KiB
Bash
Executable File

#!/usr/bin/env bash
# Disruptive Hecate recovery drills: scale cluster workloads down, run
# `hecate startup` on the coordinator host over ssh, then check that the
# workloads come back.
set -o errexit -o nounset -o pipefail

# Each setting below keeps a non-empty value from the environment and
# otherwise falls back to the default shown.
: "${KUBECTL:=kubectl}"
: "${HECATE_COORDINATOR_HOST:=titan-db}"
: "${HECATE_BIN:=/usr/local/bin/hecate}"
: "${HECATE_CONFIG:=/etc/hecate/hecate.yaml}"
LOG_DIR="${HECATE_DRILL_LOG_DIR:-/tmp/hecate-drills}"
STARTUP_TIMEOUT_SECONDS="${HECATE_DRILL_STARTUP_TIMEOUT_SECONDS:-1800}"

# 0 = only print the planned actions; set to 1 by `run ... --execute`.
EXECUTE=0
# Print the command synopsis, the available drills and usage notes to stdout.
usage() {
  local -a help_lines=(
    'Usage:'
    'scripts/hecate-drills.sh list'
    'scripts/hecate-drills.sh run <drill-name> [--execute]'
    'Drills:'
    'flux-gitea-deadlock Simulate flux-controller + gitea outage and require startup recovery.'
    'foundation-recovery Simulate vault/postgres/gitea outage and require layered restore.'
    'reconciliation-resume Simulate global Flux suspend + source-controller down and require resume.'
    'Notes:'
    '- Drills are intentionally disruptive and are not part of regular `make test`.'
    '- Use --execute to run live changes. Without it, this script prints planned actions only.'
  )
  # One line per array element; nothing in the text is expanded.
  printf '%s\n' "${help_lines[@]}"
}
# Write one "[drill]"-prefixed line to stdout; all arguments are joined
# into a single message.
log() {
  local message="$*"
  printf '[drill] %s\n' "${message}"
}
# Report a fatal error on stderr and terminate the shell with status 1.
die() {
  local message="$*"
  printf '[drill] ERROR: %s\n' "${message}" 1>&2
  exit 1
}
# Abort via die unless the command named by $1 can be resolved by the shell.
need_cmd() {
  if ! command -v "$1" >/dev/null 2>&1; then
    die "missing required command: $1"
  fi
}
# Print the current UTC time as a compact stamp, e.g. 20240131T235959Z.
now_ts() {
  local -r stamp_format='%Y%m%dT%H%M%SZ'
  date -u "+${stamp_format}"
}
# Print the "namespace|kind|name" lookup key for a resource (no trailing
# newline).
# Arguments: $1 namespace, $2 kind, $3 name
resource_key() {
  local namespace="$1" resource_kind="$2" resource_name="$3"
  printf '%s' "${namespace}|${resource_kind}|${resource_name}"
}
# Print the current .spec.replicas of a scalable resource.
# Globals:   KUBECTL (read)
# Arguments: $1 namespace, $2 kind, $3 name
# Outputs:   the replica count on stdout, or nothing when it cannot be read
#            (a one-line warning goes to stderr in that case)
# Returns:   always 0, so a failed lookup never aborts a caller running
#            under `set -e`
get_replicas() {
  local ns="$1" kind="$2" name="$3"
  local replicas
  if replicas="$("${KUBECTL}" -n "${ns}" get "${kind}" "${name}" -o jsonpath='{.spec.replicas}' 2>/dev/null)" \
    && [[ "${replicas}" =~ ^[0-9]+$ ]]; then
    printf '%s\n' "${replicas}"
    return 0
  fi
  # Deliberately print nothing rather than "0": an unreadable count must stay
  # distinguishable from a workload that really is scaled to zero, otherwise
  # a caller replaying the value would pin the workload at 0 replicas.
  printf '[drill] WARN: could not read replicas for %s/%s/%s\n' "${ns}" "${kind}" "${name}" >&2
  return 0
}
# Scale a workload to the requested replica count.
# Globals:   EXECUTE (read), KUBECTL (read)
# Arguments: $1 namespace, $2 kind, $3 name, $4 replicas
# In dry-run mode (EXECUTE=0) the kubectl command is only logged.
scale_to() {
  local ns="$1" kind="$2" name="$3" replicas="$4"
  if [[ "${EXECUTE}" -ne 0 ]]; then
    "${KUBECTL}" -n "${ns}" scale "${kind}" "${name}" --replicas="${replicas}" >/dev/null
    # Hand kubectl's status back to the caller unchanged.
    return
  fi
  log "plan: kubectl -n ${ns} scale ${kind} ${name} --replicas=${replicas}"
  return 0
}
# Block until `kubectl rollout status` reports the workload rolled out, or
# the timeout passes.
# Globals:   EXECUTE (read), KUBECTL (read)
# Arguments: $1 namespace, $2 kind, $3 name, $4 timeout (passed to --timeout)
# In dry-run mode (EXECUTE=0) the kubectl command is only logged.
wait_ready() {
  local ns="$1" kind="$2" name="$3" timeout="$4"
  if [[ "${EXECUTE}" -ne 0 ]]; then
    "${KUBECTL}" -n "${ns}" rollout status "${kind}/${name}" --timeout="${timeout}" >/dev/null
    # Hand kubectl's status back to the caller unchanged.
    return
  fi
  log "plan: kubectl -n ${ns} rollout status ${kind}/${name} --timeout=${timeout}"
  return 0
}
# Run `hecate startup` on the coordinator host over ssh, bounded by a timeout.
# Globals:   HECATE_BIN, HECATE_CONFIG, HECATE_COORDINATOR_HOST,
#            STARTUP_TIMEOUT_SECONDS, EXECUTE (all read)
# Arguments: $1 - free-form reason for the run; recorded in the drill log
# Returns:   0 in dry-run mode, otherwise the status of timeout/ssh
run_hecate_startup() {
  local reason="$1"
  local -a cmd=(sudo "${HECATE_BIN}" startup --config "${HECATE_CONFIG}" --execute --force-flux-branch main)
  # ssh joins its command arguments with spaces and the remote shell parses
  # the result again, so quote every word up front; otherwise a path with a
  # space or a shell metacharacter would be re-split or interpreted remotely.
  # Plain values such as the defaults come out of %q unchanged.
  local remote_cmd
  printf -v remote_cmd '%q ' "${cmd[@]}"
  remote_cmd="${remote_cmd% }"
  log "hecate startup requested: reason=${reason}"
  if [[ "${EXECUTE}" -eq 0 ]]; then
    log "plan: ssh ${HECATE_COORDINATOR_HOST} '${remote_cmd}'"
    return 0
  fi
  timeout "${STARTUP_TIMEOUT_SECONDS}" ssh "${HECATE_COORDINATOR_HOST}" "${remote_cmd}"
}
# Replica counts captured by snapshot_resources, keyed by "ns|kind|name".
declare -A SNAPSHOT_REPLICAS=()
# Newline-separated "namespace/name" refs of the Flux objects that were
# already suspended when record_flux_suspended_before ran.
SUSPENDED_KS_BEFORE=''
SUSPENDED_HR_BEFORE=''
# Record the current replica count of every given resource, replacing any
# earlier snapshot.
# Globals:   SNAPSHOT_REPLICAS (reset, then written)
# Arguments: resource descriptors of the form "ns|kind|name[|...]"; fields
#            after the name are ignored here
snapshot_resources() {
  # Keep the loop variables local so the parsed fields do not leak into the
  # caller's scope.
  local res ns kind name _rest key
  SNAPSHOT_REPLICAS=()
  # Iterating "$@" directly is also safe when no descriptors are passed.
  for res in "$@"; do
    IFS='|' read -r ns kind name _rest <<<"${res}"
    key="$(resource_key "${ns}" "${kind}" "${name}")"
    SNAPSHOT_REPLICAS["${key}"]="$(get_replicas "${ns}" "${kind}" "${name}")"
  done
}
# Scale every given resource back to its snapshotted replica count.
# Best-effort: a failed scale is ignored so the remaining resources are
# still attempted.
# Globals:   SNAPSHOT_REPLICAS (read)
# Arguments: resource descriptors "ns|kind|name[|default-replicas]"
restore_resources() {
  # Keep the loop variables local so the parsed fields do not leak into the
  # caller's scope.
  local res ns kind name fallback _rest key replicas
  for res in "$@"; do
    IFS='|' read -r ns kind name fallback _rest <<<"${res}"
    # With no usable snapshot value, fall back to the default carried in the
    # descriptor's fourth field, or to 1 when that field is absent or not a
    # non-negative integer.
    [[ "${fallback}" =~ ^[0-9]+$ ]] || fallback=1
    key="$(resource_key "${ns}" "${kind}" "${name}")"
    replicas="${SNAPSHOT_REPLICAS[${key}]:-${fallback}}"
    log "rollback replicas: ${ns}/${kind}/${name} -> ${replicas}"
    scale_to "${ns}" "${kind}" "${name}" "${replicas}" || true
  done
}
# Remember which Flux Kustomizations and HelmReleases (in any namespace) are
# suspended right now, as newline-separated "namespace/name" refs.
# Globals: KUBECTL (read), SUSPENDED_KS_BEFORE, SUSPENDED_HR_BEFORE (written)
# A failing kubectl query is tolerated; whatever it printed is kept.
record_flux_suspended_before() {
  local suspended_refs='{range .items[?(@.spec.suspend==true)]}{.metadata.namespace}/{.metadata.name}{"\n"}{end}'
  SUSPENDED_KS_BEFORE="$(
    "${KUBECTL}" get kustomizations.kustomize.toolkit.fluxcd.io -A -o jsonpath="${suspended_refs}" || true
  )"
  SUSPENDED_HR_BEFORE="$(
    "${KUBECTL}" get helmreleases.helm.toolkit.fluxcd.io -A -o jsonpath="${suspended_refs}" || true
  )"
}
# Set spec.suspend to the given value on every Flux Kustomization and
# HelmRelease in the cluster.
# Globals:   EXECUTE (read), KUBECTL (read)
# Arguments: $1 - JSON boolean literal, `true` or `false`
# Best-effort: an individual patch failure is ignored so the remaining
# objects are still processed. In dry-run mode (EXECUTE=0) the action is
# only logged.
set_flux_suspend_all() {
  local value="$1"
  if [[ "${EXECUTE}" -eq 0 ]]; then
    log "plan: patch all Flux kustomizations + helmreleases suspend=${value}"
    return 0
  fi
  local patch
  patch="$(printf '{"spec":{"suspend":%s}}' "${value}")"
  local refs_jsonpath='{range .items[*]}{.metadata.namespace}{"/"}{.metadata.name}{"\n"}{end}'
  local kind resource ref
  # Both kinds are listed across all namespaces (-A) and patched by
  # namespace/name, matching how suspended objects are recorded and verified
  # elsewhere in this script.
  for kind in kustomization helmrelease; do
    case "${kind}" in
      kustomization) resource="kustomizations.kustomize.toolkit.fluxcd.io" ;;
      helmrelease) resource="helmreleases.helm.toolkit.fluxcd.io" ;;
    esac
    while read -r ref; do
      if [[ -n "${ref}" ]]; then
        "${KUBECTL}" -n "${ref%%/*}" patch "${kind}" "${ref##*/}" --type=merge -p "${patch}" >/dev/null || true
      fi
    done < <("${KUBECTL}" get "${resource}" -A -o jsonpath="${refs_jsonpath}")
  done
}
# Return Flux to its pre-drill suspend state: resume everything via
# set_flux_suspend_all, then re-suspend the objects that were recorded as
# suspended beforehand.
# Globals: EXECUTE, KUBECTL, SUSPENDED_KS_BEFORE, SUSPENDED_HR_BEFORE (read)
# Best-effort: individual patch failures are ignored.
restore_flux_suspended_before() {
  set_flux_suspend_all false
  if [[ "${EXECUTE}" -eq 0 ]]; then
    # Dry run: the resume above was only logged, nothing to re-suspend.
    return 0
  fi
  local resuspend='{"spec":{"suspend":true}}'
  local kind recorded entry
  # Kustomizations first, then HelmReleases.
  for kind in kustomization helmrelease; do
    if [[ "${kind}" == "kustomization" ]]; then
      recorded="${SUSPENDED_KS_BEFORE}"
    else
      recorded="${SUSPENDED_HR_BEFORE}"
    fi
    while read -r entry; do
      # Each entry is "namespace/name"; blank lines are skipped.
      if [[ -n "${entry}" ]]; then
        "${KUBECTL}" -n "${entry%%/*}" patch "${kind}" "${entry##*/}" --type=merge -p "${resuspend}" >/dev/null || true
      fi
    done <<<"${recorded}"
  done
}
# Fail the drill (via die) if any Flux Kustomization or HelmRelease, in any
# namespace, still has spec.suspend set to true.
# Globals: EXECUTE (read), KUBECTL (read)
# In dry-run mode (EXECUTE=0) the check is only logged.
verify_flux_unsuspended() {
  if [[ "${EXECUTE}" -eq 0 ]]; then
    log "plan: verify no Flux kustomizations/helmreleases remain suspended"
    return 0
  fi
  # Prints one name per suspended object; the pipelines below count the
  # non-empty lines.
  local -r suspended_names='{range .items[?(@.spec.suspend==true)]}{.metadata.name}{"\n"}{end}'
  local ks_count hr_count
  ks_count="$(
    "${KUBECTL}" get kustomizations.kustomize.toolkit.fluxcd.io -A -o jsonpath="${suspended_names}" \
      | sed '/^$/d' | wc -l | tr -d ' '
  )"
  hr_count="$(
    "${KUBECTL}" get helmreleases.helm.toolkit.fluxcd.io -A -o jsonpath="${suspended_names}" \
      | sed '/^$/d' | wc -l | tr -d ' '
  )"
  if [[ "${ks_count}" != "0" ]]; then
    die "kustomizations still suspended: ${ks_count}"
  fi
  if [[ "${hr_count}" != "0" ]]; then
    die "helmreleases still suspended: ${hr_count}"
  fi
}
# Start logging for a drill: from this call on, the shell's stdout and
# stderr are duplicated (via tee, append mode) into
# "${LOG_DIR}/<drill>-<timestamp>.log", and a header line is logged.
# Globals:   LOG_DIR, EXECUTE, HECATE_COORDINATOR_HOST (read)
# Arguments: $1 - drill name, used in the log file name
# Returns:   non-zero without redirecting anything if the log directory or
#            the timestamp cannot be produced
write_log_header() {
  local drill="$1"
  local stamp log_file
  mkdir -p -- "${LOG_DIR}" || return
  # Assigned separately from `local` so a now_ts failure is not masked and
  # cannot yield a log named "<drill>-.log".
  stamp="$(now_ts)" || return
  log_file="${LOG_DIR}/${drill}-${stamp}.log"
  exec > >(tee -a -- "${log_file}") 2>&1
  log "drill=${drill} execute=${EXECUTE} coordinator=${HECATE_COORDINATOR_HOST}"
}
# Drill: scale the Flux controllers and gitea to zero, run hecate startup,
# and require every one of them to roll out again.
# On any abnormal exit the snapshotted replica counts are restored.
run_drill_flux_gitea_deadlock() {
  local -a resources=(
    "flux-system|deployment|source-controller|1"
    "flux-system|deployment|kustomize-controller|1"
    "flux-system|deployment|helm-controller|1"
    "flux-system|deployment|notification-controller|1"
    "gitea|deployment|gitea|1"
  )
  local res ns kind name _rest rollback_cmd
  snapshot_resources "${resources[@]}"
  # The rollback hangs on EXIT rather than ERR: the script runs with `set -e`
  # but without errtrace, so a command failing inside one of the helper
  # functions called below ends the shell without running an ERR trap set
  # here, and an explicit `exit` never triggers ERR at all. The descriptors
  # are expanded into the trap text now so the handler does not rely on this
  # function's locals still being visible while the shell is exiting.
  rollback_cmd="restore_resources$(printf ' %q' "${resources[@]}") || true"
  trap "${rollback_cmd}" EXIT
  log "injecting outage: flux controllers + gitea"
  for res in "${resources[@]}"; do
    IFS='|' read -r ns kind name _rest <<<"${res}"
    scale_to "${ns}" "${kind}" "${name}" 0
  done
  run_hecate_startup "drill-flux-gitea-deadlock"
  log "verifying recovery"
  wait_ready flux-system deployment source-controller 240s
  wait_ready flux-system deployment kustomize-controller 240s
  wait_ready flux-system deployment helm-controller 240s
  wait_ready flux-system deployment notification-controller 240s
  wait_ready gitea deployment gitea 300s
  log "pass: flux-gitea-deadlock"
  # Success: nothing to roll back.
  trap - EXIT
}
# Drill: scale vault, postgres and gitea to zero, run hecate startup, and
# require all three to roll out again.
# On any abnormal exit the snapshotted replica counts are restored.
run_drill_foundation_recovery() {
  local -a resources=(
    "vault|statefulset|vault|1"
    "postgres|statefulset|postgres|1"
    "gitea|deployment|gitea|1"
  )
  local res ns kind name _rest rollback_cmd
  snapshot_resources "${resources[@]}"
  # The rollback hangs on EXIT rather than ERR: the script runs with `set -e`
  # but without errtrace, so a command failing inside one of the helper
  # functions called below ends the shell without running an ERR trap set
  # here, and an explicit `exit` never triggers ERR at all. The descriptors
  # are expanded into the trap text now so the handler does not rely on this
  # function's locals still being visible while the shell is exiting.
  rollback_cmd="restore_resources$(printf ' %q' "${resources[@]}") || true"
  trap "${rollback_cmd}" EXIT
  log "injecting outage: vault + postgres + gitea"
  for res in "${resources[@]}"; do
    IFS='|' read -r ns kind name _rest <<<"${res}"
    scale_to "${ns}" "${kind}" "${name}" 0
  done
  run_hecate_startup "drill-foundation-recovery"
  log "verifying layered recovery"
  wait_ready vault statefulset vault 420s
  wait_ready postgres statefulset postgres 420s
  wait_ready gitea deployment gitea 300s
  log "pass: foundation-recovery"
  # Success: nothing to roll back.
  trap - EXIT
}
# Drill: suspend all Flux objects and stop source-controller, run hecate
# startup, and require source-controller to roll out and nothing to remain
# suspended.
# On any abnormal exit the pre-drill suspend state and the snapshotted
# replica count are restored.
run_drill_reconciliation_resume() {
  local -a resources=("flux-system|deployment|source-controller|1")
  local rollback_cmd
  snapshot_resources "${resources[@]}"
  record_flux_suspended_before
  # The rollback hangs on EXIT rather than ERR: the script runs with `set -e`
  # but without errtrace, so a command failing inside one of the helper
  # functions called below ends the shell without running an ERR trap set
  # here, and the `die` used by the verification step exits without
  # triggering ERR at all. Each rollback step is guarded with `|| true` so
  # the second still runs if the first fails, and the descriptors are
  # expanded into the trap text now so the handler does not rely on this
  # function's locals while the shell is exiting.
  rollback_cmd="restore_flux_suspended_before || true; restore_resources$(printf ' %q' "${resources[@]}") || true"
  trap "${rollback_cmd}" EXIT
  log "injecting outage: suspend all Flux objects + stop source-controller"
  set_flux_suspend_all true
  scale_to flux-system deployment source-controller 0
  run_hecate_startup "drill-reconciliation-resume"
  log "verifying reconciliation resumed"
  wait_ready flux-system deployment source-controller 240s
  verify_flux_unsuspended
  log "pass: reconciliation-resume"
  # Success: nothing to roll back.
  trap - EXIT
}
# Entry point.
#   list                          print usage and exit 0
#   run <drill-name> [--execute]  run one drill (dry run unless --execute)
# Anything else prints usage and exits 2.
# Globals: EXECUTE (set to 1 by --execute), KUBECTL (read)
main() {
  local cmd="${1:-}"
  local drill=""
  case "${cmd}" in
    list)
      usage
      exit 0
      ;;
    run)
      shift
      drill="${1:-}"
      [[ -n "${drill}" ]] || die "missing drill name"
      shift
      while [[ $# -gt 0 ]]; do
        case "$1" in
          --execute) EXECUTE=1 ;;
          *) die "unknown option: $1" ;;
        esac
        shift
      done
      ;;
    *)
      usage
      exit 2
      ;;
  esac

  # Resolve the drill before anything touches the filesystem: the name ends
  # up in the log file path, so an unknown name must be rejected first.
  local runner
  case "${drill}" in
    flux-gitea-deadlock) runner=run_drill_flux_gitea_deadlock ;;
    foundation-recovery) runner=run_drill_foundation_recovery ;;
    reconciliation-resume) runner=run_drill_reconciliation_resume ;;
    *) die "unknown drill: ${drill}" ;;
  esac

  # The external tools are only needed to run a drill, not to print help.
  need_cmd "${KUBECTL}"
  need_cmd ssh
  need_cmd timeout

  write_log_header "${drill}"
  "${runner}"
}
main "$@"