364 lines
12 KiB
Bash
Executable File
364 lines
12 KiB
Bash
Executable File
#!/usr/bin/env bash
|
|
# Strict mode: abort on command failure (-e), on use of unset variables (-u)
# and on any failing pipeline stage (pipefail). -E lets shell functions
# inherit the ERR trap.
set -Eeuo pipefail

# Settings that share their name with the environment variable overriding them.
: "${KUBECTL:=kubectl}"                        # kubectl executable to invoke
: "${HECATE_COORDINATOR_HOST:=titan-db}"       # ssh target for hecate commands
: "${HECATE_BIN:=/usr/local/bin/hecate}"       # hecate binary path used over ssh
: "${HECATE_CONFIG:=/etc/hecate/hecate.yaml}"  # value passed to hecate --config

# Settings overridden through HECATE_DRILL_* environment variables.
LOG_DIR="${HECATE_DRILL_LOG_DIR:-/tmp/hecate-drills}"
STARTUP_TIMEOUT_SECONDS="${HECATE_DRILL_STARTUP_TIMEOUT_SECONDS:-1800}"

# 0 = plan mode (actions are only printed); switched to 1 by --execute.
EXECUTE=0
|
|
|
|
# Print the command synopsis, the available drills and usage notes to stdout.
usage() {
  local -a help_text=(
    'Usage:'
    'scripts/hecate-drills.sh list'
    'scripts/hecate-drills.sh run <drill-name> [--execute]'
    ''
    'Drills:'
    'flux-gitea-deadlock Simulate flux-controller + gitea outage and require startup recovery.'
    'foundation-recovery Simulate vault/postgres/gitea outage and require layered restore.'
    'reconciliation-resume Simulate global Flux suspend + source-controller down and require resume.'
    'startup-intent-guard Assert startup is blocked when shutdown intent is active.'
    ''
    'Notes:'
    '- Drills are intentionally disruptive and are not part of regular `make test`.'
    '- Use --execute to run live changes. Without it, this script prints planned actions only.'
  )
  # One element per output line; nothing in the text is expanded.
  printf '%s\n' "${help_text[@]}"
}
|
|
|
|
# Write one "[drill] "-prefixed line to stdout; all arguments are joined
# into a single message.
log() {
  local message="$*"
  printf '%s\n' "[drill] ${message}"
}
|
|
|
|
# Report a fatal error on stderr and terminate the script with status 1.
die() {
  local message="$*"
  printf '%s\n' "[drill] ERROR: ${message}" >&2
  exit 1
}
|
|
|
|
# Abort via die unless the given command can be resolved by the shell.
#
# Arguments: $1 - command name to look up
need_cmd() {
  local tool="$1"
  if ! command -v "${tool}" >/dev/null 2>&1; then
    die "missing required command: ${tool}"
  fi
}
|
|
|
|
# Print the current UTC time as a compact stamp, e.g. 20240131T235959Z.
now_ts() {
  local stamp
  stamp="$(date -u +%Y%m%dT%H%M%SZ)" || return
  printf '%s\n' "${stamp}"
}
|
|
|
|
# Build the lookup key "namespace|kind|name" for a workload.
#
# Arguments: $1 namespace, $2 kind, $3 name
# Outputs:   the key on stdout, without a trailing newline
resource_key() {
  local ns="$1" kind="$2" name="$3"
  local -a parts=("$ns" "$kind" "$name")
  # "${parts[*]}" joins the elements with the first character of IFS.
  local IFS='|'
  printf '%s' "${parts[*]}"
}
|
|
|
|
# Print the current .spec.replicas of a scalable workload.
#
# Arguments: $1 namespace, $2 kind (e.g. deployment), $3 name
# Outputs:   the replica count on stdout; nothing when it cannot be read
# Returns:   always 0, so a failed lookup never aborts the caller
get_replicas() {
  local ns="$1" kind="$2" name="$3"
  local replicas

  if ! replicas="$("${KUBECTL}" -n "$ns" get "$kind" "$name" -o jsonpath='{.spec.replicas}' 2>/dev/null)"; then
    # Report an unreadable count as "unknown" (empty) rather than "0".
    # A recorded 0 would make a later rollback pin the workload at zero
    # replicas; an empty value lets restore_resources apply its default.
    printf '[drill] WARN: cannot read replicas of %s/%s/%s\n' "$ns" "$kind" "$name" >&2
    return 0
  fi

  printf '%s' "${replicas}"
}
|
|
|
|
# Scale a workload to the requested replica count.
# With EXECUTE=0 the equivalent kubectl command is only logged.
#
# Arguments: $1 namespace, $2 kind, $3 name, $4 replica count
# Returns:   0 in plan mode, otherwise the exit status of kubectl
scale_to() {
  local namespace="$1" workload_kind="$2" workload="$3" count="$4"

  if [[ "${EXECUTE}" -eq 0 ]]; then
    log "plan: kubectl -n ${namespace} scale ${workload_kind} ${workload} --replicas=${count}"
    return 0
  else
    # kubectl's own stdout is discarded; errors still reach stderr.
    "${KUBECTL}" -n "${namespace}" scale "${workload_kind}" "${workload}" --replicas="${count}" >/dev/null
  fi
}
|
|
|
|
# Wait for a workload rollout to finish via `kubectl rollout status`.
# With EXECUTE=0 the equivalent kubectl command is only logged.
#
# Arguments: $1 namespace, $2 kind, $3 name, $4 timeout (passed to --timeout)
# Returns:   0 in plan mode, otherwise the exit status of kubectl
wait_ready() {
  local namespace="$1" workload_kind="$2" workload="$3" deadline="$4"

  if [[ "${EXECUTE}" -eq 0 ]]; then
    log "plan: kubectl -n ${namespace} rollout status ${workload_kind}/${workload} --timeout=${deadline}"
    return 0
  else
    # kubectl's progress output is discarded; errors still reach stderr.
    "${KUBECTL}" -n "${namespace}" rollout status "${workload_kind}/${workload}" --timeout="${deadline}" >/dev/null
  fi
}
|
|
|
|
# Run `hecate startup --execute --force-flux-branch main` on the coordinator
# host over ssh, bounded by STARTUP_TIMEOUT_SECONDS.
# With EXECUTE=0 the ssh command is only logged.
#
# Arguments: $1 - value passed to hecate's --reason option
# Returns:   0 in plan mode, otherwise the exit status of timeout/ssh
run_hecate_startup() {
  local reason="$1"
  local -a cmd=(sudo "${HECATE_BIN}" startup --config "${HECATE_CONFIG}" --execute --force-flux-branch main --reason "${reason}")

  # ssh joins its command arguments with spaces and the remote shell splits
  # the result again. Shell-quote each word and send one string so values
  # containing spaces or metacharacters arrive as single arguments. For
  # plain values the string is identical to the unquoted form.
  local remote_cmd
  printf -v remote_cmd '%q ' "${cmd[@]}"
  remote_cmd="${remote_cmd% }"

  if [[ "${EXECUTE}" -eq 0 ]]; then
    log "plan: ssh ${HECATE_COORDINATOR_HOST} '${remote_cmd}'"
    return 0
  fi

  timeout "${STARTUP_TIMEOUT_SECONDS}" ssh "${HECATE_COORDINATOR_HOST}" "${remote_cmd}"
}
|
|
|
|
# --- Rollback bookkeeping shared by the drills and the ERR handler ---------

# Name of the drill being run; reported in the failure message.
CURRENT_DRILL=""

# "namespace|kind|name|..." descriptors that on_err scales back on failure.
declare -a CURRENT_RESOURCES=()

# Replica counts captured by snapshot_resources, keyed by "namespace|kind|name".
declare -A SNAPSHOT_REPLICAS=()

# 1 while on_err must also restore the Flux suspend state.
ROLLBACK_FLUX_SUSPEND=0

# Newline-separated "namespace/name" refs of Kustomizations / HelmReleases
# that were already suspended before the drill started.
SUSPENDED_KS_BEFORE=""
SUSPENDED_HR_BEFORE=""
|
|
|
|
# ERR trap handler: roll the cluster back, then exit with the status of the
# command that failed.
on_err() {
  # Must be the first statement so $? still holds the failing status.
  local status=$?

  log "failure detected in drill '${CURRENT_DRILL}' (exit=${status}); starting rollback"

  # Both rollback steps are best-effort: a failure in one must not prevent
  # the other, nor change the exit status reported below.
  if [[ "${ROLLBACK_FLUX_SUSPEND}" -eq 1 ]]; then
    restore_flux_suspended_before || true
  fi
  if (( ${#CURRENT_RESOURCES[@]} > 0 )); then
    restore_resources "${CURRENT_RESOURCES[@]}" || true
  fi

  exit "${status}"
}
|
|
|
|
# Record the current replica count of each given workload in the global
# SNAPSHOT_REPLICAS map, discarding any earlier snapshot.
#
# Arguments: "namespace|kind|name|..." descriptors; fields after the third
#            are ignored
# Globals:   SNAPSHOT_REPLICAS (rewritten)
snapshot_resources() {
  # Loop variables are local so callers' ns/kind/name are not clobbered.
  local res ns kind name

  SNAPSHOT_REPLICAS=()
  for res in "$@"; do
    IFS='|' read -r ns kind name _ <<<"${res}"
    SNAPSHOT_REPLICAS["$(resource_key "$ns" "$kind" "$name")"]="$(get_replicas "$ns" "$kind" "$name")"
  done
}
|
|
|
|
# Scale each given workload back to the replica count stored in
# SNAPSHOT_REPLICAS; 1 is used when no count was recorded. Every scale
# operation is best-effort, so one failure does not stop the remaining ones.
#
# Arguments: "namespace|kind|name|..." descriptors; fields after the third
#            are ignored
# Globals:   SNAPSHOT_REPLICAS (read)
restore_resources() {
  # Loop variables are local so callers' ns/kind/name are not clobbered.
  local res ns kind name key replicas

  for res in "$@"; do
    IFS='|' read -r ns kind name _ <<<"${res}"
    key="$(resource_key "$ns" "$kind" "$name")"
    replicas="${SNAPSHOT_REPLICAS[${key}]:-1}"
    log "rollback replicas: ${ns}/${kind}/${name} -> ${replicas}"
    scale_to "$ns" "$kind" "$name" "$replicas" || true
  done
}
|
|
|
|
# Remember which Flux Kustomizations and HelmReleases (all namespaces) are
# suspended right now, one "namespace/name" per line. A failed query is
# tolerated and leaves the corresponding list empty.
#
# Globals: SUSPENDED_KS_BEFORE, SUSPENDED_HR_BEFORE (written)
record_flux_suspended_before() {
  # Selects the items whose spec.suspend is true.
  local -r suspended_refs='{range .items[?(@.spec.suspend==true)]}{.metadata.namespace}/{.metadata.name}{"\n"}{end}'

  SUSPENDED_KS_BEFORE="$(
    "${KUBECTL}" get kustomizations.kustomize.toolkit.fluxcd.io -A -o "jsonpath=${suspended_refs}" || true
  )"
  SUSPENDED_HR_BEFORE="$(
    "${KUBECTL}" get helmreleases.helm.toolkit.fluxcd.io -A -o "jsonpath=${suspended_refs}" || true
  )"
}
|
|
|
|
# Set spec.suspend to the given value on Flux objects.
# Kustomizations are listed in the flux-system namespace only; HelmReleases
# are listed across all namespaces. Individual patch failures are ignored.
# With EXECUTE=0 the action is only logged.
#
# Arguments: $1 - JSON boolean to write ("true" or "false")
set_flux_suspend_all() {
  local value="$1"
  local patch entry hr_namespace hr_name

  if [[ "${EXECUTE}" -eq 0 ]]; then
    log "plan: patch all Flux kustomizations + helmreleases suspend=${value}"
    return 0
  fi

  printf -v patch '{"spec":{"suspend":%s}}' "${value}"

  # Kustomizations: one bare name per line.
  while read -r entry; do
    if [[ -n "${entry}" ]]; then
      "${KUBECTL}" -n flux-system patch kustomization "${entry}" --type=merge -p "${patch}" >/dev/null || true
    fi
  done < <("${KUBECTL}" -n flux-system get kustomizations.kustomize.toolkit.fluxcd.io -o jsonpath='{range .items[*]}{.metadata.name}{"\n"}{end}')

  # HelmReleases: one "namespace/name" per line.
  while read -r entry; do
    if [[ -n "${entry}" ]]; then
      hr_namespace="${entry%%/*}"
      hr_name="${entry##*/}"
      "${KUBECTL}" -n "${hr_namespace}" patch helmrelease "${hr_name}" --type=merge -p "${patch}" >/dev/null || true
    fi
  done < <("${KUBECTL}" get helmreleases.helm.toolkit.fluxcd.io -A -o jsonpath='{range .items[*]}{.metadata.namespace}{"/"}{.metadata.name}{"\n"}{end}')
}
|
|
|
|
# Return Flux to its pre-drill suspend state: resume everything through
# set_flux_suspend_all, then suspend again the objects recorded in
# SUSPENDED_KS_BEFORE / SUSPENDED_HR_BEFORE. Individual patch failures are
# ignored. With EXECUTE=0 only set_flux_suspend_all's plan output appears.
restore_flux_suspended_before() {
  set_flux_suspend_all false

  if [[ "${EXECUTE}" -eq 0 ]]; then
    return 0
  fi

  local -r resuspend='{"spec":{"suspend":true}}'
  local flux_kind refs ref

  # Kustomizations first, then HelmReleases; each list holds one
  # "namespace/name" reference per line.
  for flux_kind in kustomization helmrelease; do
    if [[ "${flux_kind}" == "kustomization" ]]; then
      refs="${SUSPENDED_KS_BEFORE}"
    else
      refs="${SUSPENDED_HR_BEFORE}"
    fi

    while read -r ref; do
      [[ -n "${ref}" ]] || continue
      "${KUBECTL}" -n "${ref%%/*}" patch "${flux_kind}" "${ref##*/}" --type=merge -p "${resuspend}" >/dev/null || true
    done <<<"${refs}"
  done
}
|
|
|
|
# Filter stdin to stdout: drop empty lines, then sort the rest, so two lists
# can be compared regardless of order.
normalize_lines() {
  awk '$0 != ""' | sort
}
|
|
|
|
# Check that the set of suspended Flux Kustomizations and HelmReleases equals
# the set recorded in SUSPENDED_KS_BEFORE / SUSPENDED_HR_BEFORE.
# With EXECUTE=0 the check is only logged.
#
# Returns: 0 when both sets match (or in plan mode), 1 on drift.
#
# Drift is reported by returning 1 instead of exiting: a failing function
# call triggers the script's ERR trap and therefore the rollback, whereas a
# direct `exit` would skip the trap and leave Flux objects suspended.
verify_flux_suspend_state_restored() {
  if [[ "${EXECUTE}" -eq 0 ]]; then
    log "plan: verify Flux suspended objects match pre-drill state"
    return 0
  fi

  # Selects "namespace/name" of every item whose spec.suspend is true.
  local -r suspended_refs='{range .items[?(@.spec.suspend==true)]}{.metadata.namespace}/{.metadata.name}{"\n"}{end}'

  # A failed query is tolerated here and treated as an empty list.
  local current_ks current_hr
  current_ks="$("${KUBECTL}" get kustomizations.kustomize.toolkit.fluxcd.io -A -o "jsonpath=${suspended_refs}" || true)"
  current_hr="$("${KUBECTL}" get helmreleases.helm.toolkit.fluxcd.io -A -o "jsonpath=${suspended_refs}" || true)"

  # Compare order-independently and ignoring blank lines.
  local expected_ks expected_hr got_ks got_hr
  expected_ks="$(printf '%s\n' "${SUSPENDED_KS_BEFORE}" | normalize_lines)"
  expected_hr="$(printf '%s\n' "${SUSPENDED_HR_BEFORE}" | normalize_lines)"
  got_ks="$(printf '%s\n' "${current_ks}" | normalize_lines)"
  got_hr="$(printf '%s\n' "${current_hr}" | normalize_lines)"

  if [[ "${got_ks}" != "${expected_ks}" ]]; then
    printf '[drill] ERROR: %s\n' "kustomization suspend-state drift detected" >&2
    return 1
  fi
  if [[ "${got_hr}" != "${expected_hr}" ]]; then
    printf '[drill] ERROR: %s\n' "helmrelease suspend-state drift detected" >&2
    return 1
  fi
}
|
|
|
|
# Start logging for a drill: from here on stdout and stderr of the script
# are duplicated into "${LOG_DIR}/<drill>-<timestamp>.log" through tee, and
# a header line describing the run is written.
#
# Arguments: $1 - drill name (becomes part of the log file name)
write_log_header() {
  local drill="$1"
  local log_file

  mkdir -p "${LOG_DIR}"
  printf -v log_file '%s/%s-%s.log' "${LOG_DIR}" "${drill}" "$(now_ts)"

  # Redirect the whole script, not just this function.
  exec > >(tee -a "${log_file}") 2>&1

  log "drill=${drill} execute=${EXECUTE} coordinator=${HECATE_COORDINATOR_HOST}"
}
|
|
|
|
# Drill: scale the four Flux controllers and gitea to zero, run hecate
# startup, then require every one of them to roll out again.
#
# Globals: CURRENT_RESOURCES (set while the drill runs, cleared on success),
#          ROLLBACK_FLUX_SUSPEND (set to 0)
run_drill_flux_gitea_deadlock() {
  # Loop variables are local so they do not leak into global scope.
  local res ns kind name

  CURRENT_RESOURCES=(
    "flux-system|deployment|source-controller|1"
    "flux-system|deployment|kustomize-controller|1"
    "flux-system|deployment|helm-controller|1"
    "flux-system|deployment|notification-controller|1"
    "gitea|deployment|gitea|1"
  )
  # Capture replica counts first so the ERR handler can scale back.
  snapshot_resources "${CURRENT_RESOURCES[@]}"
  ROLLBACK_FLUX_SUSPEND=0

  log "injecting outage: flux controllers + gitea"
  for res in "${CURRENT_RESOURCES[@]}"; do
    IFS='|' read -r ns kind name _ <<<"${res}"
    scale_to "$ns" "$kind" "$name" 0
  done

  run_hecate_startup "drill-flux-gitea-deadlock"

  log "verifying recovery"
  wait_ready flux-system deployment source-controller 240s
  wait_ready flux-system deployment kustomize-controller 240s
  wait_ready flux-system deployment helm-controller 240s
  wait_ready flux-system deployment notification-controller 240s
  wait_ready gitea deployment gitea 300s
  log "pass: flux-gitea-deadlock"

  # Nothing left to roll back once the drill has passed.
  CURRENT_RESOURCES=()
}
|
|
|
|
# Drill: scale vault, postgres and gitea to zero, run hecate startup, then
# require all three to roll out again.
#
# Globals: CURRENT_RESOURCES (set while the drill runs, cleared on success),
#          ROLLBACK_FLUX_SUSPEND (set to 0)
run_drill_foundation_recovery() {
  # Loop variables are local so they do not leak into global scope.
  local res ns kind name

  CURRENT_RESOURCES=(
    "vault|statefulset|vault|1"
    "postgres|statefulset|postgres|1"
    "gitea|deployment|gitea|1"
  )
  # Capture replica counts first so the ERR handler can scale back.
  snapshot_resources "${CURRENT_RESOURCES[@]}"
  ROLLBACK_FLUX_SUSPEND=0

  log "injecting outage: vault + postgres + gitea"
  for res in "${CURRENT_RESOURCES[@]}"; do
    IFS='|' read -r ns kind name _ <<<"${res}"
    scale_to "$ns" "$kind" "$name" 0
  done

  run_hecate_startup "drill-foundation-recovery"

  log "verifying layered recovery"
  wait_ready vault statefulset vault 420s
  wait_ready postgres statefulset postgres 420s
  wait_ready gitea deployment gitea 300s
  log "pass: foundation-recovery"

  # Nothing left to roll back once the drill has passed.
  CURRENT_RESOURCES=()
}
|
|
|
|
# Drill: suspend the Flux objects and stop source-controller, run hecate
# startup, then require source-controller to roll out and the set of
# suspended Flux objects to equal the pre-drill set.
#
# Globals: CURRENT_RESOURCES and ROLLBACK_FLUX_SUSPEND (armed while the drill
#          runs, reset on success)
run_drill_reconciliation_resume() {
  local -r target_ns="flux-system" target_kind="deployment" target="source-controller"

  # Arm the rollback data before anything is changed.
  CURRENT_RESOURCES=("${target_ns}|${target_kind}|${target}|1")
  snapshot_resources "${CURRENT_RESOURCES[@]}"
  record_flux_suspended_before
  ROLLBACK_FLUX_SUSPEND=1

  log "injecting outage: suspend all Flux objects + stop source-controller"
  set_flux_suspend_all true
  scale_to "${target_ns}" "${target_kind}" "${target}" 0

  run_hecate_startup "drill-reconciliation-resume"

  log "verifying reconciliation resumed"
  wait_ready "${target_ns}" "${target_kind}" "${target}" 240s
  verify_flux_suspend_state_restored
  log "pass: reconciliation-resume"

  # Passed: disarm the rollback.
  CURRENT_RESOURCES=()
  ROLLBACK_FLUX_SUSPEND=0
}
|
|
|
|
# Drill: write a "shutting_down" intent file on the coordinator host, run
# hecate startup and require it to FAIL, then put the previous intent file
# back (or remove the injected one when there was none).
# With EXECUTE=0 the three ssh steps are only logged.
run_drill_startup_intent_guard() {
  local intent_path="/var/lib/hecate/intent.json"
  local backup_path="/tmp/hecate-intent-pre-drill.json"

  # Remote script: back up an existing intent file (or clear a stale backup),
  # then overwrite the intent file. The timestamp is computed locally when
  # this string is built.
  local inject_cmd="
if [ -f '${intent_path}' ]; then sudo cp '${intent_path}' '${backup_path}'; else sudo rm -f '${backup_path}'; fi
cat <<'JSON' | sudo tee '${intent_path}' >/dev/null
{\"state\":\"shutting_down\",\"reason\":\"drill-intent-guard\",\"source\":\"drill\",\"updated_at\":\"$(date -u +%Y-%m-%dT%H:%M:%SZ)\"}
JSON
"
  # Remote script: move the backup back, or delete the injected file.
  local restore_cmd="
if [ -f '${backup_path}' ]; then
sudo mv '${backup_path}' '${intent_path}'
else
sudo rm -f '${intent_path}'
fi
"
  local startup_cmd="sudo ${HECATE_BIN} startup --config ${HECATE_CONFIG} --execute --force-flux-branch main --reason drill-startup-intent-guard"

  if [[ "${EXECUTE}" -eq 0 ]]; then
    log "plan: ssh ${HECATE_COORDINATOR_HOST} '<inject shutdown intent>'"
    log "plan: ssh ${HECATE_COORDINATOR_HOST} '${startup_cmd}' (expect failure)"
    log "plan: ssh ${HECATE_COORDINATOR_HOST} '<restore prior intent>'"
    log "pass: startup-intent-guard (plan mode)"
    return 0
  fi

  ssh "${HECATE_COORDINATOR_HOST}" "bash -lc ${inject_cmd@Q}"

  # A non-zero status is the expected outcome, so capture it instead of
  # letting errexit / the ERR trap react to it.
  local startup_status=0
  ssh "${HECATE_COORDINATOR_HOST}" "bash -lc ${startup_cmd@Q}" || startup_status=$?

  # 0   -> startup ran although the shutdown intent was active: guard broken.
  # 255 -> ssh reports its own errors with 255, so the result of the remote
  #        command is unknown and must not be counted as "startup blocked".
  if [[ "${startup_status}" -eq 0 || "${startup_status}" -eq 255 ]]; then
    ssh "${HECATE_COORDINATOR_HOST}" "bash -lc ${restore_cmd@Q}" || true
    if [[ "${startup_status}" -eq 0 ]]; then
      die "startup-intent-guard failed: startup unexpectedly succeeded while shutdown intent was active"
    fi
    die "startup-intent-guard inconclusive: ssh to ${HECATE_COORDINATOR_HOST} exited 255; startup result unknown"
  fi

  ssh "${HECATE_COORDINATOR_HOST}" "bash -lc ${restore_cmd@Q}"
  log "pass: startup-intent-guard"
}
|
|
|
|
# Entry point: check required tools, install the rollback trap, parse
# "list" / "run <drill-name> [--execute]" and dispatch to the drill.
#
# Exit status: 0 after "list", 2 for an unknown or missing sub-command,
#              1 (via die) for a missing/unknown drill or unknown option;
#              otherwise the status of the drill function.
main() {
  need_cmd "${KUBECTL}"
  need_cmd ssh
  need_cmd timeout
  trap on_err ERR

  local cmd="${1:-}"
  local drill=""
  case "${cmd}" in
    list)
      usage
      exit 0
      ;;
    run)
      shift || true
      drill="${1:-}"
      [[ -n "${drill}" ]] || die "missing drill name"
      shift || true
      while [[ $# -gt 0 ]]; do
        case "$1" in
          --execute) EXECUTE=1 ;;
          *) die "unknown option: $1" ;;
        esac
        shift
      done
      ;;
    *)
      usage
      exit 2
      ;;
  esac

  # Resolve the drill BEFORE opening the log: the drill name becomes part of
  # the log file name, so an unknown (or path-like) name must be rejected
  # without creating a file or redirecting output.
  local runner
  case "${drill}" in
    flux-gitea-deadlock) runner=run_drill_flux_gitea_deadlock ;;
    foundation-recovery) runner=run_drill_foundation_recovery ;;
    reconciliation-resume) runner=run_drill_reconciliation_resume ;;
    startup-intent-guard) runner=run_drill_startup_intent_guard ;;
    *) die "unknown drill: ${drill}" ;;
  esac

  write_log_header "${drill}"
  CURRENT_DRILL="${drill}"
  "${runner}"
}
|
|
|
|
# Script entry point: forward all command-line arguments to main.
main "$@"
|