2026-04-04 05:50:38 -03:00
#!/usr/bin/env bash
# Recovery drills driven through kubectl and an ssh-reachable hecate coordinator.
set -Eeuo pipefail

# Tunables. Each one keeps a value already present in the environment and
# otherwise falls back to the default shown here.
: "${KUBECTL:=kubectl}"
: "${HECATE_COORDINATOR_HOST:=titan-db}"
: "${HECATE_BIN:=/usr/local/bin/hecate}"
: "${HECATE_CONFIG:=/etc/hecate/hecate.yaml}"
# Optional command prefix (for example "ssh titan-db"); empty means no relay.
: "${HECATE_COORDINATOR_RELAY:=}"
LOG_DIR="${HECATE_DRILL_LOG_DIR:-/tmp/hecate-drills}"
STARTUP_TIMEOUT_SECONDS="${HECATE_DRILL_STARTUP_TIMEOUT_SECONDS:-1800}"

# 0 = plan mode (log intended actions only); set to 1 by the --execute flag.
EXECUTE=0
# Print the command synopsis, the list of drills and the operator notes
# to stdout.
usage() {
  cat <<'USAGE_TEXT'
Usage:
  scripts/hecate-drills.sh list
  scripts/hecate-drills.sh run <drill-name> [--execute]

Drills:
  flux-gitea-deadlock    Simulate flux-controller + gitea outage and require startup recovery.
  foundation-recovery    Simulate vault/postgres/gitea outage and require layered restore.
  reconciliation-resume  Simulate global Flux suspend + source-controller down and require resume.
  startup-intent-guard   Assert startup is blocked when shutdown intent is active.

Notes:
  - Drills are intentionally disruptive and are not part of regular `make test`.
  - Use --execute to run live changes. Without it, this script prints planned actions only.
  - Optional relay: set HECATE_COORDINATOR_RELAY="ssh titan-db" to run coordinator commands via a jump host.
USAGE_TEXT
}
# Write one "[drill] "-prefixed line to stdout; all arguments are joined
# into a single message.
log() {
  local message="$*"
  printf '%s %s\n' '[drill]' "${message}"
}
# Report a fatal error on stderr and terminate the shell with status 1.
die() {
  {
    printf '[drill] ERROR: %s\n' "$*"
  } >&2
  exit 1
}
# Abort via die unless the command named by $1 can be resolved by the shell.
need_cmd() {
  if ! command -v "$1" >/dev/null 2>&1; then
    die "missing required command: $1"
  fi
}
# Print the current UTC time as a compact stamp, e.g. 20260404T055038Z.
now_ts() {
  local -r layout='%Y%m%dT%H%M%SZ'
  date -u "+${layout}"
}
# Print "<namespace>|<kind>|<name>" (no trailing newline).
# Arguments: $1 - namespace, $2 - kind, $3 - name
resource_key() {
  local -r namespace="$1" resource_kind="$2" resource_name="$3"
  printf '%s' "${namespace}|${resource_kind}|${resource_name}"
}
# Print .spec.replicas of a workload as reported by kubectl.
# Arguments: $1 - namespace, $2 - kind, $3 - name
# Outputs:   kubectl's jsonpath output; "0" is printed when kubectl fails
#            (kubectl's stderr is discarded).
# NOTE(review): a failed query is indistinguishable from a real 0 for the
# caller, so a rollback based on this value would scale to 0 — confirm that
# this is intended.
get_replicas() {
  local -a query=(-n "$1" get "$2" "$3" -o 'jsonpath={.spec.replicas}')
  if ! "${KUBECTL}" "${query[@]}" 2>/dev/null; then
    echo "0"
  fi
}
# Scale a workload to a replica count, or only log the intent in plan mode.
# Arguments: $1 - namespace, $2 - kind, $3 - name, $4 - replicas
# Globals:   EXECUTE, KUBECTL (read)
# Returns:   0 in plan mode, otherwise kubectl's exit status
scale_to() {
  local target_ns="$1" target_kind="$2" target_name="$3" count="$4"
  if [[ "${EXECUTE}" -ne 0 ]]; then
    "${KUBECTL}" -n "${target_ns}" scale "${target_kind}" "${target_name}" --replicas="${count}" >/dev/null
    return
  fi
  log "plan: kubectl -n ${target_ns} scale ${target_kind} ${target_name} --replicas=${count}"
  return 0
}
# Block until `kubectl rollout status` succeeds for a workload, or only log
# the intent in plan mode.
# Arguments: $1 - namespace, $2 - kind, $3 - name, $4 - value for --timeout
# Globals:   EXECUTE, KUBECTL (read)
# Returns:   0 in plan mode, otherwise kubectl's exit status
wait_ready() {
  local target_ns="$1" target_kind="$2" target_name="$3" limit="$4"
  local workload="${target_kind}/${target_name}"
  if [[ "${EXECUTE}" -ne 0 ]]; then
    "${KUBECTL}" -n "${target_ns}" rollout status "${workload}" --timeout="${limit}" >/dev/null
    return
  fi
  log "plan: kubectl -n ${target_ns} rollout status ${workload} --timeout=${limit}"
  return 0
}
# Run `hecate startup` on the coordinator over ssh, bounded by
# STARTUP_TIMEOUT_SECONDS, or only log the ssh command line in plan mode.
# Arguments: $1 - text passed to hecate's --reason flag
# Globals:   EXECUTE, HECATE_BIN, HECATE_CONFIG, HECATE_COORDINATOR_HOST,
#            HECATE_COORDINATOR_RELAY, STARTUP_TIMEOUT_SECONDS (read)
# Returns:   0 in plan mode, otherwise the exit status of timeout/ssh
run_hecate_startup() {
  local why="$1"
  local -a startup=(
    sudo "${HECATE_BIN}" startup --config "${HECATE_CONFIG}"
    --execute --force-flux-branch main --reason "${why}"
  )

  if [[ "${EXECUTE}" -eq 0 ]]; then
    local via="${HECATE_COORDINATOR_HOST}"
    if [[ -n "${HECATE_COORDINATOR_RELAY}" ]]; then
      via+=" ${HECATE_COORDINATOR_RELAY}"
    fi
    log "plan: ssh ${via} '${startup[*]}'"
    return 0
  fi

  local -a hop=("${HECATE_COORDINATOR_HOST}")
  if [[ -n "${HECATE_COORDINATOR_RELAY}" ]]; then
    # The relay is a command prefix such as "ssh titan-db"; it is split into
    # words on purpose.
    # shellcheck disable=SC2206
    hop+=(${HECATE_COORDINATOR_RELAY})
  fi
  timeout "${STARTUP_TIMEOUT_SECONDS}" ssh "${hop[@]}" "${startup[@]}"
}
# Feed a script on stdin to `bash -se` on the coordinator over ssh,
# optionally through the relay command prefix.
# Arguments: $1 - script text
# Globals:   HECATE_COORDINATOR_HOST, HECATE_COORDINATOR_RELAY (read)
# Returns:   exit status of the printf | ssh pipeline
run_coordinator_bash() {
  local body="$1"
  local -a remote=("${HECATE_COORDINATOR_HOST}")
  if [[ -n "${HECATE_COORDINATOR_RELAY}" ]]; then
    # The relay is a command prefix; it is split into words on purpose.
    # shellcheck disable=SC2206
    remote+=(${HECATE_COORDINATOR_RELAY})
  fi
  remote+=("bash -se")
  printf '%s\n' "${body}" | ssh "${remote[@]}"
}
# Mutable drill state, shared between the drills and the rollback handler.

# "<ns>|<kind>|<name>" -> replica count captured by snapshot_resources.
declare -A SNAPSHOT_REPLICAS=()
# "<ns>|<kind>|<name>|<n>" specs the active drill has touched.
declare -a CURRENT_RESOURCES=()
# Name of the drill being run (used in rollback log output).
CURRENT_DRILL=""
# 1 while a failure must also restore the Flux suspend flags.
ROLLBACK_FLUX_SUSPEND=0
# Newline-separated "<namespace>/<name>" lists of objects that were already
# suspended before the drill started.
SUSPENDED_KS_BEFORE=""
SUSPENDED_HR_BEFORE=""
# ERR-trap handler: roll back whatever the active drill changed, then exit
# with the status of the command that failed.
# Globals: CURRENT_DRILL, CURRENT_RESOURCES, ROLLBACK_FLUX_SUSPEND (read)
on_err() {
  local code=$?

  # The script runs with `set -E`, so this trap is inherited by subshells
  # (command substitutions, process substitutions, pipeline stages). A
  # rollback started from a child would run while the parent keeps going,
  # and its log lines would end up in whatever is capturing the child's
  # stdout. In a child, only propagate the status; the main shell performs
  # the rollback if the failure reaches it.
  if [[ "${BASHPID}" != "$$" ]]; then
    exit "${code}"
  fi

  # Rollback steps must never re-enter this handler.
  trap - ERR

  log "failure detected in drill '${CURRENT_DRILL}' (exit=${code}); starting rollback"
  if [[ "${ROLLBACK_FLUX_SUSPEND}" -eq 1 ]]; then
    restore_flux_suspended_before || true
  fi
  if [[ ${#CURRENT_RESOURCES[@]} -gt 0 ]]; then
    restore_resources "${CURRENT_RESOURCES[@]}" || true
  fi
  exit "${code}"
}
2026-04-04 05:50:38 -03:00
# Record the current replica count of every given resource, replacing any
# previous snapshot.
# Arguments: zero or more "<ns>|<kind>|<name>|..." specs (fields after the
#            name are ignored here)
# Globals:   SNAPSHOT_REPLICAS (written)
snapshot_resources() {
  # Keep the loop variables local so they do not leak into global scope.
  local res ns kind name
  SNAPSHOT_REPLICAS=()
  # Iterate "$@" directly: an empty argument list is then safe under
  # `set -u` on every Bash version.
  for res in "$@"; do
    IFS='|' read -r ns kind name _ <<< "${res}"
    SNAPSHOT_REPLICAS["$(resource_key "$ns" "$kind" "$name")"]="$(get_replicas "$ns" "$kind" "$name")"
  done
}
# Scale every given resource back to its snapshotted replica count.
# Arguments: zero or more "<ns>|<kind>|<name>[|<default-replicas>]" specs
# Globals:   SNAPSHOT_REPLICAS (read)
# Best effort: a failing scale_to does not stop the remaining restores.
restore_resources() {
  # Keep the loop variables local so they do not leak into global scope.
  local res ns kind name fallback key replicas
  for res in "$@"; do
    IFS='|' read -r ns kind name fallback _ <<< "${res}"
    # The 4th field of the spec is the default used when no usable snapshot
    # exists; anything that is not a plain number falls back to 1.
    if [[ ! "${fallback}" =~ ^[0-9]+$ ]]; then
      fallback=1
    fi
    key="$(resource_key "$ns" "$kind" "$name")"
    replicas="${SNAPSHOT_REPLICAS[${key}]:-${fallback}}"
    log "rollback replicas: ${ns}/${kind}/${name} -> ${replicas}"
    scale_to "$ns" "$kind" "$name" "$replicas" || true
  done
}
# Remember which Flux Kustomizations and HelmReleases (all namespaces) have
# spec.suspend==true right now, one "<namespace>/<name>" per line.
# Globals: SUSPENDED_KS_BEFORE, SUSPENDED_HR_BEFORE (written)
# A failing kubectl query is tolerated and leaves whatever output it printed.
record_flux_suspended_before() {
  local -r only_suspended='{range .items[?(@.spec.suspend==true)]}{.metadata.namespace}/{.metadata.name}{"\n"}{end}'
  SUSPENDED_KS_BEFORE="$(
    "${KUBECTL}" get kustomizations.kustomize.toolkit.fluxcd.io -A -o "jsonpath=${only_suspended}" || true
  )"
  SUSPENDED_HR_BEFORE="$(
    "${KUBECTL}" get helmreleases.helm.toolkit.fluxcd.io -A -o "jsonpath=${only_suspended}" || true
  )"
}
# Set spec.suspend on every Flux Kustomization and HelmRelease in the
# cluster, or only log the intent in plan mode.
# Arguments: $1 - "true" or "false"
# Globals:   EXECUTE, KUBECTL (read)
# Individual patch failures are tolerated (best effort per object).
set_flux_suspend_all() {
  local value="$1"
  # The value is spliced into a JSON patch, so accept only JSON booleans.
  case "${value}" in
    true | false) ;;
    *) die "set_flux_suspend_all: expected 'true' or 'false', got '${value}'" ;;
  esac

  if [[ "${EXECUTE}" -eq 0 ]]; then
    log "plan: patch all Flux kustomizations + helmreleases suspend=${value}"
    return 0
  fi

  local patch ref
  patch="$(printf '{"spec":{"suspend":%s}}' "${value}")"
  local -r all_refs='{range .items[*]}{.metadata.namespace}{"/"}{.metadata.name}{"\n"}{end}'

  # Kustomizations are listed across all namespaces, the same scope that is
  # used for HelmReleases below and for recording/verifying the pre-drill
  # suspend state, so "all" really covers every namespace.
  while read -r ref; do
    [[ -z "${ref}" ]] && continue
    "${KUBECTL}" -n "${ref%%/*}" patch kustomization "${ref##*/}" --type=merge -p "${patch}" >/dev/null || true
  done < <("${KUBECTL}" get kustomizations.kustomize.toolkit.fluxcd.io -A -o "jsonpath=${all_refs}")

  while read -r ref; do
    [[ -z "${ref}" ]] && continue
    "${KUBECTL}" -n "${ref%%/*}" patch helmrelease "${ref##*/}" --type=merge -p "${patch}" >/dev/null || true
  done < <("${KUBECTL}" get helmreleases.helm.toolkit.fluxcd.io -A -o "jsonpath=${all_refs}")
}
# Put the Flux suspend flags back to their recorded pre-drill state:
# first un-suspend everything via set_flux_suspend_all, then re-suspend the
# objects listed in SUSPENDED_KS_BEFORE / SUSPENDED_HR_BEFORE.
# Globals: EXECUTE, KUBECTL, SUSPENDED_KS_BEFORE, SUSPENDED_HR_BEFORE (read)
# In plan mode only the set_flux_suspend_all call happens. Individual patch
# failures are tolerated.
restore_flux_suspended_before() {
  set_flux_suspend_all false

  if [[ "${EXECUTE}" -ne 0 ]]; then
    local resuspend='{"spec":{"suspend":true}}'
    local resource listing entry
    for resource in kustomization helmrelease; do
      if [[ "${resource}" == kustomization ]]; then
        listing="${SUSPENDED_KS_BEFORE}"
      else
        listing="${SUSPENDED_HR_BEFORE}"
      fi
      # Each entry is "<namespace>/<name>"; blank lines are skipped.
      while read -r entry; do
        [[ -z "${entry}" ]] && continue
        "${KUBECTL}" -n "${entry%%/*}" patch "${resource}" "${entry##*/}" --type=merge -p "${resuspend}" >/dev/null || true
      done <<< "${listing}"
    done
  fi
}
2026-04-04 05:59:12 -03:00
# Filter stdin to stdout: drop empty lines, then sort what is left.
normalize_lines() {
  awk 'length($0) > 0' | sort
}
# Assert that the set of suspended Flux Kustomizations and HelmReleases
# (all namespaces) equals the set recorded before the drill.
# Globals: EXECUTE, KUBECTL, SUSPENDED_KS_BEFORE, SUSPENDED_HR_BEFORE (read)
# Exits via die on drift, or when the current state cannot be queried.
verify_flux_suspend_state_restored() {
  if [[ "${EXECUTE}" -eq 0 ]]; then
    log "plan: verify Flux suspended objects match pre-drill state"
    return 0
  fi

  local -r only_suspended='{range .items[?(@.spec.suspend==true)]}{.metadata.namespace}/{.metadata.name}{"\n"}{end}'
  local current_ks current_hr
  # A failed query must fail the check. Treating it as "nothing suspended"
  # would report a pass whenever nothing was suspended before the drill.
  current_ks="$("${KUBECTL}" get kustomizations.kustomize.toolkit.fluxcd.io -A -o "jsonpath=${only_suspended}")" \
    || die "unable to list kustomizations; cannot verify suspend state"
  current_hr="$("${KUBECTL}" get helmreleases.helm.toolkit.fluxcd.io -A -o "jsonpath=${only_suspended}")" \
    || die "unable to list helmreleases; cannot verify suspend state"

  # Compare as sorted sets with blank lines removed, so ordering and a
  # trailing newline do not matter.
  local expected_ks expected_hr got_ks got_hr
  expected_ks="$(printf '%s\n' "${SUSPENDED_KS_BEFORE}" | normalize_lines)"
  expected_hr="$(printf '%s\n' "${SUSPENDED_HR_BEFORE}" | normalize_lines)"
  got_ks="$(printf '%s\n' "${current_ks}" | normalize_lines)"
  got_hr="$(printf '%s\n' "${current_hr}" | normalize_lines)"

  [[ "${got_ks}" == "${expected_ks}" ]] || die "kustomization suspend-state drift detected"
  [[ "${got_hr}" == "${expected_hr}" ]] || die "helmrelease suspend-state drift detected"
}
# Start a per-run log: from here on stdout and stderr of the script are
# both piped through `tee -a` into "<LOG_DIR>/<drill>-<timestamp>.log",
# then a header line describing the run is logged.
# Arguments: $1 - drill name (becomes part of the log file name)
# Globals:   LOG_DIR, EXECUTE, HECATE_COORDINATOR_HOST (read)
write_log_header() {
  local drill="$1"
  local logfile
  mkdir -p "${LOG_DIR}"
  printf -v logfile '%s/%s-%s.log' "${LOG_DIR}" "${drill}" "$(now_ts)"
  exec > >(tee -a "${logfile}") 2>&1
  log "drill=${drill} execute=${EXECUTE} coordinator=${HECATE_COORDINATOR_HOST}"
}
# Drill: scale the four Flux controllers and gitea to 0, run hecate startup,
# then require every one of them to roll out again.
# Globals: CURRENT_RESOURCES, ROLLBACK_FLUX_SUSPEND (written)
# CURRENT_RESOURCES stays populated while the outage is active so that the
# failure handler knows what to scale back; it is cleared on success.
run_drill_flux_gitea_deadlock() {
  # Keep the loop variables local so they do not leak into global scope.
  local res ns kind name

  CURRENT_RESOURCES=(
    "flux-system|deployment|source-controller|1"
    "flux-system|deployment|kustomize-controller|1"
    "flux-system|deployment|helm-controller|1"
    "flux-system|deployment|notification-controller|1"
    "gitea|deployment|gitea|1"
  )
  snapshot_resources "${CURRENT_RESOURCES[@]}"
  ROLLBACK_FLUX_SUSPEND=0

  log "injecting outage: flux controllers + gitea"
  for res in "${CURRENT_RESOURCES[@]}"; do
    IFS='|' read -r ns kind name _ <<< "${res}"
    scale_to "$ns" "$kind" "$name" 0
  done

  run_hecate_startup "drill-flux-gitea-deadlock"

  log "verifying recovery"
  wait_ready flux-system deployment source-controller 240s
  wait_ready flux-system deployment kustomize-controller 240s
  wait_ready flux-system deployment helm-controller 240s
  wait_ready flux-system deployment notification-controller 240s
  wait_ready gitea deployment gitea 300s

  log "pass: flux-gitea-deadlock"
  CURRENT_RESOURCES=()
}
# Drill: scale vault, postgres and gitea to 0, run hecate startup, then
# require all three to roll out again.
# Globals: CURRENT_RESOURCES, ROLLBACK_FLUX_SUSPEND (written)
# CURRENT_RESOURCES stays populated while the outage is active so that the
# failure handler knows what to scale back; it is cleared on success.
run_drill_foundation_recovery() {
  # Keep the loop variables local so they do not leak into global scope.
  local res ns kind name

  CURRENT_RESOURCES=(
    "vault|statefulset|vault|1"
    "postgres|statefulset|postgres|1"
    "gitea|deployment|gitea|1"
  )
  snapshot_resources "${CURRENT_RESOURCES[@]}"
  ROLLBACK_FLUX_SUSPEND=0

  log "injecting outage: vault + postgres + gitea"
  for res in "${CURRENT_RESOURCES[@]}"; do
    IFS='|' read -r ns kind name _ <<< "${res}"
    scale_to "$ns" "$kind" "$name" 0
  done

  run_hecate_startup "drill-foundation-recovery"

  log "verifying layered recovery"
  wait_ready vault statefulset vault 420s
  wait_ready postgres statefulset postgres 420s
  wait_ready gitea deployment gitea 300s

  log "pass: foundation-recovery"
  CURRENT_RESOURCES=()
}
# Drill: suspend every Flux object and stop source-controller, run hecate
# startup, then require source-controller to roll out and the suspend flags
# to match their pre-drill state.
# Globals: CURRENT_RESOURCES, ROLLBACK_FLUX_SUSPEND (written)
# ROLLBACK_FLUX_SUSPEND is 1 only while the suspend flags are modified, so a
# failure in that window also restores them.
run_drill_reconciliation_resume() {
  local -r flux_ns="flux-system" controller="source-controller"

  CURRENT_RESOURCES=("${flux_ns}|deployment|${controller}|1")
  snapshot_resources "${CURRENT_RESOURCES[@]}"
  record_flux_suspended_before
  ROLLBACK_FLUX_SUSPEND=1

  log "injecting outage: suspend all Flux objects + stop source-controller"
  set_flux_suspend_all true
  scale_to "${flux_ns}" deployment "${controller}" 0

  run_hecate_startup "drill-reconciliation-resume"

  log "verifying reconciliation resumed"
  wait_ready "${flux_ns}" deployment "${controller}" 240s
  verify_flux_suspend_state_restored

  log "pass: reconciliation-resume"
  CURRENT_RESOURCES=()
  ROLLBACK_FLUX_SUSPEND=0
}
2026-04-04 12:44:15 -03:00
# Drill: write a "shutting_down" intent file on the coordinator, run hecate
# startup and require it to be refused, then put the previous intent back.
# Globals: EXECUTE, HECATE_BIN, HECATE_CONFIG, HECATE_COORDINATOR_HOST (read)
# Exits via die when startup succeeds, when the result is inconclusive
# because ssh itself failed, or when the prior intent cannot be restored.
run_drill_startup_intent_guard() {
  local intent_path="/var/lib/hecate/intent.json"
  local backup_path="/tmp/hecate-intent-pre-drill.json"

  # Existence checks go through sudo like every other step, so an intent
  # file the login user cannot stat is still backed up instead of being
  # treated as absent (and then deleted by the restore step).
  # The timestamp is expanded locally; the remote heredoc is quoted.
  local inject_cmd="
if sudo test -f '${intent_path}'; then sudo cp '${intent_path}' '${backup_path}'; else sudo rm -f '${backup_path}'; fi
cat <<'JSON' | sudo tee '${intent_path}' >/dev/null
{\"state\":\"shutting_down\",\"reason\":\"drill-intent-guard\",\"source\":\"drill\",\"updated_at\":\"$(date -u +%Y-%m-%dT%H:%M:%SZ)\"}
JSON
"
  local restore_cmd="
if sudo test -f '${backup_path}'; then
  sudo mv '${backup_path}' '${intent_path}'
else
  sudo rm -f '${intent_path}'
fi
"
  local startup_cmd="sudo ${HECATE_BIN} startup --config ${HECATE_CONFIG} --execute --force-flux-branch main --reason drill-startup-intent-guard"

  if [[ "${EXECUTE}" -eq 0 ]]; then
    log "plan: ssh ${HECATE_COORDINATOR_HOST} '<inject shutdown intent>'"
    log "plan: ssh ${HECATE_COORDINATOR_HOST} '${startup_cmd}' (expect failure)"
    log "plan: ssh ${HECATE_COORDINATOR_HOST} '<restore prior intent>'"
    log "pass: startup-intent-guard (plan mode)"
    return 0
  fi

  run_coordinator_bash "${inject_cmd}"

  local rc=0
  run_coordinator_bash "${startup_cmd}" || rc=$?

  if [[ "${rc}" -eq 0 ]]; then
    run_coordinator_bash "${restore_cmd}" || true
    die "startup-intent-guard failed: startup unexpectedly succeeded while shutdown intent was active"
  fi
  # ssh reserves exit status 255 for its own errors. In that case nothing
  # is known about the guard, so the drill must not be reported as passed.
  if [[ "${rc}" -eq 255 ]]; then
    run_coordinator_bash "${restore_cmd}" || true
    die "startup-intent-guard inconclusive: ssh to ${HECATE_COORDINATOR_HOST} failed (exit 255) while running startup"
  fi

  run_coordinator_bash "${restore_cmd}" \
    || die "startup-intent-guard: could not restore prior intent; shutdown intent may still be active on ${HECATE_COORDINATOR_HOST}"
  log "pass: startup-intent-guard"
}
2026-04-04 05:50:38 -03:00
# Command-line entry point.
#   list                         print usage and exit 0
#   run <drill-name> [--execute] run one drill (plan mode unless --execute)
# Anything else prints usage and exits 2.
# Globals: EXECUTE, CURRENT_DRILL (written); KUBECTL (read)
# NOTE(review): die() terminates with `exit`, which does not fire the ERR
# trap, so failures reported through die skip the on_err rollback.
main() {
  local cmd="${1:-}"
  local drill="" runner=""

  case "${cmd}" in
    list)
      # Showing help needs none of the external tools.
      usage
      exit 0
      ;;
    run) ;;
    *)
      usage
      exit 2
      ;;
  esac
  shift

  drill="${1:-}"
  [[ -n "${drill}" ]] || die "missing drill name"
  shift

  while [[ $# -gt 0 ]]; do
    case "$1" in
      --execute) EXECUTE=1 ;;
      *) die "unknown option: $1" ;;
    esac
    shift
  done

  # Resolve the drill before any side effect: the name becomes part of the
  # log file path, so only known names may get that far.
  case "${drill}" in
    flux-gitea-deadlock) runner=run_drill_flux_gitea_deadlock ;;
    foundation-recovery) runner=run_drill_foundation_recovery ;;
    reconciliation-resume) runner=run_drill_reconciliation_resume ;;
    startup-intent-guard) runner=run_drill_startup_intent_guard ;;
    *) die "unknown drill: ${drill}" ;;
  esac

  need_cmd "${KUBECTL}"
  need_cmd ssh
  need_cmd timeout

  trap on_err ERR

  write_log_header "${drill}"
  CURRENT_DRILL="${drill}"
  "${runner}"
}
main " $@ "