hecate: harden startup recovery and drill rollback safety

This commit is contained in:
Brad Stein 2026-04-04 05:59:12 -03:00
parent aa9c7b69f3
commit 014039eea2
2 changed files with 70 additions and 45 deletions

View File

@@ -91,44 +91,40 @@ func (o *Orchestrator) Startup(ctx context.Context, opts StartupOptions) (err er
} }
} }
needsBootstrap := false needsLocalBootstrap := false
bootstrapReasons := []string{} bootstrapReasons := []string{}
if !opts.SkipLocalBootstrap { if !opts.SkipLocalBootstrap {
ready, readyErr := o.fluxSourceReady(ctx) ready, readyErr := o.fluxSourceReady(ctx)
if readyErr != nil { if readyErr != nil {
o.log.Printf("warning: unable to read flux source readiness: %v", readyErr) o.log.Printf("warning: unable to read flux source readiness: %v", readyErr)
needsBootstrap = true needsLocalBootstrap = true
bootstrapReasons = append(bootstrapReasons, "flux source readiness check failed") bootstrapReasons = append(bootstrapReasons, "flux source readiness check failed")
} }
if !ready { if !ready {
needsBootstrap = true needsLocalBootstrap = true
bootstrapReasons = append(bootstrapReasons, "flux source not ready") bootstrapReasons = append(bootstrapReasons, "flux source not ready")
} }
missing, missingErr := o.missingCriticalStartupWorkloads(ctx)
if missingErr != nil {
o.log.Printf("warning: unable to inspect critical startup workloads: %v", missingErr)
needsBootstrap = true
bootstrapReasons = append(bootstrapReasons, "critical workload readiness check failed")
}
if len(missing) > 0 {
needsBootstrap = true
bootstrapReasons = append(bootstrapReasons, "critical workloads not ready: "+strings.Join(missing, ", "))
}
if needsBootstrap {
o.log.Printf("startup bootstrap required: %s", strings.Join(bootstrapReasons, "; "))
if err := o.bootstrapLocal(ctx); err != nil {
return err
}
}
} }
missing, missingErr := o.missingCriticalStartupWorkloads(ctx)
if missingErr != nil {
o.log.Printf("warning: unable to inspect critical startup workloads: %v", missingErr)
}
if len(missing) > 0 {
o.log.Printf("startup critical workloads not ready; applying targeted recovery first: %s", strings.Join(missing, ", "))
}
if err := o.ensureCriticalStartupWorkloads(ctx); err != nil { if err := o.ensureCriticalStartupWorkloads(ctx); err != nil {
return err return err
} }
if !opts.SkipLocalBootstrap && needsBootstrap { if !opts.SkipLocalBootstrap && needsLocalBootstrap {
o.log.Printf("startup bootstrap required: %s", strings.Join(bootstrapReasons, "; "))
if err := o.bootstrapLocal(ctx); err != nil {
return err
}
if err := o.ensureCriticalStartupWorkloads(ctx); err != nil {
return err
}
ready, err := o.fluxSourceReady(ctx) ready, err := o.fluxSourceReady(ctx)
if err != nil { if err != nil {
return fmt.Errorf("flux source readiness after bootstrap: %w", err) return fmt.Errorf("flux source readiness after bootstrap: %w", err)

View File

@@ -1,5 +1,5 @@
#!/usr/bin/env bash #!/usr/bin/env bash
set -euo pipefail set -Eeuo pipefail
KUBECTL="${KUBECTL:-kubectl}" KUBECTL="${KUBECTL:-kubectl}"
HECATE_COORDINATOR_HOST="${HECATE_COORDINATOR_HOST:-titan-db}" HECATE_COORDINATOR_HOST="${HECATE_COORDINATOR_HOST:-titan-db}"
@@ -84,6 +84,21 @@ run_hecate_startup() {
declare -A SNAPSHOT_REPLICAS=() declare -A SNAPSHOT_REPLICAS=()
SUSPENDED_KS_BEFORE="" SUSPENDED_KS_BEFORE=""
SUSPENDED_HR_BEFORE="" SUSPENDED_HR_BEFORE=""
CURRENT_DRILL=""
CURRENT_RESOURCES=()
ROLLBACK_FLUX_SUSPEND=0
on_err() {
local code=$?
log "failure detected in drill '${CURRENT_DRILL}' (exit=${code}); starting rollback"
if [[ "${ROLLBACK_FLUX_SUSPEND}" -eq 1 ]]; then
restore_flux_suspended_before || true
fi
if [[ ${#CURRENT_RESOURCES[@]} -gt 0 ]]; then
restore_resources "${CURRENT_RESOURCES[@]}" || true
fi
exit "${code}"
}
snapshot_resources() { snapshot_resources() {
local resources=("$@") local resources=("$@")
@@ -152,16 +167,27 @@ restore_flux_suspended_before() {
done <<<"${SUSPENDED_HR_BEFORE}" done <<<"${SUSPENDED_HR_BEFORE}"
} }
verify_flux_unsuspended() { normalize_lines() {
sed '/^$/d' | sort
}
verify_flux_suspend_state_restored() {
if [[ "${EXECUTE}" -eq 0 ]]; then if [[ "${EXECUTE}" -eq 0 ]]; then
log "plan: verify no Flux kustomizations/helmreleases remain suspended" log "plan: verify Flux suspended objects match pre-drill state"
return 0 return 0
fi fi
local ks_count hr_count local current_ks current_hr
ks_count="$("${KUBECTL}" get kustomizations.kustomize.toolkit.fluxcd.io -A -o jsonpath='{range .items[?(@.spec.suspend==true)]}{.metadata.name}{"\n"}{end}' | sed '/^$/d' | wc -l | tr -d ' ')" current_ks="$("${KUBECTL}" get kustomizations.kustomize.toolkit.fluxcd.io -A -o jsonpath='{range .items[?(@.spec.suspend==true)]}{.metadata.namespace}/{.metadata.name}{"\n"}{end}' || true)"
hr_count="$("${KUBECTL}" get helmreleases.helm.toolkit.fluxcd.io -A -o jsonpath='{range .items[?(@.spec.suspend==true)]}{.metadata.name}{"\n"}{end}' | sed '/^$/d' | wc -l | tr -d ' ')" current_hr="$("${KUBECTL}" get helmreleases.helm.toolkit.fluxcd.io -A -o jsonpath='{range .items[?(@.spec.suspend==true)]}{.metadata.namespace}/{.metadata.name}{"\n"}{end}' || true)"
[[ "${ks_count}" == "0" ]] || die "kustomizations still suspended: ${ks_count}"
[[ "${hr_count}" == "0" ]] || die "helmreleases still suspended: ${hr_count}" local expected_ks expected_hr got_ks got_hr
expected_ks="$(printf '%s\n' "${SUSPENDED_KS_BEFORE}" | normalize_lines)"
expected_hr="$(printf '%s\n' "${SUSPENDED_HR_BEFORE}" | normalize_lines)"
got_ks="$(printf '%s\n' "${current_ks}" | normalize_lines)"
got_hr="$(printf '%s\n' "${current_hr}" | normalize_lines)"
[[ "${got_ks}" == "${expected_ks}" ]] || die "kustomization suspend-state drift detected"
[[ "${got_hr}" == "${expected_hr}" ]] || die "helmrelease suspend-state drift detected"
} }
write_log_header() { write_log_header() {
@@ -173,18 +199,18 @@ write_log_header() {
} }
run_drill_flux_gitea_deadlock() { run_drill_flux_gitea_deadlock() {
local resources=( CURRENT_RESOURCES=(
"flux-system|deployment|source-controller|1" "flux-system|deployment|source-controller|1"
"flux-system|deployment|kustomize-controller|1" "flux-system|deployment|kustomize-controller|1"
"flux-system|deployment|helm-controller|1" "flux-system|deployment|helm-controller|1"
"flux-system|deployment|notification-controller|1" "flux-system|deployment|notification-controller|1"
"gitea|deployment|gitea|1" "gitea|deployment|gitea|1"
) )
snapshot_resources "${resources[@]}" snapshot_resources "${CURRENT_RESOURCES[@]}"
trap 'restore_resources "${resources[@]}"' ERR ROLLBACK_FLUX_SUSPEND=0
log "injecting outage: flux controllers + gitea" log "injecting outage: flux controllers + gitea"
for res in "${resources[@]}"; do for res in "${CURRENT_RESOURCES[@]}"; do
IFS='|' read -r ns kind name _ <<<"${res}" IFS='|' read -r ns kind name _ <<<"${res}"
scale_to "$ns" "$kind" "$name" 0 scale_to "$ns" "$kind" "$name" 0
done done
@@ -198,20 +224,20 @@ run_drill_flux_gitea_deadlock() {
wait_ready flux-system deployment notification-controller 240s wait_ready flux-system deployment notification-controller 240s
wait_ready gitea deployment gitea 300s wait_ready gitea deployment gitea 300s
log "pass: flux-gitea-deadlock" log "pass: flux-gitea-deadlock"
trap - ERR CURRENT_RESOURCES=()
} }
run_drill_foundation_recovery() { run_drill_foundation_recovery() {
local resources=( CURRENT_RESOURCES=(
"vault|statefulset|vault|1" "vault|statefulset|vault|1"
"postgres|statefulset|postgres|1" "postgres|statefulset|postgres|1"
"gitea|deployment|gitea|1" "gitea|deployment|gitea|1"
) )
snapshot_resources "${resources[@]}" snapshot_resources "${CURRENT_RESOURCES[@]}"
trap 'restore_resources "${resources[@]}"' ERR ROLLBACK_FLUX_SUSPEND=0
log "injecting outage: vault + postgres + gitea" log "injecting outage: vault + postgres + gitea"
for res in "${resources[@]}"; do for res in "${CURRENT_RESOURCES[@]}"; do
IFS='|' read -r ns kind name _ <<<"${res}" IFS='|' read -r ns kind name _ <<<"${res}"
scale_to "$ns" "$kind" "$name" 0 scale_to "$ns" "$kind" "$name" 0
done done
@@ -223,14 +249,14 @@ run_drill_foundation_recovery() {
wait_ready postgres statefulset postgres 420s wait_ready postgres statefulset postgres 420s
wait_ready gitea deployment gitea 300s wait_ready gitea deployment gitea 300s
log "pass: foundation-recovery" log "pass: foundation-recovery"
trap - ERR CURRENT_RESOURCES=()
} }
run_drill_reconciliation_resume() { run_drill_reconciliation_resume() {
local resources=("flux-system|deployment|source-controller|1") CURRENT_RESOURCES=("flux-system|deployment|source-controller|1")
snapshot_resources "${resources[@]}" snapshot_resources "${CURRENT_RESOURCES[@]}"
record_flux_suspended_before record_flux_suspended_before
trap 'restore_flux_suspended_before; restore_resources "${resources[@]}"' ERR ROLLBACK_FLUX_SUSPEND=1
log "injecting outage: suspend all Flux objects + stop source-controller" log "injecting outage: suspend all Flux objects + stop source-controller"
set_flux_suspend_all true set_flux_suspend_all true
@@ -240,15 +266,17 @@ run_drill_reconciliation_resume() {
log "verifying reconciliation resumed" log "verifying reconciliation resumed"
wait_ready flux-system deployment source-controller 240s wait_ready flux-system deployment source-controller 240s
verify_flux_unsuspended verify_flux_suspend_state_restored
log "pass: reconciliation-resume" log "pass: reconciliation-resume"
trap - ERR CURRENT_RESOURCES=()
ROLLBACK_FLUX_SUSPEND=0
} }
main() { main() {
need_cmd "${KUBECTL}" need_cmd "${KUBECTL}"
need_cmd ssh need_cmd ssh
need_cmd timeout need_cmd timeout
trap on_err ERR
local cmd="${1:-}" local cmd="${1:-}"
case "${cmd}" in case "${cmd}" in
@@ -269,6 +297,7 @@ main() {
shift shift
done done
write_log_header "${drill}" write_log_header "${drill}"
CURRENT_DRILL="${drill}"
;; ;;
*) *)
usage usage