From 014039eea27fee4d63370f2a9d20f9fff46b03e8 Mon Sep 17 00:00:00 2001 From: Brad Stein Date: Sat, 4 Apr 2026 05:59:12 -0300 Subject: [PATCH] hecate: harden startup recovery and drill rollback safety --- internal/cluster/orchestrator.go | 40 ++++++++--------- scripts/hecate-drills.sh | 75 ++++++++++++++++++++++---------- 2 files changed, 70 insertions(+), 45 deletions(-) diff --git a/internal/cluster/orchestrator.go b/internal/cluster/orchestrator.go index 226bb8f..5c86add 100644 --- a/internal/cluster/orchestrator.go +++ b/internal/cluster/orchestrator.go @@ -91,44 +91,40 @@ func (o *Orchestrator) Startup(ctx context.Context, opts StartupOptions) (err er } } - needsBootstrap := false + needsLocalBootstrap := false bootstrapReasons := []string{} if !opts.SkipLocalBootstrap { ready, readyErr := o.fluxSourceReady(ctx) if readyErr != nil { o.log.Printf("warning: unable to read flux source readiness: %v", readyErr) - needsBootstrap = true + needsLocalBootstrap = true bootstrapReasons = append(bootstrapReasons, "flux source readiness check failed") } if !ready { - needsBootstrap = true + needsLocalBootstrap = true bootstrapReasons = append(bootstrapReasons, "flux source not ready") } - - missing, missingErr := o.missingCriticalStartupWorkloads(ctx) - if missingErr != nil { - o.log.Printf("warning: unable to inspect critical startup workloads: %v", missingErr) - needsBootstrap = true - bootstrapReasons = append(bootstrapReasons, "critical workload readiness check failed") - } - if len(missing) > 0 { - needsBootstrap = true - bootstrapReasons = append(bootstrapReasons, "critical workloads not ready: "+strings.Join(missing, ", ")) - } - - if needsBootstrap { - o.log.Printf("startup bootstrap required: %s", strings.Join(bootstrapReasons, "; ")) - if err := o.bootstrapLocal(ctx); err != nil { - return err - } - } } + missing, missingErr := o.missingCriticalStartupWorkloads(ctx) + if missingErr != nil { + o.log.Printf("warning: unable to inspect critical startup workloads: %v", missingErr) + } + if len(missing) > 0 { + o.log.Printf("startup critical workloads not ready; applying targeted recovery first: %s", strings.Join(missing, ", ")) + } if err := o.ensureCriticalStartupWorkloads(ctx); err != nil { return err } - if !opts.SkipLocalBootstrap && needsBootstrap { + if !opts.SkipLocalBootstrap && needsLocalBootstrap { + o.log.Printf("startup bootstrap required: %s", strings.Join(bootstrapReasons, "; ")) + if err := o.bootstrapLocal(ctx); err != nil { + return err + } + if err := o.ensureCriticalStartupWorkloads(ctx); err != nil { + return err + } ready, err := o.fluxSourceReady(ctx) if err != nil { return fmt.Errorf("flux source readiness after bootstrap: %w", err) diff --git a/scripts/hecate-drills.sh b/scripts/hecate-drills.sh index f9865e9..e1f8f91 100755 --- a/scripts/hecate-drills.sh +++ b/scripts/hecate-drills.sh @@ -1,5 +1,5 @@ #!/usr/bin/env bash -set -euo pipefail +set -Eeuo pipefail KUBECTL="${KUBECTL:-kubectl}" HECATE_COORDINATOR_HOST="${HECATE_COORDINATOR_HOST:-titan-db}" @@ -84,6 +84,21 @@ run_hecate_startup() { declare -A SNAPSHOT_REPLICAS=() SUSPENDED_KS_BEFORE="" SUSPENDED_HR_BEFORE="" +CURRENT_DRILL="" +CURRENT_RESOURCES=() +ROLLBACK_FLUX_SUSPEND=0 + +on_err() { + local code=$? + log "failure detected in drill '${CURRENT_DRILL}' (exit=${code}); starting rollback" + if [[ "${ROLLBACK_FLUX_SUSPEND}" -eq 1 ]]; then + restore_flux_suspended_before || true + fi + if [[ ${#CURRENT_RESOURCES[@]} -gt 0 ]]; then + restore_resources "${CURRENT_RESOURCES[@]}" || true + fi + exit "${code}" +} snapshot_resources() { local resources=("$@") @@ -152,16 +167,27 @@ restore_flux_suspended_before() { done <<<"${SUSPENDED_HR_BEFORE}" } -verify_flux_unsuspended() { +normalize_lines() { + sed '/^$/d' | sort +} + +verify_flux_suspend_state_restored() { if [[ "${EXECUTE}" -eq 0 ]]; then - log "plan: verify no Flux kustomizations/helmreleases remain suspended" + log "plan: verify Flux suspended objects match pre-drill state" return 0 fi - local ks_count hr_count - ks_count="$("${KUBECTL}" get kustomizations.kustomize.toolkit.fluxcd.io -A -o jsonpath='{range .items[?(@.spec.suspend==true)]}{.metadata.name}{"\n"}{end}' | sed '/^$/d' | wc -l | tr -d ' ')" - hr_count="$("${KUBECTL}" get helmreleases.helm.toolkit.fluxcd.io -A -o jsonpath='{range .items[?(@.spec.suspend==true)]}{.metadata.name}{"\n"}{end}' | sed '/^$/d' | wc -l | tr -d ' ')" - [[ "${ks_count}" == "0" ]] || die "kustomizations still suspended: ${ks_count}" - [[ "${hr_count}" == "0" ]] || die "helmreleases still suspended: ${hr_count}" + local current_ks current_hr + current_ks="$("${KUBECTL}" get kustomizations.kustomize.toolkit.fluxcd.io -A -o jsonpath='{range .items[?(@.spec.suspend==true)]}{.metadata.namespace}/{.metadata.name}{"\n"}{end}' || true)" + current_hr="$("${KUBECTL}" get helmreleases.helm.toolkit.fluxcd.io -A -o jsonpath='{range .items[?(@.spec.suspend==true)]}{.metadata.namespace}/{.metadata.name}{"\n"}{end}' || true)" + + local expected_ks expected_hr got_ks got_hr + expected_ks="$(printf '%s\n' "${SUSPENDED_KS_BEFORE}" | normalize_lines)" + expected_hr="$(printf '%s\n' "${SUSPENDED_HR_BEFORE}" | normalize_lines)" + got_ks="$(printf '%s\n' "${current_ks}" | normalize_lines)" + got_hr="$(printf '%s\n' "${current_hr}" | normalize_lines)" + + [[ "${got_ks}" == "${expected_ks}" ]] || die "kustomization suspend-state drift detected" + [[ "${got_hr}" == "${expected_hr}" ]] || die "helmrelease suspend-state drift detected" } write_log_header() { @@ -173,18 +199,18 @@ write_log_header() { } run_drill_flux_gitea_deadlock() { - local resources=( + CURRENT_RESOURCES=( "flux-system|deployment|source-controller|1" "flux-system|deployment|kustomize-controller|1" "flux-system|deployment|helm-controller|1" "flux-system|deployment|notification-controller|1" "gitea|deployment|gitea|1" ) - snapshot_resources "${resources[@]}" - trap 'restore_resources "${resources[@]}"' ERR + snapshot_resources "${CURRENT_RESOURCES[@]}" + ROLLBACK_FLUX_SUSPEND=0 log "injecting outage: flux controllers + gitea" - for res in "${resources[@]}"; do + for res in "${CURRENT_RESOURCES[@]}"; do IFS='|' read -r ns kind name _ <<<"${res}" scale_to "$ns" "$kind" "$name" 0 done @@ -198,20 +224,20 @@ run_drill_flux_gitea_deadlock() { wait_ready flux-system deployment notification-controller 240s wait_ready gitea deployment gitea 300s log "pass: flux-gitea-deadlock" - trap - ERR + CURRENT_RESOURCES=() } run_drill_foundation_recovery() { - local resources=( + CURRENT_RESOURCES=( "vault|statefulset|vault|1" "postgres|statefulset|postgres|1" "gitea|deployment|gitea|1" ) - snapshot_resources "${resources[@]}" - trap 'restore_resources "${resources[@]}"' ERR + snapshot_resources "${CURRENT_RESOURCES[@]}" + ROLLBACK_FLUX_SUSPEND=0 log "injecting outage: vault + postgres + gitea" - for res in "${resources[@]}"; do + for res in "${CURRENT_RESOURCES[@]}"; do IFS='|' read -r ns kind name _ <<<"${res}" scale_to "$ns" "$kind" "$name" 0 done @@ -223,14 +249,14 @@ run_drill_foundation_recovery() { wait_ready postgres statefulset postgres 420s wait_ready gitea deployment gitea 300s log "pass: foundation-recovery" - trap - ERR + CURRENT_RESOURCES=() } run_drill_reconciliation_resume() { - local resources=("flux-system|deployment|source-controller|1") - snapshot_resources "${resources[@]}" + CURRENT_RESOURCES=("flux-system|deployment|source-controller|1") + snapshot_resources "${CURRENT_RESOURCES[@]}" record_flux_suspended_before - trap 'restore_flux_suspended_before; restore_resources "${resources[@]}"' ERR + ROLLBACK_FLUX_SUSPEND=1 log "injecting outage: suspend all Flux objects + stop source-controller" set_flux_suspend_all true @@ -240,15 +266,17 @@ run_drill_reconciliation_resume() { log "verifying reconciliation resumed" wait_ready flux-system deployment source-controller 240s - verify_flux_unsuspended + verify_flux_suspend_state_restored log "pass: reconciliation-resume" - trap - ERR + CURRENT_RESOURCES=() + ROLLBACK_FLUX_SUSPEND=0 } main() { need_cmd "${KUBECTL}" need_cmd ssh need_cmd timeout + trap on_err ERR local cmd="${1:-}" case "${cmd}" in @@ -269,6 +297,7 @@ main() { shift done write_log_header "${drill}" + CURRENT_DRILL="${drill}" ;; *) usage