hecate: harden startup recovery and drill rollback safety
This commit is contained in:
parent
aa9c7b69f3
commit
014039eea2
@ -91,44 +91,40 @@ func (o *Orchestrator) Startup(ctx context.Context, opts StartupOptions) (err er
|
||||
}
|
||||
}
|
||||
|
||||
needsBootstrap := false
|
||||
needsLocalBootstrap := false
|
||||
bootstrapReasons := []string{}
|
||||
if !opts.SkipLocalBootstrap {
|
||||
ready, readyErr := o.fluxSourceReady(ctx)
|
||||
if readyErr != nil {
|
||||
o.log.Printf("warning: unable to read flux source readiness: %v", readyErr)
|
||||
needsBootstrap = true
|
||||
needsLocalBootstrap = true
|
||||
bootstrapReasons = append(bootstrapReasons, "flux source readiness check failed")
|
||||
}
|
||||
if !ready {
|
||||
needsBootstrap = true
|
||||
needsLocalBootstrap = true
|
||||
bootstrapReasons = append(bootstrapReasons, "flux source not ready")
|
||||
}
|
||||
|
||||
missing, missingErr := o.missingCriticalStartupWorkloads(ctx)
|
||||
if missingErr != nil {
|
||||
o.log.Printf("warning: unable to inspect critical startup workloads: %v", missingErr)
|
||||
needsBootstrap = true
|
||||
bootstrapReasons = append(bootstrapReasons, "critical workload readiness check failed")
|
||||
}
|
||||
if len(missing) > 0 {
|
||||
needsBootstrap = true
|
||||
bootstrapReasons = append(bootstrapReasons, "critical workloads not ready: "+strings.Join(missing, ", "))
|
||||
}
|
||||
|
||||
if needsBootstrap {
|
||||
o.log.Printf("startup bootstrap required: %s", strings.Join(bootstrapReasons, "; "))
|
||||
if err := o.bootstrapLocal(ctx); err != nil {
|
||||
return err
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
missing, missingErr := o.missingCriticalStartupWorkloads(ctx)
|
||||
if missingErr != nil {
|
||||
o.log.Printf("warning: unable to inspect critical startup workloads: %v", missingErr)
|
||||
}
|
||||
if len(missing) > 0 {
|
||||
o.log.Printf("startup critical workloads not ready; applying targeted recovery first: %s", strings.Join(missing, ", "))
|
||||
}
|
||||
if err := o.ensureCriticalStartupWorkloads(ctx); err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
if !opts.SkipLocalBootstrap && needsBootstrap {
|
||||
if !opts.SkipLocalBootstrap && needsLocalBootstrap {
|
||||
o.log.Printf("startup bootstrap required: %s", strings.Join(bootstrapReasons, "; "))
|
||||
if err := o.bootstrapLocal(ctx); err != nil {
|
||||
return err
|
||||
}
|
||||
if err := o.ensureCriticalStartupWorkloads(ctx); err != nil {
|
||||
return err
|
||||
}
|
||||
ready, err := o.fluxSourceReady(ctx)
|
||||
if err != nil {
|
||||
return fmt.Errorf("flux source readiness after bootstrap: %w", err)
|
||||
|
||||
@ -1,5 +1,5 @@
|
||||
#!/usr/bin/env bash
|
||||
set -euo pipefail
|
||||
set -Eeuo pipefail
|
||||
|
||||
KUBECTL="${KUBECTL:-kubectl}"
|
||||
HECATE_COORDINATOR_HOST="${HECATE_COORDINATOR_HOST:-titan-db}"
|
||||
@ -84,6 +84,21 @@ run_hecate_startup() {
|
||||
declare -A SNAPSHOT_REPLICAS=()
|
||||
SUSPENDED_KS_BEFORE=""
|
||||
SUSPENDED_HR_BEFORE=""
|
||||
CURRENT_DRILL=""
|
||||
CURRENT_RESOURCES=()
|
||||
ROLLBACK_FLUX_SUSPEND=0
|
||||
|
||||
# on_err: failure handler; main installs it with `trap on_err ERR`.
# Logs the failing drill and exit status, attempts to roll back whatever the
# current drill registered for rollback, then exits with the original status.
on_err() {
|
||||
# Capture $? first, before any other command overwrites it.
local code=$?
|
||||
log "failure detected in drill '${CURRENT_DRILL}' (exit=${code}); starting rollback"
|
||||
# Flux suspend state is restored only when the drill set ROLLBACK_FLUX_SUSPEND=1.
if [[ "${ROLLBACK_FLUX_SUSPEND}" -eq 1 ]]; then
|
||||
# `|| true`: rollback is best-effort; a failed restore must not stop the remaining steps.
restore_flux_suspended_before || true
|
||||
fi
|
||||
# CURRENT_RESOURCES starts empty and is reset to () when a drill passes,
# so a non-empty array means a drill's resources still need restoring.
if [[ ${#CURRENT_RESOURCES[@]} -gt 0 ]]; then
|
||||
restore_resources "${CURRENT_RESOURCES[@]}" || true
|
||||
fi
|
||||
# Exit with the status of the original failure, not of the rollback commands.
exit "${code}"
|
||||
}
|
||||
|
||||
snapshot_resources() {
|
||||
local resources=("$@")
|
||||
@ -152,16 +167,27 @@ restore_flux_suspended_before() {
|
||||
done <<<"${SUSPENDED_HR_BEFORE}"
|
||||
}
|
||||
|
||||
verify_flux_unsuspended() {
|
||||
# normalize_lines: filter stdin to stdout, deleting empty lines and sorting
# the rest, so two line lists can be compared independent of order and of
# blank lines.
normalize_lines() {
|
||||
sed '/^$/d' | sort
|
||||
}
|
||||
|
||||
verify_flux_suspend_state_restored() {
|
||||
if [[ "${EXECUTE}" -eq 0 ]]; then
|
||||
log "plan: verify no Flux kustomizations/helmreleases remain suspended"
|
||||
log "plan: verify Flux suspended objects match pre-drill state"
|
||||
return 0
|
||||
fi
|
||||
local ks_count hr_count
|
||||
ks_count="$("${KUBECTL}" get kustomizations.kustomize.toolkit.fluxcd.io -A -o jsonpath='{range .items[?(@.spec.suspend==true)]}{.metadata.name}{"\n"}{end}' | sed '/^$/d' | wc -l | tr -d ' ')"
|
||||
hr_count="$("${KUBECTL}" get helmreleases.helm.toolkit.fluxcd.io -A -o jsonpath='{range .items[?(@.spec.suspend==true)]}{.metadata.name}{"\n"}{end}' | sed '/^$/d' | wc -l | tr -d ' ')"
|
||||
[[ "${ks_count}" == "0" ]] || die "kustomizations still suspended: ${ks_count}"
|
||||
[[ "${hr_count}" == "0" ]] || die "helmreleases still suspended: ${hr_count}"
|
||||
local current_ks current_hr
|
||||
current_ks="$("${KUBECTL}" get kustomizations.kustomize.toolkit.fluxcd.io -A -o jsonpath='{range .items[?(@.spec.suspend==true)]}{.metadata.namespace}/{.metadata.name}{"\n"}{end}' || true)"
|
||||
current_hr="$("${KUBECTL}" get helmreleases.helm.toolkit.fluxcd.io -A -o jsonpath='{range .items[?(@.spec.suspend==true)]}{.metadata.namespace}/{.metadata.name}{"\n"}{end}' || true)"
|
||||
|
||||
local expected_ks expected_hr got_ks got_hr
|
||||
expected_ks="$(printf '%s\n' "${SUSPENDED_KS_BEFORE}" | normalize_lines)"
|
||||
expected_hr="$(printf '%s\n' "${SUSPENDED_HR_BEFORE}" | normalize_lines)"
|
||||
got_ks="$(printf '%s\n' "${current_ks}" | normalize_lines)"
|
||||
got_hr="$(printf '%s\n' "${current_hr}" | normalize_lines)"
|
||||
|
||||
[[ "${got_ks}" == "${expected_ks}" ]] || die "kustomization suspend-state drift detected"
|
||||
[[ "${got_hr}" == "${expected_hr}" ]] || die "helmrelease suspend-state drift detected"
|
||||
}
|
||||
|
||||
write_log_header() {
|
||||
@ -173,18 +199,18 @@ write_log_header() {
|
||||
}
|
||||
|
||||
run_drill_flux_gitea_deadlock() {
|
||||
local resources=(
|
||||
CURRENT_RESOURCES=(
|
||||
"flux-system|deployment|source-controller|1"
|
||||
"flux-system|deployment|kustomize-controller|1"
|
||||
"flux-system|deployment|helm-controller|1"
|
||||
"flux-system|deployment|notification-controller|1"
|
||||
"gitea|deployment|gitea|1"
|
||||
)
|
||||
snapshot_resources "${resources[@]}"
|
||||
trap 'restore_resources "${resources[@]}"' ERR
|
||||
snapshot_resources "${CURRENT_RESOURCES[@]}"
|
||||
ROLLBACK_FLUX_SUSPEND=0
|
||||
|
||||
log "injecting outage: flux controllers + gitea"
|
||||
for res in "${resources[@]}"; do
|
||||
for res in "${CURRENT_RESOURCES[@]}"; do
|
||||
IFS='|' read -r ns kind name _ <<<"${res}"
|
||||
scale_to "$ns" "$kind" "$name" 0
|
||||
done
|
||||
@ -198,20 +224,20 @@ run_drill_flux_gitea_deadlock() {
|
||||
wait_ready flux-system deployment notification-controller 240s
|
||||
wait_ready gitea deployment gitea 300s
|
||||
log "pass: flux-gitea-deadlock"
|
||||
trap - ERR
|
||||
CURRENT_RESOURCES=()
|
||||
}
|
||||
|
||||
run_drill_foundation_recovery() {
|
||||
local resources=(
|
||||
CURRENT_RESOURCES=(
|
||||
"vault|statefulset|vault|1"
|
||||
"postgres|statefulset|postgres|1"
|
||||
"gitea|deployment|gitea|1"
|
||||
)
|
||||
snapshot_resources "${resources[@]}"
|
||||
trap 'restore_resources "${resources[@]}"' ERR
|
||||
snapshot_resources "${CURRENT_RESOURCES[@]}"
|
||||
ROLLBACK_FLUX_SUSPEND=0
|
||||
|
||||
log "injecting outage: vault + postgres + gitea"
|
||||
for res in "${resources[@]}"; do
|
||||
for res in "${CURRENT_RESOURCES[@]}"; do
|
||||
IFS='|' read -r ns kind name _ <<<"${res}"
|
||||
scale_to "$ns" "$kind" "$name" 0
|
||||
done
|
||||
@ -223,14 +249,14 @@ run_drill_foundation_recovery() {
|
||||
wait_ready postgres statefulset postgres 420s
|
||||
wait_ready gitea deployment gitea 300s
|
||||
log "pass: foundation-recovery"
|
||||
trap - ERR
|
||||
CURRENT_RESOURCES=()
|
||||
}
|
||||
|
||||
run_drill_reconciliation_resume() {
|
||||
local resources=("flux-system|deployment|source-controller|1")
|
||||
snapshot_resources "${resources[@]}"
|
||||
CURRENT_RESOURCES=("flux-system|deployment|source-controller|1")
|
||||
snapshot_resources "${CURRENT_RESOURCES[@]}"
|
||||
record_flux_suspended_before
|
||||
trap 'restore_flux_suspended_before; restore_resources "${resources[@]}"' ERR
|
||||
ROLLBACK_FLUX_SUSPEND=1
|
||||
|
||||
log "injecting outage: suspend all Flux objects + stop source-controller"
|
||||
set_flux_suspend_all true
|
||||
@ -240,15 +266,17 @@ run_drill_reconciliation_resume() {
|
||||
|
||||
log "verifying reconciliation resumed"
|
||||
wait_ready flux-system deployment source-controller 240s
|
||||
verify_flux_unsuspended
|
||||
verify_flux_suspend_state_restored
|
||||
log "pass: reconciliation-resume"
|
||||
trap - ERR
|
||||
CURRENT_RESOURCES=()
|
||||
ROLLBACK_FLUX_SUSPEND=0
|
||||
}
|
||||
|
||||
main() {
|
||||
need_cmd "${KUBECTL}"
|
||||
need_cmd ssh
|
||||
need_cmd timeout
|
||||
trap on_err ERR
|
||||
|
||||
local cmd="${1:-}"
|
||||
case "${cmd}" in
|
||||
@ -269,6 +297,7 @@ main() {
|
||||
shift
|
||||
done
|
||||
write_log_header "${drill}"
|
||||
CURRENT_DRILL="${drill}"
|
||||
;;
|
||||
*)
|
||||
usage
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user