hecate: harden startup recovery and drill rollback safety
This commit is contained in:
parent
aa9c7b69f3
commit
014039eea2
@ -91,44 +91,40 @@ func (o *Orchestrator) Startup(ctx context.Context, opts StartupOptions) (err er
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
needsBootstrap := false
|
needsLocalBootstrap := false
|
||||||
bootstrapReasons := []string{}
|
bootstrapReasons := []string{}
|
||||||
if !opts.SkipLocalBootstrap {
|
if !opts.SkipLocalBootstrap {
|
||||||
ready, readyErr := o.fluxSourceReady(ctx)
|
ready, readyErr := o.fluxSourceReady(ctx)
|
||||||
if readyErr != nil {
|
if readyErr != nil {
|
||||||
o.log.Printf("warning: unable to read flux source readiness: %v", readyErr)
|
o.log.Printf("warning: unable to read flux source readiness: %v", readyErr)
|
||||||
needsBootstrap = true
|
needsLocalBootstrap = true
|
||||||
bootstrapReasons = append(bootstrapReasons, "flux source readiness check failed")
|
bootstrapReasons = append(bootstrapReasons, "flux source readiness check failed")
|
||||||
}
|
}
|
||||||
if !ready {
|
if !ready {
|
||||||
needsBootstrap = true
|
needsLocalBootstrap = true
|
||||||
bootstrapReasons = append(bootstrapReasons, "flux source not ready")
|
bootstrapReasons = append(bootstrapReasons, "flux source not ready")
|
||||||
}
|
}
|
||||||
|
|
||||||
missing, missingErr := o.missingCriticalStartupWorkloads(ctx)
|
|
||||||
if missingErr != nil {
|
|
||||||
o.log.Printf("warning: unable to inspect critical startup workloads: %v", missingErr)
|
|
||||||
needsBootstrap = true
|
|
||||||
bootstrapReasons = append(bootstrapReasons, "critical workload readiness check failed")
|
|
||||||
}
|
|
||||||
if len(missing) > 0 {
|
|
||||||
needsBootstrap = true
|
|
||||||
bootstrapReasons = append(bootstrapReasons, "critical workloads not ready: "+strings.Join(missing, ", "))
|
|
||||||
}
|
|
||||||
|
|
||||||
if needsBootstrap {
|
|
||||||
o.log.Printf("startup bootstrap required: %s", strings.Join(bootstrapReasons, "; "))
|
|
||||||
if err := o.bootstrapLocal(ctx); err != nil {
|
|
||||||
return err
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
missing, missingErr := o.missingCriticalStartupWorkloads(ctx)
|
||||||
|
if missingErr != nil {
|
||||||
|
o.log.Printf("warning: unable to inspect critical startup workloads: %v", missingErr)
|
||||||
|
}
|
||||||
|
if len(missing) > 0 {
|
||||||
|
o.log.Printf("startup critical workloads not ready; applying targeted recovery first: %s", strings.Join(missing, ", "))
|
||||||
|
}
|
||||||
if err := o.ensureCriticalStartupWorkloads(ctx); err != nil {
|
if err := o.ensureCriticalStartupWorkloads(ctx); err != nil {
|
||||||
return err
|
return err
|
||||||
}
|
}
|
||||||
|
|
||||||
if !opts.SkipLocalBootstrap && needsBootstrap {
|
if !opts.SkipLocalBootstrap && needsLocalBootstrap {
|
||||||
|
o.log.Printf("startup bootstrap required: %s", strings.Join(bootstrapReasons, "; "))
|
||||||
|
if err := o.bootstrapLocal(ctx); err != nil {
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
if err := o.ensureCriticalStartupWorkloads(ctx); err != nil {
|
||||||
|
return err
|
||||||
|
}
|
||||||
ready, err := o.fluxSourceReady(ctx)
|
ready, err := o.fluxSourceReady(ctx)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
return fmt.Errorf("flux source readiness after bootstrap: %w", err)
|
return fmt.Errorf("flux source readiness after bootstrap: %w", err)
|
||||||
|
|||||||
@ -1,5 +1,5 @@
|
|||||||
#!/usr/bin/env bash
|
#!/usr/bin/env bash
|
||||||
set -euo pipefail
|
set -Eeuo pipefail
|
||||||
|
|
||||||
KUBECTL="${KUBECTL:-kubectl}"
|
KUBECTL="${KUBECTL:-kubectl}"
|
||||||
HECATE_COORDINATOR_HOST="${HECATE_COORDINATOR_HOST:-titan-db}"
|
HECATE_COORDINATOR_HOST="${HECATE_COORDINATOR_HOST:-titan-db}"
|
||||||
@ -84,6 +84,21 @@ run_hecate_startup() {
|
|||||||
declare -A SNAPSHOT_REPLICAS=()
|
declare -A SNAPSHOT_REPLICAS=()
|
||||||
SUSPENDED_KS_BEFORE=""
|
SUSPENDED_KS_BEFORE=""
|
||||||
SUSPENDED_HR_BEFORE=""
|
SUSPENDED_HR_BEFORE=""
|
||||||
|
CURRENT_DRILL=""
|
||||||
|
CURRENT_RESOURCES=()
|
||||||
|
ROLLBACK_FLUX_SUSPEND=0
|
||||||
|
|
||||||
|
on_err() {
|
||||||
|
local code=$?
|
||||||
|
log "failure detected in drill '${CURRENT_DRILL}' (exit=${code}); starting rollback"
|
||||||
|
if [[ "${ROLLBACK_FLUX_SUSPEND}" -eq 1 ]]; then
|
||||||
|
restore_flux_suspended_before || true
|
||||||
|
fi
|
||||||
|
if [[ ${#CURRENT_RESOURCES[@]} -gt 0 ]]; then
|
||||||
|
restore_resources "${CURRENT_RESOURCES[@]}" || true
|
||||||
|
fi
|
||||||
|
exit "${code}"
|
||||||
|
}
|
||||||
|
|
||||||
snapshot_resources() {
|
snapshot_resources() {
|
||||||
local resources=("$@")
|
local resources=("$@")
|
||||||
@ -152,16 +167,27 @@ restore_flux_suspended_before() {
|
|||||||
done <<<"${SUSPENDED_HR_BEFORE}"
|
done <<<"${SUSPENDED_HR_BEFORE}"
|
||||||
}
|
}
|
||||||
|
|
||||||
verify_flux_unsuspended() {
|
normalize_lines() {
|
||||||
|
sed '/^$/d' | sort
|
||||||
|
}
|
||||||
|
|
||||||
|
verify_flux_suspend_state_restored() {
|
||||||
if [[ "${EXECUTE}" -eq 0 ]]; then
|
if [[ "${EXECUTE}" -eq 0 ]]; then
|
||||||
log "plan: verify no Flux kustomizations/helmreleases remain suspended"
|
log "plan: verify Flux suspended objects match pre-drill state"
|
||||||
return 0
|
return 0
|
||||||
fi
|
fi
|
||||||
local ks_count hr_count
|
local current_ks current_hr
|
||||||
ks_count="$("${KUBECTL}" get kustomizations.kustomize.toolkit.fluxcd.io -A -o jsonpath='{range .items[?(@.spec.suspend==true)]}{.metadata.name}{"\n"}{end}' | sed '/^$/d' | wc -l | tr -d ' ')"
|
current_ks="$("${KUBECTL}" get kustomizations.kustomize.toolkit.fluxcd.io -A -o jsonpath='{range .items[?(@.spec.suspend==true)]}{.metadata.namespace}/{.metadata.name}{"\n"}{end}' || true)"
|
||||||
hr_count="$("${KUBECTL}" get helmreleases.helm.toolkit.fluxcd.io -A -o jsonpath='{range .items[?(@.spec.suspend==true)]}{.metadata.name}{"\n"}{end}' | sed '/^$/d' | wc -l | tr -d ' ')"
|
current_hr="$("${KUBECTL}" get helmreleases.helm.toolkit.fluxcd.io -A -o jsonpath='{range .items[?(@.spec.suspend==true)]}{.metadata.namespace}/{.metadata.name}{"\n"}{end}' || true)"
|
||||||
[[ "${ks_count}" == "0" ]] || die "kustomizations still suspended: ${ks_count}"
|
|
||||||
[[ "${hr_count}" == "0" ]] || die "helmreleases still suspended: ${hr_count}"
|
local expected_ks expected_hr got_ks got_hr
|
||||||
|
expected_ks="$(printf '%s\n' "${SUSPENDED_KS_BEFORE}" | normalize_lines)"
|
||||||
|
expected_hr="$(printf '%s\n' "${SUSPENDED_HR_BEFORE}" | normalize_lines)"
|
||||||
|
got_ks="$(printf '%s\n' "${current_ks}" | normalize_lines)"
|
||||||
|
got_hr="$(printf '%s\n' "${current_hr}" | normalize_lines)"
|
||||||
|
|
||||||
|
[[ "${got_ks}" == "${expected_ks}" ]] || die "kustomization suspend-state drift detected"
|
||||||
|
[[ "${got_hr}" == "${expected_hr}" ]] || die "helmrelease suspend-state drift detected"
|
||||||
}
|
}
|
||||||
|
|
||||||
write_log_header() {
|
write_log_header() {
|
||||||
@ -173,18 +199,18 @@ write_log_header() {
|
|||||||
}
|
}
|
||||||
|
|
||||||
run_drill_flux_gitea_deadlock() {
|
run_drill_flux_gitea_deadlock() {
|
||||||
local resources=(
|
CURRENT_RESOURCES=(
|
||||||
"flux-system|deployment|source-controller|1"
|
"flux-system|deployment|source-controller|1"
|
||||||
"flux-system|deployment|kustomize-controller|1"
|
"flux-system|deployment|kustomize-controller|1"
|
||||||
"flux-system|deployment|helm-controller|1"
|
"flux-system|deployment|helm-controller|1"
|
||||||
"flux-system|deployment|notification-controller|1"
|
"flux-system|deployment|notification-controller|1"
|
||||||
"gitea|deployment|gitea|1"
|
"gitea|deployment|gitea|1"
|
||||||
)
|
)
|
||||||
snapshot_resources "${resources[@]}"
|
snapshot_resources "${CURRENT_RESOURCES[@]}"
|
||||||
trap 'restore_resources "${resources[@]}"' ERR
|
ROLLBACK_FLUX_SUSPEND=0
|
||||||
|
|
||||||
log "injecting outage: flux controllers + gitea"
|
log "injecting outage: flux controllers + gitea"
|
||||||
for res in "${resources[@]}"; do
|
for res in "${CURRENT_RESOURCES[@]}"; do
|
||||||
IFS='|' read -r ns kind name _ <<<"${res}"
|
IFS='|' read -r ns kind name _ <<<"${res}"
|
||||||
scale_to "$ns" "$kind" "$name" 0
|
scale_to "$ns" "$kind" "$name" 0
|
||||||
done
|
done
|
||||||
@ -198,20 +224,20 @@ run_drill_flux_gitea_deadlock() {
|
|||||||
wait_ready flux-system deployment notification-controller 240s
|
wait_ready flux-system deployment notification-controller 240s
|
||||||
wait_ready gitea deployment gitea 300s
|
wait_ready gitea deployment gitea 300s
|
||||||
log "pass: flux-gitea-deadlock"
|
log "pass: flux-gitea-deadlock"
|
||||||
trap - ERR
|
CURRENT_RESOURCES=()
|
||||||
}
|
}
|
||||||
|
|
||||||
run_drill_foundation_recovery() {
|
run_drill_foundation_recovery() {
|
||||||
local resources=(
|
CURRENT_RESOURCES=(
|
||||||
"vault|statefulset|vault|1"
|
"vault|statefulset|vault|1"
|
||||||
"postgres|statefulset|postgres|1"
|
"postgres|statefulset|postgres|1"
|
||||||
"gitea|deployment|gitea|1"
|
"gitea|deployment|gitea|1"
|
||||||
)
|
)
|
||||||
snapshot_resources "${resources[@]}"
|
snapshot_resources "${CURRENT_RESOURCES[@]}"
|
||||||
trap 'restore_resources "${resources[@]}"' ERR
|
ROLLBACK_FLUX_SUSPEND=0
|
||||||
|
|
||||||
log "injecting outage: vault + postgres + gitea"
|
log "injecting outage: vault + postgres + gitea"
|
||||||
for res in "${resources[@]}"; do
|
for res in "${CURRENT_RESOURCES[@]}"; do
|
||||||
IFS='|' read -r ns kind name _ <<<"${res}"
|
IFS='|' read -r ns kind name _ <<<"${res}"
|
||||||
scale_to "$ns" "$kind" "$name" 0
|
scale_to "$ns" "$kind" "$name" 0
|
||||||
done
|
done
|
||||||
@ -223,14 +249,14 @@ run_drill_foundation_recovery() {
|
|||||||
wait_ready postgres statefulset postgres 420s
|
wait_ready postgres statefulset postgres 420s
|
||||||
wait_ready gitea deployment gitea 300s
|
wait_ready gitea deployment gitea 300s
|
||||||
log "pass: foundation-recovery"
|
log "pass: foundation-recovery"
|
||||||
trap - ERR
|
CURRENT_RESOURCES=()
|
||||||
}
|
}
|
||||||
|
|
||||||
run_drill_reconciliation_resume() {
|
run_drill_reconciliation_resume() {
|
||||||
local resources=("flux-system|deployment|source-controller|1")
|
CURRENT_RESOURCES=("flux-system|deployment|source-controller|1")
|
||||||
snapshot_resources "${resources[@]}"
|
snapshot_resources "${CURRENT_RESOURCES[@]}"
|
||||||
record_flux_suspended_before
|
record_flux_suspended_before
|
||||||
trap 'restore_flux_suspended_before; restore_resources "${resources[@]}"' ERR
|
ROLLBACK_FLUX_SUSPEND=1
|
||||||
|
|
||||||
log "injecting outage: suspend all Flux objects + stop source-controller"
|
log "injecting outage: suspend all Flux objects + stop source-controller"
|
||||||
set_flux_suspend_all true
|
set_flux_suspend_all true
|
||||||
@ -240,15 +266,17 @@ run_drill_reconciliation_resume() {
|
|||||||
|
|
||||||
log "verifying reconciliation resumed"
|
log "verifying reconciliation resumed"
|
||||||
wait_ready flux-system deployment source-controller 240s
|
wait_ready flux-system deployment source-controller 240s
|
||||||
verify_flux_unsuspended
|
verify_flux_suspend_state_restored
|
||||||
log "pass: reconciliation-resume"
|
log "pass: reconciliation-resume"
|
||||||
trap - ERR
|
CURRENT_RESOURCES=()
|
||||||
|
ROLLBACK_FLUX_SUSPEND=0
|
||||||
}
|
}
|
||||||
|
|
||||||
main() {
|
main() {
|
||||||
need_cmd "${KUBECTL}"
|
need_cmd "${KUBECTL}"
|
||||||
need_cmd ssh
|
need_cmd ssh
|
||||||
need_cmd timeout
|
need_cmd timeout
|
||||||
|
trap on_err ERR
|
||||||
|
|
||||||
local cmd="${1:-}"
|
local cmd="${1:-}"
|
||||||
case "${cmd}" in
|
case "${cmd}" in
|
||||||
@ -269,6 +297,7 @@ main() {
|
|||||||
shift
|
shift
|
||||||
done
|
done
|
||||||
write_log_header "${drill}"
|
write_log_header "${drill}"
|
||||||
|
CURRENT_DRILL="${drill}"
|
||||||
;;
|
;;
|
||||||
*)
|
*)
|
||||||
usage
|
usage
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user