startup: unblock on harbor during recovery and add controlled-cycle drill

This commit is contained in:
Brad Stein 2026-04-05 20:25:14 -03:00
parent 11a2f66e41
commit c8c3304797
9 changed files with 237 additions and 11 deletions

View File

@ -99,6 +99,7 @@ See `configs/hecate.example.yaml`.
Break-glass unseal fallback knobs:
- `startup.vault_unseal_breakglass_command`: optional shell command that prints the unseal key to stdout.
- `startup.vault_unseal_breakglass_timeout_seconds`: timeout for the command (default `15`).
- `startup.shutdown_cooldown_seconds`: cooldown window after shutdown completion before startup proceeds (default `45`).
UPS auto-shutdown trigger uses:
- runtime threshold = `runtime_safety_factor * estimated_shutdown_budget`
@ -119,6 +120,7 @@ Power metrics:
- Default behavior for `startup` and `shutdown` is dry-run unless `--execute` is set.
- Hecate tracks intent in `/var/lib/hecate/intent.json` (`normal`, `startup_in_progress`, `shutting_down`, `shutdown_complete`) to avoid startup/shutdown fighting each other.
- Startup now waits out the recent-shutdown cooldown window instead of failing immediately when shutdown completed moments ago.
- In multi-instance setups, set `coordination.peer_hosts` on each host (for example `titan-db` <-> `titan-24`) so startup guards account for remote intent too.
- `hecate-bootstrap.service` is enabled to run at host boot and perform staged startup automatically.
- `HECATE_ENABLE_BOOTSTRAP=1` forces bootstrap on, `HECATE_ENABLE_BOOTSTRAP=0` forces it off, and `auto` enables by default.
@ -144,5 +146,6 @@ Hecate includes scripted disruptive drills that intentionally break critical ser
- `scripts/hecate-drills.sh run flux-gitea-deadlock --execute`
- `scripts/hecate-drills.sh run foundation-recovery --execute`
- `scripts/hecate-drills.sh run reconciliation-resume --execute`
- `scripts/hecate-drills.sh run controlled-cycle --execute` (uses `HECATE_DRILL_SHUTDOWN_CONFIG`, defaults to `/tmp/hecate-drill-no-poweroff.yaml`)
These drills are intentionally **not** part of regular `go test ./...`.

View File

@ -520,7 +520,7 @@ func coordinatorAllowsPeerFallbackStartup(ctx context.Context, cfg config.Config
if remoteIntent.UpdatedAt.IsZero() {
return false, "coordinator reported shutdown_complete with unknown age", nil
}
if intentAge <= 45*time.Second {
if intentAge <= startupShutdownCooldown(cfg) {
return false, fmt.Sprintf("coordinator recently completed shutdown (%s ago)", intentAge.Round(time.Second)), nil
}
return true, "coordinator shutdown_complete is old enough", nil
@ -621,3 +621,11 @@ func maxInt(a, b int) int {
}
return b
}
// startupShutdownCooldown returns the post-shutdown cooldown window taken
// from config, falling back to 45s when the configured value is unset or
// non-positive (mirrors the config package default).
func startupShutdownCooldown(cfg config.Config) time.Duration {
	const fallbackSeconds = 45
	s := cfg.Startup.ShutdownCooldownSeconds
	if s <= 0 {
		s = fallbackSeconds
	}
	return time.Duration(s) * time.Second
}

View File

@ -45,6 +45,7 @@ excluded_namespaces:
startup:
api_wait_seconds: 1200
api_poll_seconds: 2
shutdown_cooldown_seconds: 45
require_time_sync: true
time_sync_wait_seconds: 240
time_sync_poll_seconds: 5

View File

@ -111,6 +111,7 @@ excluded_namespaces:
startup:
api_wait_seconds: 1200
api_poll_seconds: 2
shutdown_cooldown_seconds: 45
require_time_sync: true
time_sync_wait_seconds: 240
time_sync_poll_seconds: 5

View File

@ -111,6 +111,7 @@ excluded_namespaces:
startup:
api_wait_seconds: 1200
api_poll_seconds: 2
shutdown_cooldown_seconds: 45
require_time_sync: true
time_sync_wait_seconds: 240
time_sync_poll_seconds: 5

View File

@ -77,8 +77,6 @@ var criticalStartupWorkloads = []startupWorkload{
{Namespace: "vault", Kind: "statefulset", Name: "vault"},
{Namespace: "postgres", Kind: "statefulset", Name: "postgres"},
{Namespace: "gitea", Kind: "deployment", Name: "gitea"},
{Namespace: "harbor", Kind: "statefulset", Name: "harbor-redis"},
{Namespace: "harbor", Kind: "deployment", Name: "harbor-registry"},
}
var ErrEtcdRestoreNotApplicable = errors.New("etcd restore not applicable")
@ -135,8 +133,32 @@ func (o *Orchestrator) Startup(ctx context.Context, opts StartupOptions) (err er
}
currentIntent = state.Intent{State: state.IntentNormal}
}
if currentIntent.State == state.IntentShutdownComplete && intentFresh(currentIntent, 45*time.Second) {
return fmt.Errorf("startup blocked: shutdown completed too recently (%s ago)", intentAge(currentIntent).Round(time.Second))
cooldown := o.startupShutdownCooldown()
if currentIntent.State == state.IntentShutdownComplete && intentFresh(currentIntent, cooldown) {
elapsed := intentAge(currentIntent)
remaining := cooldown - elapsed
if remaining < time.Second {
remaining = time.Second
}
o.log.Printf("startup cooldown active: last shutdown completed %s ago; waiting %s", elapsed.Round(time.Second), remaining.Round(time.Second))
timer := time.NewTimer(remaining)
select {
case <-ctx.Done():
timer.Stop()
return fmt.Errorf("startup canceled while waiting for shutdown cooldown: %w", ctx.Err())
case <-timer.C:
}
refreshed, readErr := state.ReadIntent(o.cfg.State.IntentPath)
if readErr != nil {
return fmt.Errorf("re-read startup intent after cooldown wait: %w", readErr)
}
currentIntent = refreshed
if currentIntent.State == state.IntentShuttingDown && intentFresh(currentIntent, o.startupGuardAge()) {
return fmt.Errorf("startup blocked: shutdown intent became active during cooldown wait (%s)", currentIntent.Reason)
}
if currentIntent.State == state.IntentShutdownComplete && intentFresh(currentIntent, cooldown) {
return fmt.Errorf("startup blocked: shutdown completed too recently (%s ago)", intentAge(currentIntent).Round(time.Second))
}
}
if err := o.guardPeerStartupIntents(ctx); err != nil {
return err
@ -789,6 +811,12 @@ func (o *Orchestrator) readScaledWorkloadSnapshot() (*workloadScaleSnapshot, err
return &snapshot, nil
}
// drainFailure records the outcome of one failed worker-node drain so the
// caller can report which node broke and why.
type drainFailure struct {
node string // worker node whose drain failed
err error // the drain error, wrapping the kubectl failure
details string // pod-level diagnostics from drainNodeDiagnostics (may be empty)
}
func (o *Orchestrator) drainWorkers(ctx context.Context, workers []string) error {
total := len(workers)
if total == 0 {
@ -805,7 +833,7 @@ func (o *Orchestrator) drainWorkers(ctx context.Context, workers []string) error
o.log.Printf("drain workers total=%d parallelism=%d", total, parallelism)
sem := make(chan struct{}, parallelism)
var wg sync.WaitGroup
errCh := make(chan error, total)
errCh := make(chan drainFailure, total)
for idx, node := range workers {
idx := idx
@ -821,7 +849,12 @@ func (o *Orchestrator) drainWorkers(ctx context.Context, workers []string) error
o.log.Printf("warning: cordon %s failed: %v", node, err)
}
if _, err := o.kubectl(ctx, 3*time.Minute, "drain", node, "--ignore-daemonsets", "--delete-emptydir-data", "--grace-period=30", "--timeout=180s"); err != nil {
errCh <- fmt.Errorf("drain %s failed: %w", node, err)
details := o.drainNodeDiagnostics(ctx, node)
errCh <- drainFailure{
node: node,
err: fmt.Errorf("drain %s failed: %w", node, err),
details: details,
}
return
}
}()
@ -832,10 +865,18 @@ func (o *Orchestrator) drainWorkers(ctx context.Context, workers []string) error
if len(errCh) == 0 {
return nil
}
count := len(errCh)
failures := make([]drainFailure, 0, len(errCh))
for failure := range errCh {
failures = append(failures, failure)
}
count := len(failures)
samples := []string{}
for err := range errCh {
samples = append(samples, err.Error())
for _, failure := range failures {
msg := failure.err.Error()
if strings.TrimSpace(failure.details) != "" {
msg = fmt.Sprintf("%s (details: %s)", msg, failure.details)
}
samples = append(samples, msg)
if len(samples) >= 4 {
break
}
@ -843,6 +884,52 @@ func (o *Orchestrator) drainWorkers(ctx context.Context, workers []string) error
return fmt.Errorf("drain workers had %d errors (first: %s)", count, strings.Join(samples, " | "))
}
// drainNodeDiagnostics inspects the pods still scheduled on node and returns
// a short, human-readable summary of likely drain blockers. It never returns
// an error: failures to gather diagnostics are folded into the summary text.
func (o *Orchestrator) drainNodeDiagnostics(ctx context.Context, node string) string {
	out, err := o.kubectl(
		ctx,
		20*time.Second,
		"get",
		"pods",
		"-A",
		"--field-selector", "spec.nodeName="+node,
		"-o",
		"custom-columns=NS:.metadata.namespace,NAME:.metadata.name,PHASE:.status.phase,OWNER:.metadata.ownerReferences[0].kind",
		"--no-headers",
	)
	if err != nil {
		// kubectl itself failed; surface whatever partial output it produced.
		if strings.TrimSpace(out) == "" {
			return fmt.Sprintf("diagnostics unavailable: %v", err)
		}
		return fmt.Sprintf("diagnostics unavailable: %v (%s)", err, strings.Join(lines(out), "; "))
	}

	const maxBlockers = 6
	blockers := make([]string, 0, maxBlockers)
	for _, row := range lines(out) {
		cols := strings.Fields(row)
		if len(cols) < 4 {
			continue
		}
		ns, pod, phase, owner := cols[0], cols[1], cols[2], cols[3]
		// DaemonSet pods are expected to remain on the node, and pods that
		// already reached a terminal phase cannot block a drain.
		if strings.EqualFold(owner, "DaemonSet") ||
			strings.EqualFold(phase, "Succeeded") ||
			strings.EqualFold(phase, "Failed") {
			continue
		}
		blockers = append(blockers, fmt.Sprintf("%s/%s(phase=%s owner=%s)", ns, pod, phase, owner))
		if len(blockers) == maxBlockers {
			break
		}
	}
	if len(blockers) == 0 {
		return "no non-daemonset blocking pods found on node"
	}
	return strings.Join(blockers, ", ")
}
func (o *Orchestrator) uncordonWorkers(ctx context.Context, workers []string) error {
for _, node := range workers {
if _, err := o.kubectl(ctx, 20*time.Second, "uncordon", node); err != nil {
@ -969,6 +1056,14 @@ func (o *Orchestrator) startupGuardAge() time.Duration {
return time.Duration(seconds) * time.Second
}
// startupShutdownCooldown reports how long startup must wait after a recently
// completed shutdown before proceeding. Falls back to 45s when the configured
// value is unset or non-positive.
func (o *Orchestrator) startupShutdownCooldown() time.Duration {
	if s := o.cfg.Startup.ShutdownCooldownSeconds; s > 0 {
		return time.Duration(s) * time.Second
	}
	return 45 * time.Second
}
func (o *Orchestrator) coordinationPeers() []string {
seen := map[string]struct{}{}
out := make([]string, 0, len(o.cfg.Coordination.PeerHosts)+1)
@ -1018,7 +1113,7 @@ func (o *Orchestrator) guardPeerStartupIntents(ctx context.Context) error {
}
o.log.Printf("warning: peer %s startup intent appears stale; allowing startup", peer)
case state.IntentShutdownComplete:
if intentFresh(intent, 45*time.Second) {
if intentFresh(intent, o.startupShutdownCooldown()) {
return fmt.Errorf("startup blocked: peer %s completed shutdown too recently (age=%s)", peer, intentAge(intent).Round(time.Second))
}
default:

View File

@ -36,6 +36,7 @@ type Config struct {
type Startup struct {
APIWaitSeconds int `yaml:"api_wait_seconds"`
APIPollSeconds int `yaml:"api_poll_seconds"`
ShutdownCooldownSeconds int `yaml:"shutdown_cooldown_seconds"`
RequireTimeSync bool `yaml:"require_time_sync"`
TimeSyncWaitSeconds int `yaml:"time_sync_wait_seconds"`
TimeSyncPollSeconds int `yaml:"time_sync_poll_seconds"`
@ -172,6 +173,9 @@ func (c Config) Validate() error {
if c.Startup.APIPollSeconds <= 0 {
return fmt.Errorf("config.startup.api_poll_seconds must be > 0")
}
if c.Startup.ShutdownCooldownSeconds <= 0 {
return fmt.Errorf("config.startup.shutdown_cooldown_seconds must be > 0")
}
if c.Startup.TimeSyncWaitSeconds <= 0 {
return fmt.Errorf("config.startup.time_sync_wait_seconds must be > 0")
}
@ -305,6 +309,7 @@ func defaults() Config {
Startup: Startup{
APIWaitSeconds: 1200,
APIPollSeconds: 2,
ShutdownCooldownSeconds: 45,
RequireTimeSync: true,
TimeSyncWaitSeconds: 240,
TimeSyncPollSeconds: 5,
@ -394,6 +399,9 @@ func (c *Config) applyDefaults() {
if c.Startup.APIPollSeconds <= 0 {
c.Startup.APIPollSeconds = 2
}
if c.Startup.ShutdownCooldownSeconds <= 0 {
c.Startup.ShutdownCooldownSeconds = 45
}
if c.Startup.TimeSyncWaitSeconds <= 0 {
c.Startup.TimeSyncWaitSeconds = 240
}

View File

@ -106,11 +106,22 @@ state:
if cfg.Startup.VaultUnsealKeyFile == "" {
t.Fatalf("expected startup vault unseal key file default to be set")
}
if cfg.Startup.ShutdownCooldownSeconds <= 0 {
t.Fatalf("expected startup shutdown cooldown default > 0, got %d", cfg.Startup.ShutdownCooldownSeconds)
}
if cfg.Startup.VaultUnsealBreakglassTimeout <= 0 {
t.Fatalf("expected startup break-glass timeout default > 0, got %d", cfg.Startup.VaultUnsealBreakglassTimeout)
}
}
// TestValidateRejectsInvalidStartupShutdownCooldown ensures Validate refuses
// a non-positive startup shutdown_cooldown_seconds value.
func TestValidateRejectsInvalidStartupShutdownCooldown(t *testing.T) {
	cfg := defaults()
	cfg.Startup.ShutdownCooldownSeconds = 0

	err := cfg.Validate()
	if err == nil {
		t.Fatalf("expected validation error for invalid startup shutdown_cooldown_seconds")
	}
}
func TestValidateRejectsInvalidTimeSyncMode(t *testing.T) {
cfg := defaults()
cfg.Startup.TimeSyncMode = "invalid"

View File

@ -8,6 +8,10 @@ HECATE_CONFIG="${HECATE_CONFIG:-/etc/hecate/hecate.yaml}"
HECATE_COORDINATOR_RELAY="${HECATE_COORDINATOR_RELAY:-}"
LOG_DIR="${HECATE_DRILL_LOG_DIR:-/tmp/hecate-drills}"
STARTUP_TIMEOUT_SECONDS="${HECATE_DRILL_STARTUP_TIMEOUT_SECONDS:-1800}"
SHUTDOWN_TIMEOUT_SECONDS="${HECATE_DRILL_SHUTDOWN_TIMEOUT_SECONDS:-1800}"
SHUTDOWN_CONFIG="${HECATE_DRILL_SHUTDOWN_CONFIG:-/tmp/hecate-drill-no-poweroff.yaml}"
STARTUP_RETRY_DELAY_SECONDS="${HECATE_DRILL_STARTUP_RETRY_DELAY_SECONDS:-10}"
STARTUP_RETRY_MAX="${HECATE_DRILL_STARTUP_RETRY_MAX:-12}"
EXECUTE=0
usage() {
@ -21,6 +25,7 @@ Drills:
foundation-recovery Simulate vault/postgres/gitea outage and require layered restore.
reconciliation-resume Simulate global Flux suspend + source-controller down and require resume.
startup-intent-guard Assert startup is blocked when shutdown intent is active.
controlled-cycle Run full shutdown->startup recovery cycle (uses no-poweroff config).
Notes:
- Drills are intentionally disruptive and are not part of regular `make test`.
@ -74,6 +79,25 @@ wait_ready() {
"${KUBECTL}" -n "$ns" rollout status "${kind}/${name}" --timeout="${timeout}" >/dev/null
}
# Wait for the Keycloak workload in the sso namespace to become ready.
# Prefers deployment/keycloak and falls back to statefulset/keycloak; dies
# if neither exists. In plan (non-execute) mode only logs the intent.
# $1: rollout timeout passed through to wait_ready (e.g. "420s").
wait_ready_keycloak() {
  local timeout="$1"

  if [[ "${EXECUTE}" -eq 0 ]]; then
    log "plan: wait for sso keycloak rollout (${timeout}) [deployment preferred, fallback to statefulset]"
    return 0
  fi

  local kind
  for kind in deployment statefulset; do
    if "${KUBECTL}" -n sso get "${kind}" keycloak >/dev/null 2>&1; then
      wait_ready sso "${kind}" keycloak "${timeout}"
      return 0
    fi
  done

  die "keycloak workload not found in sso namespace (expected deployment/keycloak or statefulset/keycloak)"
}
run_hecate_startup() {
local reason="$1"
local cmd=(sudo "${HECATE_BIN}" startup --config "${HECATE_CONFIG}" --execute --force-flux-branch main --reason "${reason}")
@ -93,6 +117,48 @@ run_hecate_startup() {
fi
}
# Run `hecate shutdown` on the coordinator host over ssh, using the
# drill-specific config (SHUTDOWN_CONFIG) so the drill does not power the
# machines off. Bounded by SHUTDOWN_TIMEOUT_SECONDS.
# $1: reason string passed to hecate's --reason flag.
run_hecate_shutdown() {
local reason="$1"
local cmd=(sudo "${HECATE_BIN}" shutdown --config "${SHUTDOWN_CONFIG}" --execute --reason "${reason}")
# Plan mode: only log the ssh command that would be executed.
if [[ "${EXECUTE}" -eq 0 ]]; then
if [[ -n "${HECATE_COORDINATOR_RELAY}" ]]; then
log "plan: ssh ${HECATE_COORDINATOR_HOST} ${HECATE_COORDINATOR_RELAY} '${cmd[*]}'"
else
log "plan: ssh ${HECATE_COORDINATOR_HOST} '${cmd[*]}'"
fi
return 0
fi
if [[ -n "${HECATE_COORDINATOR_RELAY}" ]]; then
# Relay is intentionally left unquoted so a multi-word relay command
# splits into separate ssh arguments.
# shellcheck disable=SC2086
timeout "${SHUTDOWN_TIMEOUT_SECONDS}" ssh "${HECATE_COORDINATOR_HOST}" ${HECATE_COORDINATOR_RELAY} "${cmd[@]}"
else
timeout "${SHUTDOWN_TIMEOUT_SECONDS}" ssh "${HECATE_COORDINATOR_HOST}" "${cmd[@]}"
fi
}
# Retry `hecate startup` on the coordinator until it succeeds or
# STARTUP_RETRY_MAX attempts are exhausted, sleeping
# STARTUP_RETRY_DELAY_SECONDS between attempts. Dies on final failure.
# In plan (non-execute) mode only logs the retry-loop parameters.
# $1: reason string passed to hecate's --reason flag.
run_hecate_startup_with_retry() {
  local reason="$1"
  # Single-quote the reason: run_coordinator_bash re-evaluates this string
  # through a remote shell, so an unquoted reason containing spaces or
  # metacharacters would be word-split or interpreted there.
  local startup_cmd="sudo ${HECATE_BIN} startup --config ${HECATE_CONFIG} --execute --force-flux-branch main --reason '${reason}'"

  if [[ "${EXECUTE}" -eq 0 ]]; then
    log "plan: startup retry loop with max=${STARTUP_RETRY_MAX} delay=${STARTUP_RETRY_DELAY_SECONDS}s"
    return 0
  fi

  local attempt
  for attempt in $(seq 1 "${STARTUP_RETRY_MAX}"); do
    log "startup attempt ${attempt}/${STARTUP_RETRY_MAX}"
    if run_coordinator_bash "${startup_cmd}"; then
      return 0
    fi
    if [[ "${attempt}" -lt "${STARTUP_RETRY_MAX}" ]]; then
      log "startup attempt ${attempt} failed; retrying in ${STARTUP_RETRY_DELAY_SECONDS}s"
      sleep "${STARTUP_RETRY_DELAY_SECONDS}"
    fi
  done

  die "startup failed after ${STARTUP_RETRY_MAX} attempts"
}
run_coordinator_bash() {
local script="$1"
if [[ -n "${HECATE_COORDINATOR_RELAY}" ]]; then
@ -329,6 +395,35 @@ fi
log "pass: startup-intent-guard"
}
# Drill: run a full shutdown -> startup recovery cycle using the no-poweroff
# shutdown config, then verify every critical workload rolls back to ready.
run_drill_controlled_cycle() {
  CURRENT_RESOURCES=()
  ROLLBACK_FLUX_SUSPEND=0

  # The shutdown config must already exist (non-empty) on the coordinator.
  if [[ "${EXECUTE}" -eq 0 ]]; then
    log "plan: verify shutdown drill config exists on coordinator (${SHUTDOWN_CONFIG})"
  else
    run_coordinator_bash "[ -s '${SHUTDOWN_CONFIG}' ]" || die "shutdown drill config missing on coordinator: ${SHUTDOWN_CONFIG}"
  fi

  log "running controlled shutdown cycle (poweroff disabled config)"
  run_hecate_shutdown "drill-controlled-cycle-shutdown"

  log "running startup recovery cycle"
  run_hecate_startup_with_retry "drill-controlled-cycle-startup"

  log "verifying critical stack readiness after cycle"
  local spec
  for spec in \
    "flux-system deployment source-controller 240s" \
    "flux-system deployment kustomize-controller 240s" \
    "flux-system deployment helm-controller 240s" \
    "flux-system deployment notification-controller 240s" \
    "vault statefulset vault 420s" \
    "postgres statefulset postgres 420s" \
    "gitea deployment gitea 300s"; do
    # Word-splitting is intentional: each spec is "ns kind name timeout".
    # shellcheck disable=SC2086
    wait_ready ${spec}
  done
  wait_ready_keycloak 420s
  wait_ready maintenance deployment metis 300s

  log "pass: controlled-cycle"
}
main() {
need_cmd "${KUBECTL}"
need_cmd ssh
@ -375,6 +470,9 @@ main() {
startup-intent-guard)
run_drill_startup_intent_guard
;;
controlled-cycle)
run_drill_controlled_cycle
;;
*)
die "unknown drill: ${drill}"
;;