From c8c3304797a304faf8dca17a70769fdc7e1de7e7 Mon Sep 17 00:00:00 2001
From: Brad Stein
Date: Sun, 5 Apr 2026 20:25:14 -0300
Subject: [PATCH] startup: unblock on harbor during recovery and add controlled-cycle drill

---
 README.md                        |   3 +
 cmd/hecate/main.go               |  10 ++-
 configs/hecate.example.yaml      |   1 +
 configs/hecate.tethys.yaml       |   1 +
 configs/hecate.titan-db.yaml     |   1 +
 internal/cluster/orchestrator.go | 115 ++++++++++++++++++++++++++++---
 internal/config/config.go        |   8 +++
 internal/config/config_test.go   |  11 +++
 scripts/hecate-drills.sh         |  98 ++++++++++++++++++++++++++
 9 files changed, 237 insertions(+), 11 deletions(-)

diff --git a/README.md b/README.md
index bdaa491..96231d5 100644
--- a/README.md
+++ b/README.md
@@ -99,6 +99,7 @@ See `configs/hecate.example.yaml`.
 Break-glass unseal fallback knobs:
 - `startup.vault_unseal_breakglass_command`: optional shell command that prints the unseal key to stdout.
 - `startup.vault_unseal_breakglass_timeout_seconds`: timeout for the command (default `15`).
+- `startup.shutdown_cooldown_seconds`: cooldown window after shutdown completion before startup proceeds (default `45`).
 
 UPS auto-shutdown trigger uses:
 - runtime threshold = `runtime_safety_factor * estimated_shutdown_budget`
@@ -119,6 +120,7 @@ Power metrics:
 
 - Default behavior for `startup` and `shutdown` is dry-run unless `--execute` is set.
 - Hecate tracks intent in `/var/lib/hecate/intent.json` (`normal`, `startup_in_progress`, `shutting_down`, `shutdown_complete`) to avoid startup/shutdown fighting each other.
+- Startup now waits out the recent-shutdown cooldown window instead of failing immediately when shutdown completed moments ago.
 - In multi-instance setups, set `coordination.peer_hosts` on each host (for example `titan-db` <-> `titan-24`) so startup guards account for remote intent too.
 - `hecate-bootstrap.service` is enabled to run at host boot and perform staged startup automatically.
 - `HECATE_ENABLE_BOOTSTRAP=1` forces bootstrap on, `HECATE_ENABLE_BOOTSTRAP=0` forces it off, and `auto` enables by default.
@@ -144,5 +146,6 @@ Hecate includes scripted disruptive drills that intentionally break critical ser
 - `scripts/hecate-drills.sh run flux-gitea-deadlock --execute`
 - `scripts/hecate-drills.sh run foundation-recovery --execute`
 - `scripts/hecate-drills.sh run reconciliation-resume --execute`
+- `scripts/hecate-drills.sh run controlled-cycle --execute` (uses `HECATE_DRILL_SHUTDOWN_CONFIG`, defaults to `/tmp/hecate-drill-no-poweroff.yaml`)
 
 These drills are intentionally **not** part of regular `go test ./...`.
diff --git a/cmd/hecate/main.go b/cmd/hecate/main.go
index 90c9463..61933cf 100644
--- a/cmd/hecate/main.go
+++ b/cmd/hecate/main.go
@@ -520,7 +520,7 @@ func coordinatorAllowsPeerFallbackStartup(ctx context.Context, cfg config.Config
 	if remoteIntent.UpdatedAt.IsZero() {
 		return false, "coordinator reported shutdown_complete with unknown age", nil
 	}
-	if intentAge <= 45*time.Second {
+	if intentAge <= startupShutdownCooldown(cfg) {
 		return false, fmt.Sprintf("coordinator recently completed shutdown (%s ago)", intentAge.Round(time.Second)), nil
 	}
 	return true, "coordinator shutdown_complete is old enough", nil
@@ -621,3 +621,11 @@ func maxInt(a, b int) int {
 	}
 	return b
 }
+
+func startupShutdownCooldown(cfg config.Config) time.Duration {
+	seconds := cfg.Startup.ShutdownCooldownSeconds
+	if seconds <= 0 {
+		seconds = 45
+	}
+	return time.Duration(seconds) * time.Second
+}
diff --git a/configs/hecate.example.yaml b/configs/hecate.example.yaml
index 98d5f82..fd7b5ee 100644
--- a/configs/hecate.example.yaml
+++ b/configs/hecate.example.yaml
@@ -45,6 +45,7 @@ excluded_namespaces:
 startup:
   api_wait_seconds: 1200
   api_poll_seconds: 2
+  shutdown_cooldown_seconds: 45
   require_time_sync: true
   time_sync_wait_seconds: 240
   time_sync_poll_seconds: 5
diff --git a/configs/hecate.tethys.yaml b/configs/hecate.tethys.yaml
index 079035a..4573676 100644
--- a/configs/hecate.tethys.yaml
+++ b/configs/hecate.tethys.yaml
@@ -111,6 +111,7 @@ excluded_namespaces:
 startup:
   api_wait_seconds: 1200
   api_poll_seconds: 2
+  shutdown_cooldown_seconds: 45
   require_time_sync: true
   time_sync_wait_seconds: 240
   time_sync_poll_seconds: 5
diff --git a/configs/hecate.titan-db.yaml b/configs/hecate.titan-db.yaml
index 8ef16d2..61277ab 100644
--- a/configs/hecate.titan-db.yaml
+++ b/configs/hecate.titan-db.yaml
@@ -111,6 +111,7 @@ excluded_namespaces:
 startup:
   api_wait_seconds: 1200
   api_poll_seconds: 2
+  shutdown_cooldown_seconds: 45
   require_time_sync: true
   time_sync_wait_seconds: 240
   time_sync_poll_seconds: 5
diff --git a/internal/cluster/orchestrator.go b/internal/cluster/orchestrator.go
index bdb7bbb..cf438a0 100644
--- a/internal/cluster/orchestrator.go
+++ b/internal/cluster/orchestrator.go
@@ -77,8 +77,6 @@ var criticalStartupWorkloads = []startupWorkload{
 	{Namespace: "vault", Kind: "statefulset", Name: "vault"},
 	{Namespace: "postgres", Kind: "statefulset", Name: "postgres"},
 	{Namespace: "gitea", Kind: "deployment", Name: "gitea"},
-	{Namespace: "harbor", Kind: "statefulset", Name: "harbor-redis"},
-	{Namespace: "harbor", Kind: "deployment", Name: "harbor-registry"},
 }
 
 var ErrEtcdRestoreNotApplicable = errors.New("etcd restore not applicable")
@@ -135,8 +133,32 @@ func (o *Orchestrator) Startup(ctx context.Context, opts StartupOptions) (err er
 		}
 		currentIntent = state.Intent{State: state.IntentNormal}
 	}
-	if currentIntent.State == state.IntentShutdownComplete && intentFresh(currentIntent, 45*time.Second) {
-		return fmt.Errorf("startup blocked: shutdown completed too recently (%s ago)", intentAge(currentIntent).Round(time.Second))
+	cooldown := o.startupShutdownCooldown()
+	if currentIntent.State == state.IntentShutdownComplete && intentFresh(currentIntent, cooldown) {
+		elapsed := intentAge(currentIntent)
+		remaining := cooldown - elapsed
+		if remaining < time.Second {
+			remaining = time.Second
+		}
+		o.log.Printf("startup cooldown active: last shutdown completed %s ago; waiting %s", elapsed.Round(time.Second), remaining.Round(time.Second))
+		timer := time.NewTimer(remaining)
+		select {
+		case <-ctx.Done():
+			timer.Stop()
+			return fmt.Errorf("startup canceled while waiting for shutdown cooldown: %w", ctx.Err())
+		case <-timer.C:
+		}
+		refreshed, readErr := state.ReadIntent(o.cfg.State.IntentPath)
+		if readErr != nil {
+			return fmt.Errorf("re-read startup intent after cooldown wait: %w", readErr)
+		}
+		currentIntent = refreshed
+		if currentIntent.State == state.IntentShuttingDown && intentFresh(currentIntent, o.startupGuardAge()) {
+			return fmt.Errorf("startup blocked: shutdown intent became active during cooldown wait (%s)", currentIntent.Reason)
+		}
+		if currentIntent.State == state.IntentShutdownComplete && intentFresh(currentIntent, cooldown) {
+			return fmt.Errorf("startup blocked: shutdown completed too recently (%s ago)", intentAge(currentIntent).Round(time.Second))
+		}
 	}
 	if err := o.guardPeerStartupIntents(ctx); err != nil {
 		return err
@@ -789,6 +811,12 @@ func (o *Orchestrator) readScaledWorkloadSnapshot() (*workloadScaleSnapshot, err
 	return &snapshot, nil
 }
 
+type drainFailure struct {
+	node string
+	err error
+	details string
+}
+
 func (o *Orchestrator) drainWorkers(ctx context.Context, workers []string) error {
 	total := len(workers)
 	if total == 0 {
@@ -805,7 +833,7 @@ func (o *Orchestrator) drainWorkers(ctx context.Context, workers []string) error
 	o.log.Printf("drain workers total=%d parallelism=%d", total, parallelism)
 	sem := make(chan struct{}, parallelism)
 	var wg sync.WaitGroup
-	errCh := make(chan error, total)
+	errCh := make(chan drainFailure, total)
 
 	for idx, node := range workers {
 		idx := idx
@@ -821,7 +849,12 @@ func (o *Orchestrator) drainWorkers(ctx context.Context, workers []string) error
 				o.log.Printf("warning: cordon %s failed: %v", node, err)
 			}
 			if _, err := o.kubectl(ctx, 3*time.Minute, "drain", node, "--ignore-daemonsets", "--delete-emptydir-data", "--grace-period=30", "--timeout=180s"); err != nil {
-				errCh <- fmt.Errorf("drain %s failed: %w", node, err)
+				details := o.drainNodeDiagnostics(ctx, node)
+				errCh <- drainFailure{
+					node: node,
+					err: fmt.Errorf("drain %s failed: %w", node, err),
+					details: details,
+				}
 				return
 			}
 		}()
@@ -832,10 +865,18 @@ func (o *Orchestrator) drainWorkers(ctx context.Context, workers []string) error
 	if len(errCh) == 0 {
 		return nil
 	}
-	count := len(errCh)
+	failures := make([]drainFailure, 0, len(errCh))
+	for failure := range errCh {
+		failures = append(failures, failure)
+	}
+	count := len(failures)
 	samples := []string{}
-	for err := range errCh {
-		samples = append(samples, err.Error())
+	for _, failure := range failures {
+		msg := failure.err.Error()
+		if strings.TrimSpace(failure.details) != "" {
+			msg = fmt.Sprintf("%s (details: %s)", msg, failure.details)
+		}
+		samples = append(samples, msg)
 		if len(samples) >= 4 {
 			break
 		}
@@ -843,6 +884,52 @@ func (o *Orchestrator) drainWorkers(ctx context.Context, workers []string) error
 	return fmt.Errorf("drain workers had %d errors (first: %s)", count, strings.Join(samples, " | "))
 }
 
+func (o *Orchestrator) drainNodeDiagnostics(ctx context.Context, node string) string {
+	out, err := o.kubectl(
+		ctx,
+		20*time.Second,
+		"get",
+		"pods",
+		"-A",
+		"--field-selector", "spec.nodeName="+node,
+		"-o",
+		"custom-columns=NS:.metadata.namespace,NAME:.metadata.name,PHASE:.status.phase,OWNER:.metadata.ownerReferences[0].kind",
+		"--no-headers",
+	)
+	if err != nil {
+		if strings.TrimSpace(out) == "" {
+			return fmt.Sprintf("diagnostics unavailable: %v", err)
+		}
+		return fmt.Sprintf("diagnostics unavailable: %v (%s)", err, strings.Join(lines(out), "; "))
+	}
+
+	blockers := make([]string, 0, 6)
+	for _, line := range lines(out) {
+		fields := strings.Fields(line)
+		if len(fields) < 4 {
+			continue
+		}
+		namespace := fields[0]
+		name := fields[1]
+		phase := fields[2]
+		owner := fields[3]
+		if strings.EqualFold(owner, "DaemonSet") {
+			continue
+		}
+		if strings.EqualFold(phase, "Succeeded") || strings.EqualFold(phase, "Failed") {
+			continue
+		}
+		blockers = append(blockers, fmt.Sprintf("%s/%s(phase=%s owner=%s)", namespace, name, phase, owner))
+		if len(blockers) >= 6 {
+			break
+		}
+	}
+	if len(blockers) == 0 {
+		return "no non-daemonset blocking pods found on node"
+	}
+	return strings.Join(blockers, ", ")
+}
+
 func (o *Orchestrator) uncordonWorkers(ctx context.Context, workers []string) error {
 	for _, node := range workers {
 		if _, err := o.kubectl(ctx, 20*time.Second, "uncordon", node); err != nil {
@@ -969,6 +1056,14 @@ func (o *Orchestrator) startupGuardAge() time.Duration {
 	return time.Duration(seconds) * time.Second
 }
 
+func (o *Orchestrator) startupShutdownCooldown() time.Duration {
+	seconds := o.cfg.Startup.ShutdownCooldownSeconds
+	if seconds <= 0 {
+		seconds = 45
+	}
+	return time.Duration(seconds) * time.Second
+}
+
 func (o *Orchestrator) coordinationPeers() []string {
 	seen := map[string]struct{}{}
 	out := make([]string, 0, len(o.cfg.Coordination.PeerHosts)+1)
@@ -1018,7 +1113,7 @@ func (o *Orchestrator) guardPeerStartupIntents(ctx context.Context) error {
 			}
 			o.log.Printf("warning: peer %s startup intent appears stale; allowing startup", peer)
 		case state.IntentShutdownComplete:
-			if intentFresh(intent, 45*time.Second) {
+			if intentFresh(intent, o.startupShutdownCooldown()) {
 				return fmt.Errorf("startup blocked: peer %s completed shutdown too recently (age=%s)", peer, intentAge(intent).Round(time.Second))
 			}
 		default:
diff --git a/internal/config/config.go b/internal/config/config.go
index c23b241..024be5c 100644
--- a/internal/config/config.go
+++ b/internal/config/config.go
@@ -36,6 +36,7 @@ type Config struct {
 type Startup struct {
 	APIWaitSeconds int `yaml:"api_wait_seconds"`
 	APIPollSeconds int `yaml:"api_poll_seconds"`
+	ShutdownCooldownSeconds int `yaml:"shutdown_cooldown_seconds"`
 	RequireTimeSync bool `yaml:"require_time_sync"`
 	TimeSyncWaitSeconds int `yaml:"time_sync_wait_seconds"`
 	TimeSyncPollSeconds int `yaml:"time_sync_poll_seconds"`
@@ -172,6 +173,9 @@ func (c Config) Validate() error {
 	if c.Startup.APIPollSeconds <= 0 {
 		return fmt.Errorf("config.startup.api_poll_seconds must be > 0")
 	}
+	if c.Startup.ShutdownCooldownSeconds <= 0 {
+		return fmt.Errorf("config.startup.shutdown_cooldown_seconds must be > 0")
+	}
 	if c.Startup.TimeSyncWaitSeconds <= 0 {
 		return fmt.Errorf("config.startup.time_sync_wait_seconds must be > 0")
 	}
@@ -305,6 +309,7 @@ func defaults() Config {
 		Startup: Startup{
 			APIWaitSeconds: 1200,
 			APIPollSeconds: 2,
+			ShutdownCooldownSeconds: 45,
 			RequireTimeSync: true,
 			TimeSyncWaitSeconds: 240,
 			TimeSyncPollSeconds: 5,
@@ -394,6 +399,9 @@ func (c *Config) applyDefaults() {
 	if c.Startup.APIPollSeconds <= 0 {
 		c.Startup.APIPollSeconds = 2
 	}
+	if c.Startup.ShutdownCooldownSeconds <= 0 {
+		c.Startup.ShutdownCooldownSeconds = 45
+	}
 	if c.Startup.TimeSyncWaitSeconds <= 0 {
 		c.Startup.TimeSyncWaitSeconds = 240
 	}
diff --git a/internal/config/config_test.go b/internal/config/config_test.go
index 2487aeb..3c21f76 100644
--- a/internal/config/config_test.go
+++ b/internal/config/config_test.go
@@ -106,11 +106,22 @@ state:
 	if cfg.Startup.VaultUnsealKeyFile == "" {
 		t.Fatalf("expected startup vault unseal key file default to be set")
 	}
+	if cfg.Startup.ShutdownCooldownSeconds <= 0 {
+		t.Fatalf("expected startup shutdown cooldown default > 0, got %d", cfg.Startup.ShutdownCooldownSeconds)
+	}
 	if cfg.Startup.VaultUnsealBreakglassTimeout <= 0 {
 		t.Fatalf("expected startup break-glass timeout default > 0, got %d", cfg.Startup.VaultUnsealBreakglassTimeout)
 	}
 }
 
+func TestValidateRejectsInvalidStartupShutdownCooldown(t *testing.T) {
+	cfg := defaults()
+	cfg.Startup.ShutdownCooldownSeconds = 0
+	if err := cfg.Validate(); err == nil {
+		t.Fatalf("expected validation error for invalid startup shutdown_cooldown_seconds")
+	}
+}
+
 func TestValidateRejectsInvalidTimeSyncMode(t *testing.T) {
 	cfg := defaults()
 	cfg.Startup.TimeSyncMode = "invalid"
diff --git a/scripts/hecate-drills.sh b/scripts/hecate-drills.sh
index 611f606..51e20cc 100755
--- a/scripts/hecate-drills.sh
+++ b/scripts/hecate-drills.sh
@@ -8,6 +8,10 @@ HECATE_CONFIG="${HECATE_CONFIG:-/etc/hecate/hecate.yaml}"
 HECATE_COORDINATOR_RELAY="${HECATE_COORDINATOR_RELAY:-}"
 LOG_DIR="${HECATE_DRILL_LOG_DIR:-/tmp/hecate-drills}"
 STARTUP_TIMEOUT_SECONDS="${HECATE_DRILL_STARTUP_TIMEOUT_SECONDS:-1800}"
+SHUTDOWN_TIMEOUT_SECONDS="${HECATE_DRILL_SHUTDOWN_TIMEOUT_SECONDS:-1800}"
+SHUTDOWN_CONFIG="${HECATE_DRILL_SHUTDOWN_CONFIG:-/tmp/hecate-drill-no-poweroff.yaml}"
+STARTUP_RETRY_DELAY_SECONDS="${HECATE_DRILL_STARTUP_RETRY_DELAY_SECONDS:-10}"
+STARTUP_RETRY_MAX="${HECATE_DRILL_STARTUP_RETRY_MAX:-12}"
 EXECUTE=0
 
 usage() {
@@ -21,6 +25,7 @@ Drills:
   foundation-recovery    Simulate vault/postgres/gitea outage and require layered restore.
   reconciliation-resume  Simulate global Flux suspend + source-controller down and require resume.
   startup-intent-guard   Assert startup is blocked when shutdown intent is active.
+  controlled-cycle       Run full shutdown->startup recovery cycle (uses no-poweroff config).
 
 Notes:
 - Drills are intentionally disruptive and are not part of regular `make test`.
@@ -74,6 +79,25 @@ wait_ready() {
   "${KUBECTL}" -n "$ns" rollout status "${kind}/${name}" --timeout="${timeout}" >/dev/null
 }
 
+wait_ready_keycloak() {
+  local timeout="$1"
+  if [[ "${EXECUTE}" -eq 0 ]]; then
+    log "plan: wait for sso keycloak rollout (${timeout}) [deployment preferred, fallback to statefulset]"
+    return 0
+  fi
+
+  if "${KUBECTL}" -n sso get deployment keycloak >/dev/null 2>&1; then
+    wait_ready sso deployment keycloak "${timeout}"
+    return 0
+  fi
+  if "${KUBECTL}" -n sso get statefulset keycloak >/dev/null 2>&1; then
+    wait_ready sso statefulset keycloak "${timeout}"
+    return 0
+  fi
+
+  die "keycloak workload not found in sso namespace (expected deployment/keycloak or statefulset/keycloak)"
+}
+
 run_hecate_startup() {
   local reason="$1"
   local cmd=(sudo "${HECATE_BIN}" startup --config "${HECATE_CONFIG}" --execute --force-flux-branch main --reason "${reason}")
@@ -93,6 +117,48 @@ run_hecate_startup() {
   fi
 }
 
+run_hecate_shutdown() {
+  local reason="$1"
+  local cmd=(sudo "${HECATE_BIN}" shutdown --config "${SHUTDOWN_CONFIG}" --execute --reason "${reason}")
+  if [[ "${EXECUTE}" -eq 0 ]]; then
+    if [[ -n "${HECATE_COORDINATOR_RELAY}" ]]; then
+      log "plan: ssh ${HECATE_COORDINATOR_HOST} ${HECATE_COORDINATOR_RELAY} '${cmd[*]}'"
+    else
+      log "plan: ssh ${HECATE_COORDINATOR_HOST} '${cmd[*]}'"
+    fi
+    return 0
+  fi
+  if [[ -n "${HECATE_COORDINATOR_RELAY}" ]]; then
+    # shellcheck disable=SC2086
+    timeout "${SHUTDOWN_TIMEOUT_SECONDS}" ssh "${HECATE_COORDINATOR_HOST}" ${HECATE_COORDINATOR_RELAY} "${cmd[@]}"
+  else
+    timeout "${SHUTDOWN_TIMEOUT_SECONDS}" ssh "${HECATE_COORDINATOR_HOST}" "${cmd[@]}"
+  fi
+}
+
+run_hecate_startup_with_retry() {
+  local reason="$1"
+  local startup_cmd="sudo ${HECATE_BIN} startup --config ${HECATE_CONFIG} --execute --force-flux-branch main --reason ${reason}"
+
+  if [[ "${EXECUTE}" -eq 0 ]]; then
+    log "plan: startup retry loop with max=${STARTUP_RETRY_MAX} delay=${STARTUP_RETRY_DELAY_SECONDS}s"
+    return 0
+  fi
+
+  local attempt
+  for attempt in $(seq 1 "${STARTUP_RETRY_MAX}"); do
+    log "startup attempt ${attempt}/${STARTUP_RETRY_MAX}"
+    if run_coordinator_bash "${startup_cmd}"; then
+      return 0
+    fi
+    if [[ "${attempt}" -lt "${STARTUP_RETRY_MAX}" ]]; then
+      log "startup attempt ${attempt} failed; retrying in ${STARTUP_RETRY_DELAY_SECONDS}s"
+      sleep "${STARTUP_RETRY_DELAY_SECONDS}"
+    fi
+  done
+  die "startup failed after ${STARTUP_RETRY_MAX} attempts"
+}
+
 run_coordinator_bash() {
   local script="$1"
   if [[ -n "${HECATE_COORDINATOR_RELAY}" ]]; then
@@ -329,6 +395,35 @@
   fi
   log "pass: startup-intent-guard"
 }
+run_drill_controlled_cycle() {
+  CURRENT_RESOURCES=()
+  ROLLBACK_FLUX_SUSPEND=0
+
+  if [[ "${EXECUTE}" -eq 0 ]]; then
+    log "plan: verify shutdown drill config exists on coordinator (${SHUTDOWN_CONFIG})"
+  else
+    run_coordinator_bash "[ -s '${SHUTDOWN_CONFIG}' ]" || die "shutdown drill config missing on coordinator: ${SHUTDOWN_CONFIG}"
+  fi
+
+  log "running controlled shutdown cycle (poweroff disabled config)"
+  run_hecate_shutdown "drill-controlled-cycle-shutdown"
+
+  log "running startup recovery cycle"
+  run_hecate_startup_with_retry "drill-controlled-cycle-startup"
+
+  log "verifying critical stack readiness after cycle"
+  wait_ready flux-system deployment source-controller 240s
+  wait_ready flux-system deployment kustomize-controller 240s
+  wait_ready flux-system deployment helm-controller 240s
+  wait_ready flux-system deployment notification-controller 240s
+  wait_ready vault statefulset vault 420s
+  wait_ready postgres statefulset postgres 420s
+  wait_ready gitea deployment gitea 300s
+  wait_ready_keycloak 420s
+  wait_ready maintenance deployment metis 300s
+  log "pass: controlled-cycle"
+}
+
 main() {
   need_cmd "${KUBECTL}"
   need_cmd ssh
@@ -375,6 +470,9 @@ main() {
     startup-intent-guard)
       run_drill_startup_intent_guard
      ;;
+    controlled-cycle)
+      run_drill_controlled_cycle
+      ;;
     *)
       die "unknown drill: ${drill}"
       ;;
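
For reference, a typical exercise of the new drill might look like the sketch below. The config path and retry values are only the defaults this patch documents, not required settings; omit the environment overrides to use them implicitly.

  # Plan only (default): prints the shutdown, startup-retry, and readiness steps without touching the cluster.
  scripts/hecate-drills.sh run controlled-cycle

  # Execute the full cycle, pointing the shutdown phase at a no-poweroff config staged on the coordinator.
  HECATE_DRILL_SHUTDOWN_CONFIG=/tmp/hecate-drill-no-poweroff.yaml \
  HECATE_DRILL_STARTUP_RETRY_MAX=12 \
  HECATE_DRILL_STARTUP_RETRY_DELAY_SECONDS=10 \
  scripts/hecate-drills.sh run controlled-cycle --execute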