startup: unblock on harbor during recovery and add controlled-cycle drill

This commit is contained in:
Brad Stein 2026-04-05 20:25:14 -03:00
parent 11a2f66e41
commit c8c3304797
9 changed files with 237 additions and 11 deletions

View File

@ -99,6 +99,7 @@ See `configs/hecate.example.yaml`.
Break-glass unseal fallback knobs:
- `startup.vault_unseal_breakglass_command`: optional shell command that prints the unseal key to stdout.
- `startup.vault_unseal_breakglass_timeout_seconds`: timeout for the command (default `15`).
- `startup.shutdown_cooldown_seconds`: cooldown window after shutdown completion before startup proceeds (default `45`).
UPS auto-shutdown trigger uses:
- runtime threshold = `runtime_safety_factor * estimated_shutdown_budget`
@ -119,6 +120,7 @@ Power metrics:
- Default behavior for `startup` and `shutdown` is dry-run unless `--execute` is set.
- Hecate tracks intent in `/var/lib/hecate/intent.json` (`normal`, `startup_in_progress`, `shutting_down`, `shutdown_complete`) to avoid startup/shutdown fighting each other.
- Startup now waits out the recent-shutdown cooldown window instead of failing immediately when shutdown completed moments ago.
- In multi-instance setups, set `coordination.peer_hosts` on each host (for example `titan-db` <-> `titan-24`) so startup guards account for remote intent too.
- `hecate-bootstrap.service` is enabled to run at host boot and perform staged startup automatically.
- `HECATE_ENABLE_BOOTSTRAP=1` forces bootstrap on, `HECATE_ENABLE_BOOTSTRAP=0` forces it off, and `auto` enables by default.
@ -144,5 +146,6 @@ Hecate includes scripted disruptive drills that intentionally break critical ser
- `scripts/hecate-drills.sh run flux-gitea-deadlock --execute`
- `scripts/hecate-drills.sh run foundation-recovery --execute`
- `scripts/hecate-drills.sh run reconciliation-resume --execute`
- `scripts/hecate-drills.sh run controlled-cycle --execute` (uses `HECATE_DRILL_SHUTDOWN_CONFIG`, defaults to `/tmp/hecate-drill-no-poweroff.yaml`)
These drills are intentionally **not** part of regular `go test ./...`.

View File

@ -520,7 +520,7 @@ func coordinatorAllowsPeerFallbackStartup(ctx context.Context, cfg config.Config
if remoteIntent.UpdatedAt.IsZero() {
return false, "coordinator reported shutdown_complete with unknown age", nil
}
if intentAge <= 45*time.Second {
if intentAge <= startupShutdownCooldown(cfg) {
return false, fmt.Sprintf("coordinator recently completed shutdown (%s ago)", intentAge.Round(time.Second)), nil
}
return true, "coordinator shutdown_complete is old enough", nil
@ -621,3 +621,11 @@ func maxInt(a, b int) int {
}
return b
}
// startupShutdownCooldown returns the post-shutdown cooldown window taken
// from config, falling back to 45s when the configured value is unset or
// non-positive (mirrors the config package default).
func startupShutdownCooldown(cfg config.Config) time.Duration {
	const fallbackSeconds = 45
	s := cfg.Startup.ShutdownCooldownSeconds
	if s <= 0 {
		s = fallbackSeconds
	}
	return time.Duration(s) * time.Second
}

View File

@ -45,6 +45,7 @@ excluded_namespaces:
startup:
api_wait_seconds: 1200
api_poll_seconds: 2
shutdown_cooldown_seconds: 45
require_time_sync: true
time_sync_wait_seconds: 240
time_sync_poll_seconds: 5

View File

@ -111,6 +111,7 @@ excluded_namespaces:
startup:
api_wait_seconds: 1200
api_poll_seconds: 2
shutdown_cooldown_seconds: 45
require_time_sync: true
time_sync_wait_seconds: 240
time_sync_poll_seconds: 5

View File

@ -111,6 +111,7 @@ excluded_namespaces:
startup:
api_wait_seconds: 1200
api_poll_seconds: 2
shutdown_cooldown_seconds: 45
require_time_sync: true
time_sync_wait_seconds: 240
time_sync_poll_seconds: 5

View File

@ -77,8 +77,6 @@ var criticalStartupWorkloads = []startupWorkload{
{Namespace: "vault", Kind: "statefulset", Name: "vault"},
{Namespace: "postgres", Kind: "statefulset", Name: "postgres"},
{Namespace: "gitea", Kind: "deployment", Name: "gitea"},
{Namespace: "harbor", Kind: "statefulset", Name: "harbor-redis"},
{Namespace: "harbor", Kind: "deployment", Name: "harbor-registry"},
}
var ErrEtcdRestoreNotApplicable = errors.New("etcd restore not applicable")
@ -135,8 +133,32 @@ func (o *Orchestrator) Startup(ctx context.Context, opts StartupOptions) (err er
}
currentIntent = state.Intent{State: state.IntentNormal}
}
if currentIntent.State == state.IntentShutdownComplete && intentFresh(currentIntent, 45*time.Second) {
return fmt.Errorf("startup blocked: shutdown completed too recently (%s ago)", intentAge(currentIntent).Round(time.Second))
cooldown := o.startupShutdownCooldown()
if currentIntent.State == state.IntentShutdownComplete && intentFresh(currentIntent, cooldown) {
elapsed := intentAge(currentIntent)
remaining := cooldown - elapsed
if remaining < time.Second {
remaining = time.Second
}
o.log.Printf("startup cooldown active: last shutdown completed %s ago; waiting %s", elapsed.Round(time.Second), remaining.Round(time.Second))
timer := time.NewTimer(remaining)
select {
case <-ctx.Done():
timer.Stop()
return fmt.Errorf("startup canceled while waiting for shutdown cooldown: %w", ctx.Err())
case <-timer.C:
}
refreshed, readErr := state.ReadIntent(o.cfg.State.IntentPath)
if readErr != nil {
return fmt.Errorf("re-read startup intent after cooldown wait: %w", readErr)
}
currentIntent = refreshed
if currentIntent.State == state.IntentShuttingDown && intentFresh(currentIntent, o.startupGuardAge()) {
return fmt.Errorf("startup blocked: shutdown intent became active during cooldown wait (%s)", currentIntent.Reason)
}
if currentIntent.State == state.IntentShutdownComplete && intentFresh(currentIntent, cooldown) {
return fmt.Errorf("startup blocked: shutdown completed too recently (%s ago)", intentAge(currentIntent).Round(time.Second))
}
}
if err := o.guardPeerStartupIntents(ctx); err != nil {
return err
@ -789,6 +811,12 @@ func (o *Orchestrator) readScaledWorkloadSnapshot() (*workloadScaleSnapshot, err
return &snapshot, nil
}
// drainFailure records the outcome of one failed worker-node drain so the
// caller can report which node broke and why.
type drainFailure struct {
node string // worker node whose drain failed
err error // the drain error, wrapping the kubectl failure
details string // pod-level diagnostics from drainNodeDiagnostics (may be empty)
}
func (o *Orchestrator) drainWorkers(ctx context.Context, workers []string) error {
total := len(workers)
if total == 0 {
@ -805,7 +833,7 @@ func (o *Orchestrator) drainWorkers(ctx context.Context, workers []string) error
o.log.Printf("drain workers total=%d parallelism=%d", total, parallelism)
sem := make(chan struct{}, parallelism)
var wg sync.WaitGroup
errCh := make(chan error, total)
errCh := make(chan drainFailure, total)
for idx, node := range workers {
idx := idx
@ -821,7 +849,12 @@ func (o *Orchestrator) drainWorkers(ctx context.Context, workers []string) error
o.log.Printf("warning: cordon %s failed: %v", node, err)
}
if _, err := o.kubectl(ctx, 3*time.Minute, "drain", node, "--ignore-daemonsets", "--delete-emptydir-data", "--grace-period=30", "--timeout=180s"); err != nil {
errCh <- fmt.Errorf("drain %s failed: %w", node, err)
details := o.drainNodeDiagnostics(ctx, node)
errCh <- drainFailure{
node: node,
err: fmt.Errorf("drain %s failed: %w", node, err),
details: details,
}
return
}
}()
@ -832,10 +865,18 @@ func (o *Orchestrator) drainWorkers(ctx context.Context, workers []string) error
if len(errCh) == 0 {
return nil
}
count := len(errCh)
failures := make([]drainFailure, 0, len(errCh))
for failure := range errCh {
failures = append(failures, failure)
}
count := len(failures)
samples := []string{}
for err := range errCh {
samples = append(samples, err.Error())
for _, failure := range failures {
msg := failure.err.Error()
if strings.TrimSpace(failure.details) != "" {
msg = fmt.Sprintf("%s (details: %s)", msg, failure.details)
}
samples = append(samples, msg)
if len(samples) >= 4 {
break
}
@ -843,6 +884,52 @@ func (o *Orchestrator) drainWorkers(ctx context.Context, workers []string) error
return fmt.Errorf("drain workers had %d errors (first: %s)", count, strings.Join(samples, " | "))
}
// drainNodeDiagnostics inspects the pods still scheduled on node and returns
// a short, human-readable summary of likely drain blockers. It never returns
// an error: failures to gather diagnostics are folded into the summary text.
func (o *Orchestrator) drainNodeDiagnostics(ctx context.Context, node string) string {
	out, err := o.kubectl(
		ctx,
		20*time.Second,
		"get",
		"pods",
		"-A",
		"--field-selector", "spec.nodeName="+node,
		"-o",
		"custom-columns=NS:.metadata.namespace,NAME:.metadata.name,PHASE:.status.phase,OWNER:.metadata.ownerReferences[0].kind",
		"--no-headers",
	)
	if err != nil {
		// kubectl itself failed; surface whatever partial output it produced.
		if strings.TrimSpace(out) == "" {
			return fmt.Sprintf("diagnostics unavailable: %v", err)
		}
		return fmt.Sprintf("diagnostics unavailable: %v (%s)", err, strings.Join(lines(out), "; "))
	}

	const maxBlockers = 6
	blockers := make([]string, 0, maxBlockers)
	for _, row := range lines(out) {
		cols := strings.Fields(row)
		if len(cols) < 4 {
			continue
		}
		ns, pod, phase, owner := cols[0], cols[1], cols[2], cols[3]
		// DaemonSet pods are expected to remain on the node, and pods that
		// already reached a terminal phase cannot block a drain.
		if strings.EqualFold(owner, "DaemonSet") ||
			strings.EqualFold(phase, "Succeeded") ||
			strings.EqualFold(phase, "Failed") {
			continue
		}
		blockers = append(blockers, fmt.Sprintf("%s/%s(phase=%s owner=%s)", ns, pod, phase, owner))
		if len(blockers) == maxBlockers {
			break
		}
	}
	if len(blockers) == 0 {
		return "no non-daemonset blocking pods found on node"
	}
	return strings.Join(blockers, ", ")
}
func (o *Orchestrator) uncordonWorkers(ctx context.Context, workers []string) error {
for _, node := range workers {
if _, err := o.kubectl(ctx, 20*time.Second, "uncordon", node); err != nil {
@ -969,6 +1056,14 @@ func (o *Orchestrator) startupGuardAge() time.Duration {
return time.Duration(seconds) * time.Second
}
// startupShutdownCooldown reports how long startup must wait after a recently
// completed shutdown before proceeding. Falls back to 45s when the configured
// value is unset or non-positive.
func (o *Orchestrator) startupShutdownCooldown() time.Duration {
	if s := o.cfg.Startup.ShutdownCooldownSeconds; s > 0 {
		return time.Duration(s) * time.Second
	}
	return 45 * time.Second
}
func (o *Orchestrator) coordinationPeers() []string {
seen := map[string]struct{}{}
out := make([]string, 0, len(o.cfg.Coordination.PeerHosts)+1)
@ -1018,7 +1113,7 @@ func (o *Orchestrator) guardPeerStartupIntents(ctx context.Context) error {
}
o.log.Printf("warning: peer %s startup intent appears stale; allowing startup", peer)
case state.IntentShutdownComplete:
if intentFresh(intent, 45*time.Second) {
if intentFresh(intent, o.startupShutdownCooldown()) {
return fmt.Errorf("startup blocked: peer %s completed shutdown too recently (age=%s)", peer, intentAge(intent).Round(time.Second))
}
default:

View File

@ -36,6 +36,7 @@ type Config struct {
type Startup struct {
APIWaitSeconds int `yaml:"api_wait_seconds"`
APIPollSeconds int `yaml:"api_poll_seconds"`
ShutdownCooldownSeconds int `yaml:"shutdown_cooldown_seconds"`
RequireTimeSync bool `yaml:"require_time_sync"`
TimeSyncWaitSeconds int `yaml:"time_sync_wait_seconds"`
TimeSyncPollSeconds int `yaml:"time_sync_poll_seconds"`
@ -172,6 +173,9 @@ func (c Config) Validate() error {
if c.Startup.APIPollSeconds <= 0 {
return fmt.Errorf("config.startup.api_poll_seconds must be > 0")
}
if c.Startup.ShutdownCooldownSeconds <= 0 {
return fmt.Errorf("config.startup.shutdown_cooldown_seconds must be > 0")
}
if c.Startup.TimeSyncWaitSeconds <= 0 {
return fmt.Errorf("config.startup.time_sync_wait_seconds must be > 0")
}
@ -305,6 +309,7 @@ func defaults() Config {
Startup: Startup{
APIWaitSeconds: 1200,
APIPollSeconds: 2,
ShutdownCooldownSeconds: 45,
RequireTimeSync: true,
TimeSyncWaitSeconds: 240,
TimeSyncPollSeconds: 5,
@ -394,6 +399,9 @@ func (c *Config) applyDefaults() {
if c.Startup.APIPollSeconds <= 0 {
c.Startup.APIPollSeconds = 2
}
if c.Startup.ShutdownCooldownSeconds <= 0 {
c.Startup.ShutdownCooldownSeconds = 45
}
if c.Startup.TimeSyncWaitSeconds <= 0 {
c.Startup.TimeSyncWaitSeconds = 240
}

View File

@ -106,11 +106,22 @@ state:
if cfg.Startup.VaultUnsealKeyFile == "" {
t.Fatalf("expected startup vault unseal key file default to be set")
}
if cfg.Startup.ShutdownCooldownSeconds <= 0 {
t.Fatalf("expected startup shutdown cooldown default > 0, got %d", cfg.Startup.ShutdownCooldownSeconds)
}
if cfg.Startup.VaultUnsealBreakglassTimeout <= 0 {
t.Fatalf("expected startup break-glass timeout default > 0, got %d", cfg.Startup.VaultUnsealBreakglassTimeout)
}
}
// TestValidateRejectsInvalidStartupShutdownCooldown ensures Validate refuses
// a non-positive startup shutdown_cooldown_seconds value.
func TestValidateRejectsInvalidStartupShutdownCooldown(t *testing.T) {
	cfg := defaults()
	cfg.Startup.ShutdownCooldownSeconds = 0

	err := cfg.Validate()
	if err == nil {
		t.Fatalf("expected validation error for invalid startup shutdown_cooldown_seconds")
	}
}
func TestValidateRejectsInvalidTimeSyncMode(t *testing.T) {
cfg := defaults()
cfg.Startup.TimeSyncMode = "invalid"

View File

@ -8,6 +8,10 @@ HECATE_CONFIG="${HECATE_CONFIG:-/etc/hecate/hecate.yaml}"
HECATE_COORDINATOR_RELAY="${HECATE_COORDINATOR_RELAY:-}"
LOG_DIR="${HECATE_DRILL_LOG_DIR:-/tmp/hecate-drills}"
STARTUP_TIMEOUT_SECONDS="${HECATE_DRILL_STARTUP_TIMEOUT_SECONDS:-1800}"
SHUTDOWN_TIMEOUT_SECONDS="${HECATE_DRILL_SHUTDOWN_TIMEOUT_SECONDS:-1800}"
SHUTDOWN_CONFIG="${HECATE_DRILL_SHUTDOWN_CONFIG:-/tmp/hecate-drill-no-poweroff.yaml}"
STARTUP_RETRY_DELAY_SECONDS="${HECATE_DRILL_STARTUP_RETRY_DELAY_SECONDS:-10}"
STARTUP_RETRY_MAX="${HECATE_DRILL_STARTUP_RETRY_MAX:-12}"
EXECUTE=0
usage() {
@ -21,6 +25,7 @@ Drills:
foundation-recovery Simulate vault/postgres/gitea outage and require layered restore.
reconciliation-resume Simulate global Flux suspend + source-controller down and require resume.
startup-intent-guard Assert startup is blocked when shutdown intent is active.
controlled-cycle Run full shutdown->startup recovery cycle (uses no-poweroff config).
Notes:
- Drills are intentionally disruptive and are not part of regular `make test`.
@ -74,6 +79,25 @@ wait_ready() {
"${KUBECTL}" -n "$ns" rollout status "${kind}/${name}" --timeout="${timeout}" >/dev/null
}
# Wait for the Keycloak workload in the sso namespace to become ready.
# Prefers deployment/keycloak and falls back to statefulset/keycloak; dies
# if neither exists. In plan (non-execute) mode only logs the intent.
# $1: rollout timeout passed through to wait_ready (e.g. "420s").
wait_ready_keycloak() {
  local timeout="$1"

  if [[ "${EXECUTE}" -eq 0 ]]; then
    log "plan: wait for sso keycloak rollout (${timeout}) [deployment preferred, fallback to statefulset]"
    return 0
  fi

  local kind
  for kind in deployment statefulset; do
    if "${KUBECTL}" -n sso get "${kind}" keycloak >/dev/null 2>&1; then
      wait_ready sso "${kind}" keycloak "${timeout}"
      return 0
    fi
  done

  die "keycloak workload not found in sso namespace (expected deployment/keycloak or statefulset/keycloak)"
}
run_hecate_startup() {
local reason="$1"
local cmd=(sudo "${HECATE_BIN}" startup --config "${HECATE_CONFIG}" --execute --force-flux-branch main --reason "${reason}")
@ -93,6 +117,48 @@ run_hecate_startup() {
fi
}
# Run `hecate shutdown` on the coordinator host over ssh, using the
# drill-specific config (SHUTDOWN_CONFIG) so the drill does not power the
# machines off. Bounded by SHUTDOWN_TIMEOUT_SECONDS.
# $1: reason string passed to hecate's --reason flag.
run_hecate_shutdown() {
local reason="$1"
local cmd=(sudo "${HECATE_BIN}" shutdown --config "${SHUTDOWN_CONFIG}" --execute --reason "${reason}")
# Plan mode: only log the ssh command that would be executed.
if [[ "${EXECUTE}" -eq 0 ]]; then
if [[ -n "${HECATE_COORDINATOR_RELAY}" ]]; then
log "plan: ssh ${HECATE_COORDINATOR_HOST} ${HECATE_COORDINATOR_RELAY} '${cmd[*]}'"
else
log "plan: ssh ${HECATE_COORDINATOR_HOST} '${cmd[*]}'"
fi
return 0
fi
if [[ -n "${HECATE_COORDINATOR_RELAY}" ]]; then
# Relay is intentionally left unquoted so a multi-word relay command
# splits into separate ssh arguments.
# shellcheck disable=SC2086
timeout "${SHUTDOWN_TIMEOUT_SECONDS}" ssh "${HECATE_COORDINATOR_HOST}" ${HECATE_COORDINATOR_RELAY} "${cmd[@]}"
else
timeout "${SHUTDOWN_TIMEOUT_SECONDS}" ssh "${HECATE_COORDINATOR_HOST}" "${cmd[@]}"
fi
}
# Retry `hecate startup` on the coordinator until it succeeds or
# STARTUP_RETRY_MAX attempts are exhausted, sleeping
# STARTUP_RETRY_DELAY_SECONDS between attempts. Dies on final failure.
# In plan (non-execute) mode only logs the retry-loop parameters.
# $1: reason string passed to hecate's --reason flag.
run_hecate_startup_with_retry() {
  local reason="$1"
  # Single-quote the reason: run_coordinator_bash re-evaluates this string
  # through a remote shell, so an unquoted reason containing spaces or
  # metacharacters would be word-split or interpreted there.
  local startup_cmd="sudo ${HECATE_BIN} startup --config ${HECATE_CONFIG} --execute --force-flux-branch main --reason '${reason}'"

  if [[ "${EXECUTE}" -eq 0 ]]; then
    log "plan: startup retry loop with max=${STARTUP_RETRY_MAX} delay=${STARTUP_RETRY_DELAY_SECONDS}s"
    return 0
  fi

  local attempt
  for attempt in $(seq 1 "${STARTUP_RETRY_MAX}"); do
    log "startup attempt ${attempt}/${STARTUP_RETRY_MAX}"
    if run_coordinator_bash "${startup_cmd}"; then
      return 0
    fi
    if [[ "${attempt}" -lt "${STARTUP_RETRY_MAX}" ]]; then
      log "startup attempt ${attempt} failed; retrying in ${STARTUP_RETRY_DELAY_SECONDS}s"
      sleep "${STARTUP_RETRY_DELAY_SECONDS}"
    fi
  done

  die "startup failed after ${STARTUP_RETRY_MAX} attempts"
}
run_coordinator_bash() {
local script="$1"
if [[ -n "${HECATE_COORDINATOR_RELAY}" ]]; then
@ -329,6 +395,35 @@ fi
log "pass: startup-intent-guard"
}
# Drill: run a full shutdown -> startup recovery cycle using the no-poweroff
# shutdown config, then verify every critical workload rolls back to ready.
run_drill_controlled_cycle() {
  CURRENT_RESOURCES=()
  ROLLBACK_FLUX_SUSPEND=0

  # The shutdown config must already exist (non-empty) on the coordinator.
  if [[ "${EXECUTE}" -eq 0 ]]; then
    log "plan: verify shutdown drill config exists on coordinator (${SHUTDOWN_CONFIG})"
  else
    run_coordinator_bash "[ -s '${SHUTDOWN_CONFIG}' ]" || die "shutdown drill config missing on coordinator: ${SHUTDOWN_CONFIG}"
  fi

  log "running controlled shutdown cycle (poweroff disabled config)"
  run_hecate_shutdown "drill-controlled-cycle-shutdown"

  log "running startup recovery cycle"
  run_hecate_startup_with_retry "drill-controlled-cycle-startup"

  log "verifying critical stack readiness after cycle"
  local spec
  for spec in \
    "flux-system deployment source-controller 240s" \
    "flux-system deployment kustomize-controller 240s" \
    "flux-system deployment helm-controller 240s" \
    "flux-system deployment notification-controller 240s" \
    "vault statefulset vault 420s" \
    "postgres statefulset postgres 420s" \
    "gitea deployment gitea 300s"; do
    # Word-splitting is intentional: each spec is "ns kind name timeout".
    # shellcheck disable=SC2086
    wait_ready ${spec}
  done
  wait_ready_keycloak 420s
  wait_ready maintenance deployment metis 300s

  log "pass: controlled-cycle"
}
main() {
need_cmd "${KUBECTL}"
need_cmd ssh
@ -375,6 +470,9 @@ main() {
startup-intent-guard)
run_drill_startup_intent_guard
;;
controlled-cycle)
run_drill_controlled_cycle
;;
*)
die "unknown drill: ${drill}"
;;