hecate: harden outage recovery startup and etcd restore

This commit is contained in:
Brad Stein 2026-04-04 20:50:58 -03:00
parent 19562d77f7
commit 5d8bfd5de6
9 changed files with 484 additions and 42 deletions

View File

@ -90,7 +90,7 @@ See `configs/hecate.example.yaml`.
UPS auto-shutdown trigger uses:
- runtime threshold = `runtime_safety_factor * estimated_shutdown_budget`
- default safety factor `1.10`
- default safety factor `1.25`
- debounce across multiple polls to avoid noise
Estimated shutdown budget is derived from historical successful shutdown runs (`/var/lib/hecate/runs.json`) with default fallback from config.
@ -106,6 +106,16 @@ Power metrics:
- `hecate-bootstrap.service` is enabled to run at host boot and perform staged startup automatically.
- `HECATE_ENABLE_BOOTSTRAP=1` forces bootstrap on, `HECATE_ENABLE_BOOTSTRAP=0` forces it off, and `auto` enables by default.
- `hecate-update.timer` runs on boot and periodically to pull latest `main` and reinstall Hecate declaratively.
- Peer startup fallback now checks coordinator intent/bootstrap activity before allowing local startup.
- Automatic etcd recovery can run during startup if API never becomes reachable (`startup.auto_etcd_restore_on_api_failure`).
## Etcd Recovery
- Manual: `hecate etcd-restore --config /etc/hecate/hecate.yaml --execute`
- Optional snapshot override: `--snapshot /var/lib/rancher/k3s/server/db/snapshots/<name>`
- Startup can automatically invoke the same restore path after API timeout using:
- `startup.auto_etcd_restore_on_api_failure: true`
- `startup.etcd_restore_control_plane: <control-plane-node>`
## Disruptive startup drills

View File

@ -2,6 +2,7 @@ package main
import (
"context"
"encoding/json"
"errors"
"flag"
"fmt"
@ -41,6 +42,11 @@ func main() {
logger.Printf("shutdown failed: %v", err)
os.Exit(1)
}
case "etcd-restore":
if err := runEtcdRestore(logger, os.Args[2:]); err != nil {
logger.Printf("etcd-restore failed: %v", err)
os.Exit(1)
}
case "daemon":
if err := runDaemon(logger, os.Args[2:]); err != nil {
logger.Printf("daemon failed: %v", err)
@ -96,6 +102,15 @@ func runStartup(logger *log.Logger, args []string) error {
logger.Printf("peer startup handoff complete; skipping local startup")
return nil
}
guardCtx, guardCancel := context.WithTimeout(context.Background(), time.Duration(maxInt(cfg.Coordination.CommandTimeoutSeconds, 15))*time.Second)
defer guardCancel()
allowed, guardReason, guardErr := coordinatorAllowsPeerFallbackStartup(guardCtx, cfg, logger)
if guardErr != nil {
return fmt.Errorf("startup blocked: unable to evaluate coordinator startup guard: %w", guardErr)
}
if !allowed {
return fmt.Errorf("startup blocked: coordinator guard disallowed peer fallback (%s)", guardReason)
}
logger.Printf("peer startup handoff unavailable; proceeding with local peer startup fallback")
allowPeer = true
} else {
@ -174,6 +189,26 @@ func runDaemon(logger *log.Logger, args []string) error {
return nil
}
// runEtcdRestore implements the `hecate etcd-restore` subcommand: it restores
// etcd from a snapshot on a selected control plane. The command is a dry run
// unless --execute is supplied; control plane and snapshot default to the
// configured restore control plane and the newest snapshot respectively.
func runEtcdRestore(logger *log.Logger, args []string) error {
	fs := flag.NewFlagSet("etcd-restore", flag.ExitOnError)
	configPath := fs.String("config", "/etc/hecate/hecate.yaml", "Path to config file")
	execute := fs.Bool("execute", false, "Actually execute restore (default dry-run)")
	controlPlane := fs.String("control-plane", "", "Control plane to run restore on (defaults to startup.etcd_restore_control_plane)")
	snapshotPath := fs.String("snapshot", "", "Explicit snapshot path (defaults to latest on selected control plane)")
	// ExitOnError makes Parse terminate the process on bad flags, so the
	// returned error is intentionally discarded.
	_ = fs.Parse(args)

	dryRun := !*execute
	_, orch, err := buildOrchestrator(logger, *configPath, dryRun)
	if err != nil {
		return err
	}

	ctx, cancel := context.WithCancel(context.Background())
	defer cancel()

	opts := cluster.EtcdRestoreOptions{
		ControlPlane: *controlPlane,
		SnapshotPath: *snapshotPath,
	}
	return orch.EtcdRestore(ctx, opts)
}
func runStatus(logger *log.Logger, args []string) error {
fs := flag.NewFlagSet("status", flag.ExitOnError)
configPath := fs.String("config", "/etc/hecate/hecate.yaml", "Path to config file")
@ -279,20 +314,61 @@ func buildUPSTargets(cfg config.Config) ([]service.Target, error) {
}
func ensureStartupPowerSafe(ctx context.Context, targets []service.Target) error {
onBatteryTargets := []string{}
type targetState struct {
seenGood bool
lastErr error
}
states := make(map[string]*targetState, len(targets))
for _, t := range targets {
sample, err := t.Provider.Read(ctx)
if err != nil {
return fmt.Errorf("startup blocked: unable to verify UPS target %s (%s): %w", t.Name, t.Target, err)
key := t.Name + "|" + t.Target
states[key] = &targetState{}
}
const pollInterval = 3 * time.Second
for {
onBatteryTargets := []string{}
allSeen := true
for _, t := range targets {
key := t.Name + "|" + t.Target
st := states[key]
sample, err := t.Provider.Read(ctx)
if err != nil {
st.lastErr = err
if !st.seenGood {
allSeen = false
}
continue
}
st.seenGood = true
st.lastErr = nil
if sample.OnBattery {
onBatteryTargets = append(onBatteryTargets, fmt.Sprintf("%s(status=%s runtime_s=%d)", t.Name, sample.RawStatus, sample.RuntimeSeconds))
}
}
if sample.OnBattery {
onBatteryTargets = append(onBatteryTargets, fmt.Sprintf("%s(status=%s runtime_s=%d)", t.Name, sample.RawStatus, sample.RuntimeSeconds))
if len(onBatteryTargets) > 0 {
return fmt.Errorf("startup blocked: UPS is on battery for %s", strings.Join(onBatteryTargets, ", "))
}
if allSeen {
return nil
}
select {
case <-ctx.Done():
unverified := make([]string, 0, len(targets))
for _, t := range targets {
key := t.Name + "|" + t.Target
st := states[key]
if st.seenGood {
continue
}
if st.lastErr != nil {
unverified = append(unverified, fmt.Sprintf("%s(%s): %v", t.Name, t.Target, st.lastErr))
} else {
unverified = append(unverified, fmt.Sprintf("%s(%s): no telemetry sample yet", t.Name, t.Target))
}
}
return fmt.Errorf("startup blocked: unable to verify UPS telemetry before timeout: %s", strings.Join(unverified, " | "))
case <-time.After(pollInterval):
}
}
if len(onBatteryTargets) > 0 {
return fmt.Errorf("startup blocked: UPS is on battery for %s", strings.Join(onBatteryTargets, ", "))
}
return nil
}
func buildOrchestrator(logger *log.Logger, cfgPath string, dryRun bool) (config.Config, *cluster.Orchestrator, error) {
@ -322,6 +398,7 @@ Usage:
Commands:
startup Perform staged cluster startup
shutdown Perform graceful cluster shutdown
etcd-restore Restore etcd from snapshot on a control plane
daemon Monitor UPS and auto-trigger shutdown
status Print current hecate status and estimates
intent Read or manually set intent state
@ -329,6 +406,7 @@ Commands:
Examples:
hecate startup --config /etc/hecate/hecate.yaml --execute --force-flux-branch main
hecate shutdown --config /etc/hecate/hecate.yaml --execute --reason "manual-maintenance"
hecate etcd-restore --config /etc/hecate/hecate.yaml --execute
hecate daemon --config /etc/hecate/hecate.yaml
hecate status --config /etc/hecate/hecate.yaml
hecate intent --config /etc/hecate/hecate.yaml --set normal --reason "manual-clear" --execute
@ -358,30 +436,7 @@ func tryPeerBootstrapHandoff(ctx context.Context, cfg config.Config, logger *log
target = user + "@" + host
}
args := []string{
"-o", "BatchMode=yes",
"-o", "ConnectTimeout=8",
"-o", "StrictHostKeyChecking=accept-new",
}
if cfgPath := resolveSSHConfigFile(cfg); cfgPath != "" {
args = append(args, "-F", cfgPath)
}
if idPath := resolveSSHIdentityFile(cfg); idPath != "" {
args = append(args, "-i", idPath)
}
if cfg.SSHPort > 0 {
args = append(args, "-p", strconv.Itoa(cfg.SSHPort))
}
if cfg.SSHJumpHost != "" {
jump := cfg.SSHJumpHost
if cfg.SSHJumpUser != "" {
jump = cfg.SSHJumpUser + "@" + jump
}
if cfg.SSHPort > 0 && !strings.Contains(jump, ":") {
jump = fmt.Sprintf("%s:%d", jump, cfg.SSHPort)
}
args = append(args, "-J", jump)
}
args := buildSSHBaseArgs(cfg)
remote := "sudo -n systemctl start hecate-bootstrap.service"
attempt := 1
@ -409,6 +464,116 @@ func tryPeerBootstrapHandoff(ctx context.Context, cfg config.Config, logger *log
}
}
// coordinatorAllowsPeerFallbackStartup consults the configured coordinator
// before this peer performs a local fallback startup. It returns
// (allowed, reason, err): allowed=false blocks startup with the given reason;
// a non-nil err is returned only when the coordinator's intent payload cannot
// be decoded. An unreachable coordinator is treated as fail-open (allowed)
// so a dead coordinator cannot wedge peer recovery.
func coordinatorAllowsPeerFallbackStartup(ctx context.Context, cfg config.Config, logger *log.Logger) (bool, string, error) {
// No coordinator configured: nothing to guard against.
coordinator := strings.TrimSpace(cfg.Coordination.ForwardShutdownHost)
if coordinator == "" {
return true, "no coordinator configured", nil
}
// Resolve the SSH user: explicit forward-shutdown user, then a per-node
// override, then the global SSH user.
user := strings.TrimSpace(cfg.Coordination.ForwardShutdownUser)
if user == "" {
if override, ok := cfg.SSHNodeUsers[coordinator]; ok && strings.TrimSpace(override) != "" {
user = strings.TrimSpace(override)
} else {
user = strings.TrimSpace(cfg.SSHUser)
}
}
// Map the logical coordinator name to a concrete host when an override
// exists in ssh_node_hosts.
host := coordinator
if mapped, ok := cfg.SSHNodeHosts[coordinator]; ok && strings.TrimSpace(mapped) != "" {
host = strings.TrimSpace(mapped)
}
target := host
if user != "" {
target = user + "@" + host
}
// One SSH round trip: print a bootstrap-activity marker, then the
// coordinator's intent JSON (or "{}" when the intent file is absent/empty).
remoteCmd := "sudo -n sh -lc 'if systemctl is-active --quiet hecate-bootstrap.service; then echo __HECATE_BOOTSTRAP_ACTIVE__; else echo __HECATE_BOOTSTRAP_IDLE__; fi; if [ -s /var/lib/hecate/intent.json ]; then cat /var/lib/hecate/intent.json; else echo \"{}\"; fi'"
args := append(buildSSHBaseArgs(cfg), target, remoteCmd)
cmd := exec.CommandContext(ctx, "ssh", args...)
out, err := cmd.CombinedOutput()
if err != nil {
// Fail open: log the failure (with any captured output) and allow the
// peer fallback rather than blocking startup on an unreachable node.
trimmed := strings.TrimSpace(string(out))
if trimmed == "" {
logger.Printf("warning: coordinator guard check unavailable on %s: %v; allowing peer fallback startup", coordinator, err)
} else {
logger.Printf("warning: coordinator guard check unavailable on %s: %v: %s; allowing peer fallback startup", coordinator, err, trimmed)
}
return true, "coordinator unreachable", nil
}
trimmed := strings.TrimSpace(string(out))
// If the coordinator's own bootstrap is running, defer to it.
if strings.Contains(trimmed, "__HECATE_BOOTSTRAP_ACTIVE__") {
return false, "coordinator bootstrap service is active", nil
}
// Extract the JSON object between the first '{' and the last '}' to
// tolerate sudo/motd/marker noise around the payload.
start := strings.Index(trimmed, "{")
end := strings.LastIndex(trimmed, "}")
if start < 0 || end < start {
return false, "coordinator intent payload missing", nil
}
rawIntent := trimmed[start : end+1]
var remoteIntent state.Intent
if err := json.Unmarshal([]byte(rawIntent), &remoteIntent); err != nil {
return false, "", fmt.Errorf("decode coordinator intent: %w", err)
}
// Empty or normal intent means no operation is in flight.
if remoteIntent.State == "" || remoteIntent.State == state.IntentNormal {
return true, "coordinator intent is normal", nil
}
// Staleness window for blocking intents; configured value with a 60s floor.
guardAge := time.Duration(maxInt(cfg.Coordination.StartupGuardMaxAgeSec, 60)) * time.Second
intentAge := time.Duration(0)
if !remoteIntent.UpdatedAt.IsZero() {
intentAge = time.Since(remoteIntent.UpdatedAt)
}
switch remoteIntent.State {
case state.IntentShuttingDown:
// Block on a fresh (or unaged) shutdown intent; treat old ones as stale.
if remoteIntent.UpdatedAt.IsZero() || intentAge <= guardAge {
return false, fmt.Sprintf("coordinator intent=%s age=%s reason=%q", remoteIntent.State, intentAge.Round(time.Second), remoteIntent.Reason), nil
}
logger.Printf("warning: coordinator shutdown intent appears stale (age=%s > guard=%s); allowing peer fallback startup", intentAge.Round(time.Second), guardAge)
return true, "coordinator shutdown intent stale", nil
case state.IntentStartupInProgress:
// Same freshness rule for an in-progress coordinator startup.
if remoteIntent.UpdatedAt.IsZero() || intentAge <= guardAge {
return false, fmt.Sprintf("coordinator intent=%s age=%s reason=%q", remoteIntent.State, intentAge.Round(time.Second), remoteIntent.Reason), nil
}
logger.Printf("warning: coordinator startup intent appears stale (age=%s > guard=%s); allowing peer fallback startup", intentAge.Round(time.Second), guardAge)
return true, "coordinator startup intent stale", nil
case state.IntentShutdownComplete:
// Short fixed cooldown after a completed shutdown; unknown age blocks.
if remoteIntent.UpdatedAt.IsZero() {
return false, "coordinator reported shutdown_complete with unknown age", nil
}
if intentAge <= 45*time.Second {
return false, fmt.Sprintf("coordinator recently completed shutdown (%s ago)", intentAge.Round(time.Second)), nil
}
return true, "coordinator shutdown_complete is old enough", nil
default:
// Unknown states block conservatively.
return false, fmt.Sprintf("coordinator intent state %q is unknown", remoteIntent.State), nil
}
}
// buildSSHBaseArgs returns the ssh CLI arguments shared by every remote
// invocation: non-interactive batch mode plus any configured ssh config
// file, identity file, port, and jump host.
func buildSSHBaseArgs(cfg config.Config) []string {
	base := []string{
		"-o", "BatchMode=yes",
		"-o", "ConnectTimeout=8",
		"-o", "StrictHostKeyChecking=accept-new",
	}
	if sshConfig := resolveSSHConfigFile(cfg); sshConfig != "" {
		base = append(base, "-F", sshConfig)
	}
	if identity := resolveSSHIdentityFile(cfg); identity != "" {
		base = append(base, "-i", identity)
	}
	if cfg.SSHPort > 0 {
		base = append(base, "-p", strconv.Itoa(cfg.SSHPort))
	}
	if cfg.SSHJumpHost != "" {
		// Build the ProxyJump spec: optional user@ prefix, and carry the
		// configured port unless the spec already contains one.
		spec := cfg.SSHJumpHost
		if cfg.SSHJumpUser != "" {
			spec = cfg.SSHJumpUser + "@" + spec
		}
		if cfg.SSHPort > 0 && !strings.Contains(spec, ":") {
			spec = fmt.Sprintf("%s:%d", spec, cfg.SSHPort)
		}
		base = append(base, "-J", spec)
	}
	return base
}
func resolveSSHConfigFile(cfg config.Config) string {
if strings.TrimSpace(cfg.SSHConfigFile) != "" {
return strings.TrimSpace(cfg.SSHConfigFile)

View File

@ -42,6 +42,8 @@ excluded_namespaces:
startup:
api_wait_seconds: 1200
api_poll_seconds: 2
auto_etcd_restore_on_api_failure: true
etcd_restore_control_plane: titan-0a
shutdown:
default_budget_seconds: 1380
skip_etcd_snapshot: false
@ -71,6 +73,7 @@ coordination:
forward_shutdown_config: /etc/hecate/hecate.yaml
fallback_local_shutdown: true
command_timeout_seconds: 25
startup_guard_max_age_seconds: 900
role: coordinator
allow_startup_on_battery: false
metrics:

View File

@ -35,11 +35,23 @@ ssh_managed_nodes:
- titan-0a
- titan-0b
- titan-0c
- titan-04
- titan-05
- titan-06
- titan-07
- titan-08
- titan-09
- titan-10
- titan-11
- titan-12
- titan-13
- titan-14
- titan-15
- titan-17
- titan-18
- titan-19
- titan-20
- titan-21
- titan-22
- titan-24
ssh_jump_host: ""
@ -53,6 +65,15 @@ control_planes:
workers: []
local_bootstrap_paths:
- infrastructure/core
- clusters/atlas/flux-system
- infrastructure/sources/helm
- infrastructure/metallb
- infrastructure/traefik
- infrastructure/vault-csi
- infrastructure/vault-injector
- services/vault
- infrastructure/postgres
- services/gitea
excluded_namespaces:
- kube-system
- kube-public
@ -68,6 +89,8 @@ excluded_namespaces:
startup:
api_wait_seconds: 1200
api_poll_seconds: 2
auto_etcd_restore_on_api_failure: true
etcd_restore_control_plane: titan-0a
shutdown:
default_budget_seconds: 1380
skip_etcd_snapshot: false
@ -95,6 +118,7 @@ coordination:
forward_shutdown_config: /etc/hecate/hecate.yaml
fallback_local_shutdown: false
command_timeout_seconds: 25
startup_guard_max_age_seconds: 900
role: peer
allow_startup_on_battery: false
metrics:

View File

@ -35,12 +35,25 @@ ssh_managed_nodes:
- titan-0a
- titan-0b
- titan-0c
- titan-04
- titan-05
- titan-06
- titan-07
- titan-08
- titan-09
- titan-10
- titan-11
- titan-12
- titan-13
- titan-14
- titan-15
- titan-17
- titan-18
- titan-19
- titan-20
- titan-21
- titan-22
- titan-24
ssh_jump_host: ""
ssh_jump_user: ""
iac_repo_path: /opt/titan-iac
@ -76,6 +89,8 @@ excluded_namespaces:
startup:
api_wait_seconds: 1200
api_poll_seconds: 2
auto_etcd_restore_on_api_failure: true
etcd_restore_control_plane: titan-0a
shutdown:
default_budget_seconds: 1380
skip_etcd_snapshot: false
@ -104,6 +119,7 @@ coordination:
forward_shutdown_config: /etc/hecate/hecate.yaml
fallback_local_shutdown: true
command_timeout_seconds: 25
startup_guard_max_age_seconds: 900
role: coordinator
allow_startup_on_battery: false
metrics:

View File

@ -39,6 +39,11 @@ type ShutdownOptions struct {
Reason string
}
// EtcdRestoreOptions selects where and from what snapshot
// Orchestrator.EtcdRestore runs.
type EtcdRestoreOptions struct {
// ControlPlane is the node to run the restore on; empty means the first
// configured control plane is used.
ControlPlane string
// SnapshotPath is an explicit snapshot file path; empty means the latest
// snapshot found on the selected control plane is used.
SnapshotPath string
}
type startupWorkload struct {
Namespace string
Kind string
@ -121,7 +126,20 @@ func (o *Orchestrator) Startup(ctx context.Context, opts StartupOptions) (err er
apiAttempts = 1
}
if err := o.waitForAPI(ctx, apiAttempts, apiPoll); err != nil {
return err
if !o.cfg.Startup.AutoEtcdRestoreOnAPIFailure {
return err
}
cp := strings.TrimSpace(o.cfg.Startup.EtcdRestoreControlPlane)
if cp == "" && len(o.cfg.ControlPlanes) > 0 {
cp = o.cfg.ControlPlanes[0]
}
o.log.Printf("warning: initial API wait failed (%v); attempting automatic etcd restore on %s", err, cp)
if restoreErr := o.EtcdRestore(ctx, EtcdRestoreOptions{ControlPlane: cp}); restoreErr != nil {
return fmt.Errorf("kubernetes API did not become reachable and automatic etcd restore failed: %w", restoreErr)
}
if err := o.waitForAPI(ctx, apiAttempts, apiPoll); err != nil {
return fmt.Errorf("kubernetes API did not become reachable after automatic etcd restore: %w", err)
}
}
workers, err := o.effectiveWorkers(ctx)
@ -200,6 +218,72 @@ func (o *Orchestrator) Startup(ctx context.Context, opts StartupOptions) (err er
return nil
}
// EtcdRestore restores etcd from a snapshot by stopping k3s on all control
// planes, running `k3s server --cluster-reset --cluster-reset-restore-path`
// on one selected control plane, restarting k3s there, and then best-effort
// restarting k3s on the remaining control planes. The target defaults to the
// first configured control plane and the snapshot to the newest one on that
// node. In dry-run mode only the resolved target/snapshot are logged.
//
// Fix: the post-restore settle delay was an unconditional time.Sleep that
// ignored ctx cancellation; it now aborts promptly when ctx is done.
func (o *Orchestrator) EtcdRestore(ctx context.Context, opts EtcdRestoreOptions) error {
	// Pick the target control plane: explicit option first, else the first
	// configured control plane.
	controlPlane := strings.TrimSpace(opts.ControlPlane)
	if controlPlane == "" {
		if len(o.cfg.ControlPlanes) == 0 {
			return fmt.Errorf("cannot restore etcd: no control planes configured")
		}
		controlPlane = o.cfg.ControlPlanes[0]
	}
	// The target must be a configured control plane and SSH-managed.
	found := false
	for _, cp := range o.cfg.ControlPlanes {
		if cp == controlPlane {
			found = true
			break
		}
	}
	if !found {
		return fmt.Errorf("cannot restore etcd: control plane %s is not in configured control_planes", controlPlane)
	}
	if !o.sshManaged(controlPlane) {
		return fmt.Errorf("cannot restore etcd on %s: node not in ssh_managed_nodes", controlPlane)
	}
	// Resolve the snapshot: explicit path wins, otherwise the newest
	// snapshot present on the target node.
	snapshotPath := strings.TrimSpace(opts.SnapshotPath)
	if snapshotPath == "" {
		resolved, err := o.latestEtcdSnapshotPath(ctx, controlPlane)
		if err != nil {
			return err
		}
		snapshotPath = resolved
	}
	o.log.Printf("etcd restore target=%s snapshot=%s", controlPlane, snapshotPath)
	if o.runner.DryRun {
		return nil
	}
	// Quiesce etcd: best-effort stop of k3s on every control plane before
	// the cluster reset.
	for _, cp := range o.cfg.ControlPlanes {
		cp := cp // capture for the closure (pre-Go 1.22 loop semantics)
		o.bestEffort("stop k3s before etcd restore on "+cp, func() error {
			_, err := o.ssh(ctx, cp, "sudo systemctl stop k3s || true")
			return err
		})
	}
	restoreCmd := fmt.Sprintf("sudo k3s server --cluster-reset --cluster-reset-restore-path %q", snapshotPath)
	if _, err := o.ssh(ctx, controlPlane, restoreCmd); err != nil {
		return fmt.Errorf("etcd restore command failed on %s: %w", controlPlane, err)
	}
	o.log.Printf("etcd restore command completed on %s", controlPlane)
	if _, err := o.ssh(ctx, controlPlane, "sudo systemctl start k3s || true"); err != nil {
		return fmt.Errorf("failed to start k3s on restore control plane %s: %w", controlPlane, err)
	}
	// Give the restored server a moment to come up before rejoining peers,
	// but honor cancellation instead of sleeping unconditionally.
	select {
	case <-ctx.Done():
		return ctx.Err()
	case <-time.After(10 * time.Second):
	}
	// Best-effort restart of the remaining control planes.
	for _, cp := range o.cfg.ControlPlanes {
		cp := cp
		if cp == controlPlane {
			continue
		}
		o.bestEffort("start k3s after etcd restore on "+cp, func() error {
			_, err := o.ssh(ctx, cp, "sudo systemctl start k3s || true")
			return err
		})
	}
	return nil
}
func (o *Orchestrator) Shutdown(ctx context.Context, opts ShutdownOptions) (err error) {
unlock, err := state.AcquireLock(o.cfg.State.LockPath)
if err != nil {
@ -731,6 +815,22 @@ func (o *Orchestrator) takeEtcdSnapshot(ctx context.Context, node string) error
return err
}
// latestEtcdSnapshotPath returns the newest etcd snapshot file present on the
// given node's k3s snapshot directory, or an error when the node is not
// SSH-managed, the remote listing fails, or no snapshots exist.
func (o *Orchestrator) latestEtcdSnapshotPath(ctx context.Context, node string) (string, error) {
	if !o.sshManaged(node) {
		return "", fmt.Errorf("cannot resolve etcd snapshot on %s: node not in ssh_managed_nodes", node)
	}
	// Newest-first listing; head -n 1 picks the most recent snapshot.
	const listCmd = `sudo sh -lc 'ls -1t /var/lib/rancher/k3s/server/db/snapshots/* 2>/dev/null | head -n 1'`
	raw, err := o.ssh(ctx, node, listCmd)
	if err != nil {
		return "", fmt.Errorf("resolve latest etcd snapshot on %s: %w", node, err)
	}
	latest := strings.TrimSpace(raw)
	if latest == "" {
		return "", fmt.Errorf("no etcd snapshots found on %s under /var/lib/rancher/k3s/server/db/snapshots", node)
	}
	return latest, nil
}
func (o *Orchestrator) waitForAPI(ctx context.Context, attempts int, sleep time.Duration) error {
if o.runner.DryRun {
return nil

View File

@ -33,8 +33,10 @@ type Config struct {
}
type Startup struct {
APIWaitSeconds int `yaml:"api_wait_seconds"`
APIPollSeconds int `yaml:"api_poll_seconds"`
APIWaitSeconds int `yaml:"api_wait_seconds"`
APIPollSeconds int `yaml:"api_poll_seconds"`
AutoEtcdRestoreOnAPIFailure bool `yaml:"auto_etcd_restore_on_api_failure"`
EtcdRestoreControlPlane string `yaml:"etcd_restore_control_plane"`
}
type Shutdown struct {
@ -72,6 +74,7 @@ type Coordination struct {
ForwardShutdownConfig string `yaml:"forward_shutdown_config"`
FallbackLocalShutdown bool `yaml:"fallback_local_shutdown"`
CommandTimeoutSeconds int `yaml:"command_timeout_seconds"`
StartupGuardMaxAgeSec int `yaml:"startup_guard_max_age_seconds"`
Role string `yaml:"role"`
AllowStartupOnBattery bool `yaml:"allow_startup_on_battery"`
}
@ -135,6 +138,18 @@ func (c Config) Validate() error {
if c.Startup.APIPollSeconds <= 0 {
return fmt.Errorf("config.startup.api_poll_seconds must be > 0")
}
if c.Startup.EtcdRestoreControlPlane != "" {
found := false
for _, cp := range c.ControlPlanes {
if cp == c.Startup.EtcdRestoreControlPlane {
found = true
break
}
}
if !found {
return fmt.Errorf("config.startup.etcd_restore_control_plane must be one of config.control_planes when set")
}
}
if c.SSHPort <= 0 || c.SSHPort > 65535 {
return fmt.Errorf("config.ssh_port must be in range 1-65535")
}
@ -156,6 +171,9 @@ func (c Config) Validate() error {
return fmt.Errorf("config.coordination.forward_shutdown_config must not be empty when forward_shutdown_host is set")
}
}
if c.Coordination.StartupGuardMaxAgeSec <= 0 {
return fmt.Errorf("config.coordination.startup_guard_max_age_seconds must be > 0")
}
if c.Coordination.Role != "coordinator" && c.Coordination.Role != "peer" {
return fmt.Errorf("config.coordination.role must be coordinator or peer")
}
@ -200,8 +218,10 @@ func defaults() Config {
"maintenance",
},
Startup: Startup{
APIWaitSeconds: 1200,
APIPollSeconds: 2,
APIWaitSeconds: 1200,
APIPollSeconds: 2,
AutoEtcdRestoreOnAPIFailure: true,
EtcdRestoreControlPlane: "titan-0a",
},
Shutdown: Shutdown{
DefaultBudgetSeconds: 1380,
@ -224,6 +244,7 @@ func defaults() Config {
ForwardShutdownConfig: "/etc/hecate/hecate.yaml",
FallbackLocalShutdown: true,
CommandTimeoutSeconds: 25,
StartupGuardMaxAgeSec: 900,
Role: "coordinator",
AllowStartupOnBattery: false,
},
@ -256,6 +277,9 @@ func (c *Config) applyDefaults() {
if c.Startup.APIPollSeconds <= 0 {
c.Startup.APIPollSeconds = 2
}
if c.Startup.EtcdRestoreControlPlane == "" && len(c.ControlPlanes) > 0 {
c.Startup.EtcdRestoreControlPlane = c.ControlPlanes[0]
}
if c.SSHPort <= 0 {
c.SSHPort = 2277
}
@ -292,6 +316,9 @@ func (c *Config) applyDefaults() {
if c.Coordination.CommandTimeoutSeconds <= 0 {
c.Coordination.CommandTimeoutSeconds = 25
}
if c.Coordination.StartupGuardMaxAgeSec <= 0 {
c.Coordination.StartupGuardMaxAgeSec = 900
}
if c.Coordination.Role == "" {
c.Coordination.Role = "coordinator"
}

View File

@ -55,3 +55,41 @@ func TestValidateRejectsUnknownRole(t *testing.T) {
t.Fatalf("expected validation error for unknown coordination role")
}
}
// TestValidateRejectsUnknownEtcdRestoreControlPlane ensures Validate fails
// when startup.etcd_restore_control_plane names a node that is not listed in
// control_planes.
func TestValidateRejectsUnknownEtcdRestoreControlPlane(t *testing.T) {
	cfg := defaults()
	cfg.Startup.EtcdRestoreControlPlane = "titan-missing"
	err := cfg.Validate()
	if err == nil {
		t.Fatalf("expected validation error for unknown etcd restore control plane")
	}
}
// TestLoadSetsCoordinationGuardDefaults verifies that Load applies defaults
// when a config file omits coordination.startup_guard_max_age_seconds and
// startup.etcd_restore_control_plane: the guard age must default to a
// positive value and the restore control plane must be populated (from
// control_planes).
func TestLoadSetsCoordinationGuardDefaults(t *testing.T) {
tmp := t.TempDir()
cfgPath := filepath.Join(tmp, "hecate.yaml")
raw := `
control_planes: [titan-0a, titan-0b, titan-0c]
expected_flux_branch: main
iac_repo_path: /opt/titan-iac
coordination:
role: coordinator
ups:
enabled: false
state:
run_history_path: /tmp/runs.json
lock_path: /tmp/hecate.lock
`
if err := os.WriteFile(cfgPath, []byte(strings.TrimSpace(raw)), 0o644); err != nil {
t.Fatalf("write config: %v", err)
}
cfg, err := Load(cfgPath)
if err != nil {
t.Fatalf("load config: %v", err)
}
// Defaults applied by Load/applyDefaults, not present in the raw YAML.
if cfg.Coordination.StartupGuardMaxAgeSec <= 0 {
t.Fatalf("expected startup guard max age default > 0, got %d", cfg.Coordination.StartupGuardMaxAgeSec)
}
if cfg.Startup.EtcdRestoreControlPlane == "" {
t.Fatalf("expected startup etcd restore control plane default to be set")
}
}

View File

@ -186,6 +186,23 @@ migrate_hecate_config() {
echo "[install] migrated ssh_node_users titan-24 override to atlas"
changed=1
fi
if grep -Eq '^ command_timeout_seconds:[[:space:]]*[0-9]+' "${CONF_DIR}/hecate.yaml" \
&& ! grep -Eq '^ startup_guard_max_age_seconds:[[:space:]]*[0-9]+' "${CONF_DIR}/hecate.yaml"; then
sed -Ei '/^ command_timeout_seconds:[[:space:]]*[0-9]+/a\ startup_guard_max_age_seconds: 900' "${CONF_DIR}/hecate.yaml"
echo "[install] added coordination.startup_guard_max_age_seconds=900"
changed=1
fi
local default_restore_cp
default_restore_cp="$(first_control_plane_name)"
if [[ -z "${default_restore_cp}" ]]; then
default_restore_cp="titan-0a"
fi
if grep -Eq '^ api_poll_seconds:[[:space:]]*[0-9]+' "${CONF_DIR}/hecate.yaml" \
&& ! grep -Eq '^ auto_etcd_restore_on_api_failure:[[:space:]]*(true|false)' "${CONF_DIR}/hecate.yaml"; then
sed -Ei '/^ api_poll_seconds:[[:space:]]*[0-9]+/a\ auto_etcd_restore_on_api_failure: true\n etcd_restore_control_plane: '"${default_restore_cp}"'' "${CONF_DIR}/hecate.yaml"
echo "[install] added startup.auto_etcd_restore_on_api_failure + startup.etcd_restore_control_plane defaults"
changed=1
fi
local role
role="$(read_hecate_role)"
@ -221,12 +238,25 @@ migrate_hecate_config() {
- titan-0a
- titan-0b
- titan-0c
- titan-04
- titan-05
- titan-06
- titan-07
- titan-08
- titan-09
- titan-10
- titan-11
- titan-12
- titan-13
- titan-14
- titan-15
- titan-17
- titan-18
- titan-22'
- titan-19
- titan-20
- titan-21
- titan-22
- titan-24'
elif [[ "${role}" == "peer" ]]; then
inventory_block='ssh_node_hosts:
titan-db: 192.168.22.10
@ -257,11 +287,23 @@ migrate_hecate_config() {
- titan-0a
- titan-0b
- titan-0c
- titan-04
- titan-05
- titan-06
- titan-07
- titan-08
- titan-09
- titan-10
- titan-11
- titan-12
- titan-13
- titan-14
- titan-15
- titan-17
- titan-18
- titan-19
- titan-20
- titan-21
- titan-22
- titan-24'
fi
@ -280,6 +322,11 @@ migrate_hecate_config() {
echo "[install] hydrated ssh_managed_nodes inventory for role=${role}"
changed=1
fi
if ! grep -Eq '^ - titan-04$' "${CONF_DIR}/hecate.yaml" || ! grep -Eq '^ - titan-21$' "${CONF_DIR}/hecate.yaml"; then
perl -0pi -e 's#ssh_managed_nodes:\n(?: - .*\n)*#'"${managed_block}"'\n#s' "${CONF_DIR}/hecate.yaml"
echo "[install] refreshed ssh_managed_nodes coverage for role=${role}"
changed=1
fi
fi
if [[ "${role}" == "peer" ]]; then
@ -287,10 +334,22 @@ migrate_hecate_config() {
&& grep -Eq '^ - titan-db$' "${CONF_DIR}/hecate.yaml" \
&& grep -Eq '^ - titan-24$' "${CONF_DIR}/hecate.yaml" \
&& ! grep -Eq '^ - titan-0a$' "${CONF_DIR}/hecate.yaml"; then
perl -0pi -e 's#ssh_managed_nodes:\n - titan-db\n - titan-24\n#ssh_managed_nodes:\n - titan-db\n - titan-0a\n - titan-0b\n - titan-0c\n - titan-12\n - titan-14\n - titan-15\n - titan-17\n - titan-18\n - titan-22\n - titan-24\n#s' "${CONF_DIR}/hecate.yaml"
perl -0pi -e 's#ssh_managed_nodes:\n - titan-db\n - titan-24\n#ssh_managed_nodes:\n - titan-db\n - titan-0a\n - titan-0b\n - titan-0c\n - titan-04\n - titan-05\n - titan-06\n - titan-07\n - titan-08\n - titan-09\n - titan-10\n - titan-11\n - titan-12\n - titan-13\n - titan-14\n - titan-15\n - titan-17\n - titan-18\n - titan-19\n - titan-20\n - titan-21\n - titan-22\n - titan-24\n#s' "${CONF_DIR}/hecate.yaml"
echo "[install] expanded peer ssh_managed_nodes for bootstrap fallback coverage"
changed=1
fi
if ! grep -Eq '^ - services/gitea$' "${CONF_DIR}/hecate.yaml"; then
perl -0pi -e 's#local_bootstrap_paths:\n(?: - .*\n)*#local_bootstrap_paths:\n - infrastructure/core\n - clusters/atlas/flux-system\n - infrastructure/sources/helm\n - infrastructure/metallb\n - infrastructure/traefik\n - infrastructure/vault-csi\n - infrastructure/vault-injector\n - services/vault\n - infrastructure/postgres\n - services/gitea\n#s' "${CONF_DIR}/hecate.yaml"
echo "[install] refreshed peer local_bootstrap_paths for full fallback bootstrap parity"
changed=1
fi
if perl -0777 -ne 'exit(!(/local_bootstrap_paths:\n - infrastructure\/core\n/s))' "${CONF_DIR}/hecate.yaml"; then
perl -0pi -e 's#local_bootstrap_paths:\n - infrastructure/core\n#local_bootstrap_paths:\n - infrastructure/core\n - clusters/atlas/flux-system\n - infrastructure/sources/helm\n - infrastructure/metallb\n - infrastructure/traefik\n - infrastructure/vault-csi\n - infrastructure/vault-injector\n - services/vault\n - infrastructure/postgres\n - services/gitea\n#s' "${CONF_DIR}/hecate.yaml"
echo "[install] expanded peer local_bootstrap_paths for full fallback bootstrap parity"
changed=1
fi
fi
if [[ "${changed}" -eq 1 ]]; then