harden startup guards and etcd restore validation
parent 437a6b62cd
commit 1935c5eb3f
@ -26,6 +26,8 @@ Key startup guards:
- `--auto-peer-failover` makes peer hosts hand off startup to the coordinator first, then run local startup only if the coordinator is unreachable.
- Startup is blocked while the UPS is on battery by default, unless `--allow-on-battery` or `coordination.allow_startup_on_battery: true` is set (see the sketch after this list).
- Startup is blocked when a shutdown intent is active (`/var/lib/hecate/intent.json`).
- Stale shutdown intents are auto-cleared after `coordination.startup_guard_max_age_seconds`, so residue from an old outage cannot permanently deadlock startup.
- Startup checks the intents of the configured `coordination.peer_hosts` to avoid peer/coordinator split-brain startup races.
- Startup waits for time sync in `strict` or `quorum` mode (`startup.time_sync_mode`, `startup.time_sync_quorum`).
- Startup can block until storage is healthy (`startup.require_storage_ready` + critical PVC checks).
- Startup can block until external probes pass (`startup.require_post_start_probes` + `startup.post_start_probes`).
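A minimal sketch of the on-battery precedence described above: either the flag or the config key lifts the block, and both default to off. `guardBattery` and its boolean wiring are illustrative assumptions, not the real implementation.

```go
package main

import (
	"errors"
	"fmt"
)

// guardBattery blocks startup while on battery unless an override is set.
// flagAllow stands in for --allow-on-battery; cfgAllow stands in for
// coordination.allow_startup_on_battery.
func guardBattery(onBattery, flagAllow, cfgAllow bool) error {
	if onBattery && !flagAllow && !cfgAllow {
		return errors.New("startup blocked: UPS is on battery")
	}
	return nil
}

func main() {
	fmt.Println(guardBattery(true, false, false)) // blocked
	fmt.Println(guardBattery(true, true, false))  // allowed via flag override
}
```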
@ -117,6 +119,7 @@ Power metrics:
- Default behavior for `startup` and `shutdown` is dry-run unless `--execute` is set.
- Hecate tracks intent in `/var/lib/hecate/intent.json` (`normal`, `startup_in_progress`, `shutting_down`, `shutdown_complete`) so that startup and shutdown do not fight each other (see the sketch after this list).
- In multi-instance setups, set `coordination.peer_hosts` on each host (for example `titan-db` <-> `titan-24`) so startup guards account for remote intent too.
- `hecate-bootstrap.service` is enabled to run at host boot and perform staged startup automatically.
- `HECATE_ENABLE_BOOTSTRAP=1` forces bootstrap on, `HECATE_ENABLE_BOOTSTRAP=0` forces it off, and `auto` enables it by default.
- `hecate-update.timer` runs at boot and periodically thereafter to pull the latest `main` and reinstall Hecate declaratively.
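A minimal sketch of the intent file's shape, using the four states listed above. The struct mirrors the fields the orchestrator code below reads (`State`, `Reason`, `UpdatedAt`), but the JSON field names are assumptions; the real definitions live in the `state` package.

```go
package main

import (
	"encoding/json"
	"fmt"
	"time"
)

// The four intent states named above.
const (
	IntentNormal            = "normal"
	IntentStartupInProgress = "startup_in_progress"
	IntentShuttingDown      = "shutting_down"
	IntentShutdownComplete  = "shutdown_complete"
)

// Intent is a local stand-in for state.Intent; JSON tags are assumed.
type Intent struct {
	State     string    `json:"state"`
	Reason    string    `json:"reason"`
	UpdatedAt time.Time `json:"updated_at"`
}

func main() {
	b, _ := json.MarshalIndent(Intent{
		State:     IntentShuttingDown,
		Reason:    "ups on battery",
		UpdatedAt: time.Now(),
	}, "", "  ")
	fmt.Println(string(b)) // roughly what /var/lib/hecate/intent.json holds
}
```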
@ -131,6 +134,7 @@ Power metrics:
- `startup.auto_etcd_restore_on_api_failure: true`
- `startup.etcd_restore_control_plane: <control-plane-node>`
- If the control planes are configured with `--datastore-endpoint` (an external datastore), Hecate skips etcd restore and retries control-plane startup instead (see the sketch below).
- Etcd restore now verifies snapshot existence, minimum size, presence in the snapshot listing, and SHA-256 before the reset starts.
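A hedged sketch of how these knobs might gate auto-restore. `EtcdRestoreControlPlane` appears in the config tests below; the `autoRestore` parameter (standing in for `startup.auto_etcd_restore_on_api_failure`) and the overall decision shape are assumptions, not the real control flow.

```go
package main

import "fmt"

// shouldAutoRestore sketches the gating described above: restore only when
// the API is down, etcd is actually the datastore, auto-restore is enabled,
// and a restore target control plane is configured.
func shouldAutoRestore(apiHealthy, externalDatastore, autoRestore bool, controlPlane string) bool {
	if apiHealthy || externalDatastore {
		return false // healthy API, or --datastore-endpoint in use: retry startup instead
	}
	return autoRestore && controlPlane != ""
}

func main() {
	fmt.Println(shouldAutoRestore(false, false, true, "titan-db")) // true: restore path
	fmt.Println(shouldAutoRestore(false, true, true, "titan-db"))  // false: external DB, retry instead
}
```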
## Disruptive startup drills
@ -104,6 +104,7 @@ coordination:
  forward_shutdown_host: ""
  forward_shutdown_user: atlas
  forward_shutdown_config: /etc/hecate/hecate.yaml
  peer_hosts: []
  fallback_local_shutdown: true
  command_timeout_seconds: 25
  startup_guard_max_age_seconds: 900
@ -168,6 +168,8 @@ coordination:
  forward_shutdown_host: titan-db
  forward_shutdown_user: atlas
  forward_shutdown_config: /etc/hecate/hecate.yaml
  peer_hosts:
    - titan-db
  fallback_local_shutdown: false
  command_timeout_seconds: 25
  startup_guard_max_age_seconds: 900
@ -169,6 +169,8 @@ coordination:
  forward_shutdown_host: ""
  forward_shutdown_user: atlas
  forward_shutdown_config: /etc/hecate/hecate.yaml
  peer_hosts:
    - titan-24
  fallback_local_shutdown: true
  command_timeout_seconds: 25
  startup_guard_max_age_seconds: 900
@ -116,8 +116,22 @@ func (o *Orchestrator) Startup(ctx context.Context, opts StartupOptions) (err er
		currentIntent = state.Intent{State: state.IntentNormal}
	}
	if currentIntent.State == state.IntentShuttingDown {
		if intentFresh(currentIntent, o.startupGuardAge()) {
			return fmt.Errorf("startup blocked: shutdown intent is active (%s)", currentIntent.Reason)
		}
		o.log.Printf("warning: local shutdown intent appears stale (updated_at=%s reason=%q); auto-clearing to continue startup",
			currentIntent.UpdatedAt.Format(time.RFC3339), currentIntent.Reason)
		if clearErr := state.MustWriteIntent(o.cfg.State.IntentPath, state.IntentNormal, "auto-clear stale shutdown intent", "startup"); clearErr != nil {
			return fmt.Errorf("clear stale shutdown intent: %w", clearErr)
		}
		currentIntent = state.Intent{State: state.IntentNormal}
	}
	if currentIntent.State == state.IntentShutdownComplete && intentFresh(currentIntent, 45*time.Second) {
		return fmt.Errorf("startup blocked: shutdown completed too recently (%s ago)", intentAge(currentIntent).Round(time.Second))
	}
	if err := o.guardPeerStartupIntents(ctx); err != nil {
		return err
	}
	if writeErr := state.MustWriteIntent(o.cfg.State.IntentPath, state.IntentStartupInProgress, opts.Reason, "startup"); writeErr != nil {
		return fmt.Errorf("set startup intent: %w", writeErr)
	}
@ -312,6 +326,9 @@ func (o *Orchestrator) EtcdRestore(ctx context.Context, opts EtcdRestoreOptions)
		}
		snapshotPath = resolved
	}
	if err := o.verifyEtcdSnapshot(ctx, controlPlane, snapshotPath); err != nil {
		return err
	}
	o.log.Printf("etcd restore target=%s snapshot=%s", controlPlane, snapshotPath)

	for _, cp := range o.cfg.ControlPlanes {
@ -920,6 +937,151 @@ func parseSnapshotPathFromEtcdSnapshotList(out string) string {
	return ""
}

func intentAge(in state.Intent) time.Duration {
	if in.UpdatedAt.IsZero() {
		return 0
	}
	return time.Since(in.UpdatedAt)
}

func intentFresh(in state.Intent, maxAge time.Duration) bool {
	if in.UpdatedAt.IsZero() {
		return true
	}
	return intentAge(in) <= maxAge
}

func (o *Orchestrator) startupGuardAge() time.Duration {
	seconds := o.cfg.Coordination.StartupGuardMaxAgeSec
	if seconds <= 0 {
		seconds = 900
	}
	return time.Duration(seconds) * time.Second
}
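Worth noting: a zero `UpdatedAt` makes `intentFresh` return true, so an intent with no timestamp keeps blocking rather than being treated as stale — fail safe. A self-contained demo of that edge case, using local stand-ins for `state.Intent` and the helpers above:

```go
package main

import (
	"fmt"
	"time"
)

// Intent is a local stand-in for state.Intent.
type Intent struct{ UpdatedAt time.Time }

func intentAge(in Intent) time.Duration {
	if in.UpdatedAt.IsZero() {
		return 0
	}
	return time.Since(in.UpdatedAt)
}

func intentFresh(in Intent, maxAge time.Duration) bool {
	if in.UpdatedAt.IsZero() {
		return true // no timestamp: treat as fresh, i.e. fail safe and keep blocking
	}
	return intentAge(in) <= maxAge
}

func main() {
	guard := 15 * time.Minute // mirrors startup_guard_max_age_seconds: 900
	fmt.Println(intentFresh(Intent{}, guard))                                             // true: zero time still blocks
	fmt.Println(intentFresh(Intent{UpdatedAt: time.Now().Add(-16 * time.Minute)}, guard)) // false: stale, auto-cleared
}
```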
func (o *Orchestrator) coordinationPeers() []string {
	seen := map[string]struct{}{}
	out := make([]string, 0, len(o.cfg.Coordination.PeerHosts)+1)
	add := func(node string) {
		node = strings.TrimSpace(node)
		if node == "" {
			return
		}
		if _, ok := seen[node]; ok {
			return
		}
		seen[node] = struct{}{}
		out = append(out, node)
	}
	for _, node := range o.cfg.Coordination.PeerHosts {
		add(node)
	}
	if strings.TrimSpace(o.cfg.Coordination.ForwardShutdownHost) != "" {
		add(o.cfg.Coordination.ForwardShutdownHost)
	}
	return out
}
func (o *Orchestrator) guardPeerStartupIntents(ctx context.Context) error {
	peers := o.coordinationPeers()
	if len(peers) == 0 {
		return nil
	}
	guardAge := o.startupGuardAge()
	for _, peer := range peers {
		intent, err := o.readRemoteIntent(ctx, peer)
		if err != nil {
			o.log.Printf("warning: peer startup guard skipped intent check for %s: %v", peer, err)
			continue
		}
		switch intent.State {
		case "", state.IntentNormal:
			continue
		case state.IntentShuttingDown:
			if intentFresh(intent, guardAge) {
				return fmt.Errorf("startup blocked: peer %s has active shutdown intent (reason=%q age=%s)", peer, intent.Reason, intentAge(intent).Round(time.Second))
			}
			o.log.Printf("warning: peer %s shutdown intent appears stale; allowing startup", peer)
		case state.IntentStartupInProgress:
			if intentFresh(intent, guardAge) {
				return fmt.Errorf("startup blocked: peer %s reports startup_in_progress (reason=%q age=%s)", peer, intent.Reason, intentAge(intent).Round(time.Second))
			}
			o.log.Printf("warning: peer %s startup intent appears stale; allowing startup", peer)
		case state.IntentShutdownComplete:
			if intentFresh(intent, 45*time.Second) {
				return fmt.Errorf("startup blocked: peer %s completed shutdown too recently (age=%s)", peer, intentAge(intent).Round(time.Second))
			}
		default:
			o.log.Printf("warning: peer %s intent state %q is unknown; ignoring", peer, intent.State)
		}
	}
	return nil
}
func (o *Orchestrator) readRemoteIntent(ctx context.Context, node string) (state.Intent, error) {
	if !o.sshManaged(node) {
		return state.Intent{}, fmt.Errorf("%s is not in ssh_managed_nodes", node)
	}
	out, err := o.ssh(ctx, node, "sudo -n sh -lc 'if [ -s /var/lib/hecate/intent.json ]; then cat /var/lib/hecate/intent.json; else echo \"{}\"; fi'")
	if err != nil {
		return state.Intent{}, err
	}
	start := strings.Index(out, "{")
	end := strings.LastIndex(out, "}")
	if start < 0 || end < start {
		return state.Intent{}, fmt.Errorf("remote intent payload missing json object")
	}
	var in state.Intent
	if err := json.Unmarshal([]byte(out[start:end+1]), &in); err != nil {
		return state.Intent{}, fmt.Errorf("decode remote intent json: %w", err)
	}
	return in, nil
}
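The `strings.Index`/`strings.LastIndex` bracketing above tolerates SSH banner or motd noise around the JSON payload. A self-contained demo of that extraction (the `Intent` stand-in and its JSON field names are assumptions):

```go
package main

import (
	"encoding/json"
	"fmt"
	"strings"
)

// Intent is a local stand-in for state.Intent; JSON tags are assumed.
type Intent struct {
	State  string `json:"state"`
	Reason string `json:"reason"`
}

func main() {
	// Simulated remote output: banner noise wrapped around the intent JSON.
	out := "Warning: scheduled maintenance\n{\"state\":\"shutting_down\",\"reason\":\"ups on battery\"}\nConnection closed.\n"
	start := strings.Index(out, "{")
	end := strings.LastIndex(out, "}")
	if start < 0 || end < start {
		panic("remote intent payload missing json object")
	}
	var in Intent
	if err := json.Unmarshal([]byte(out[start:end+1]), &in); err != nil {
		panic(err)
	}
	fmt.Printf("state=%s reason=%q\n", in.State, in.Reason)
}
```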
func shellQuote(v string) string {
	return "'" + strings.ReplaceAll(v, "'", `'"'"'`) + "'"
}
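`shellQuote` uses the classic POSIX close-quote/double-quote/reopen idiom for embedded single quotes; for example:

```go
package main

import (
	"fmt"
	"strings"
)

func shellQuote(v string) string {
	return "'" + strings.ReplaceAll(v, "'", `'"'"'`) + "'"
}

func main() {
	// An embedded ' becomes '"'"': close the single-quoted string, emit a '
	// inside double quotes, then reopen the single-quoted string.
	fmt.Println(shellQuote(`/snapshots/on-demand'test.db`))
	// Output: '/snapshots/on-demand'"'"'test.db'
}
```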
func (o *Orchestrator) verifyEtcdSnapshot(ctx context.Context, node string, snapshotPath string) error {
	if o.runner.DryRun {
		return nil
	}
	path := strings.TrimSpace(snapshotPath)
	if path == "" {
		return fmt.Errorf("etcd snapshot verification failed: snapshot path is empty")
	}
	quoted := shellQuote(path)
	sizeOut, err := o.ssh(ctx, node, fmt.Sprintf("sudo -n sh -lc 'test -s %s && stat -c %%s %s'", quoted, quoted))
	if err != nil {
		return fmt.Errorf("etcd snapshot verification failed for %s on %s: %w", path, node, err)
	}
	size, convErr := strconv.ParseInt(strings.TrimSpace(sizeOut), 10, 64)
	if convErr != nil {
		return fmt.Errorf("etcd snapshot verification failed for %s on %s: parse size %q: %w", path, node, strings.TrimSpace(sizeOut), convErr)
	}
	const minSnapshotBytes = int64(1 << 20) // 1 MiB sanity floor.
	if size < minSnapshotBytes {
		return fmt.Errorf("etcd snapshot verification failed for %s on %s: snapshot too small (%d bytes)", path, node, size)
	}
	lsOut, err := o.runSudoK3S(ctx, node, "etcd-snapshot", "ls")
	if err != nil {
		return fmt.Errorf("etcd snapshot verification failed for %s on %s: list snapshots: %w", path, node, err)
	}
	if !strings.Contains(lsOut, path) && !strings.Contains(lsOut, filepath.Base(path)) {
		return fmt.Errorf("etcd snapshot verification failed for %s on %s: snapshot is not present in k3s etcd-snapshot ls output", path, node)
	}
	sumOut, err := o.ssh(ctx, node, fmt.Sprintf("sudo -n sh -lc 'sha256sum %s | awk \"{print \\$1}\"'", quoted))
	if err != nil {
		return fmt.Errorf("etcd snapshot verification failed for %s on %s: sha256: %w", path, node, err)
	}
	hash := strings.TrimSpace(sumOut)
	if len(hash) != 64 {
		return fmt.Errorf("etcd snapshot verification failed for %s on %s: invalid sha256 %q", path, node, hash)
	}
	o.log.Printf("etcd snapshot verified path=%s size_bytes=%d sha256=%s", path, size, hash[:12])
	return nil
}
func (o *Orchestrator) runSudoK3S(ctx context.Context, node string, args ...string) (string, error) {
	k3sPaths := []string{
		"/usr/local/bin/k3s",
@ -96,6 +96,7 @@ type Coordination struct {
	ForwardShutdownHost   string   `yaml:"forward_shutdown_host"`
	ForwardShutdownUser   string   `yaml:"forward_shutdown_user"`
	ForwardShutdownConfig string   `yaml:"forward_shutdown_config"`
	PeerHosts             []string `yaml:"peer_hosts"`
	FallbackLocalShutdown bool     `yaml:"fallback_local_shutdown"`
	CommandTimeoutSeconds int      `yaml:"command_timeout_seconds"`
	StartupGuardMaxAgeSec int      `yaml:"startup_guard_max_age_seconds"`
@ -247,6 +248,11 @@ func (c Config) Validate() error {
			return fmt.Errorf("config.coordination.forward_shutdown_config must not be empty when forward_shutdown_host is set")
		}
	}
	for _, peer := range c.Coordination.PeerHosts {
		if strings.TrimSpace(peer) == "" {
			return fmt.Errorf("config.coordination.peer_hosts entries must not be empty")
		}
	}
	if c.Coordination.StartupGuardMaxAgeSec <= 0 {
		return fmt.Errorf("config.coordination.startup_guard_max_age_seconds must be > 0")
	}
@ -352,6 +358,7 @@ func defaults() Config {
		},
		Coordination: Coordination{
			ForwardShutdownConfig: "/etc/hecate/hecate.yaml",
			PeerHosts:             []string{},
			FallbackLocalShutdown: true,
			CommandTimeoutSeconds: 25,
			StartupGuardMaxAgeSec: 900,
@ -483,6 +490,9 @@ func (c *Config) applyDefaults() {
	if c.Coordination.ForwardShutdownConfig == "" {
		c.Coordination.ForwardShutdownConfig = "/etc/hecate/hecate.yaml"
	}
	if c.Coordination.PeerHosts == nil {
		c.Coordination.PeerHosts = []string{}
	}
	if c.Coordination.CommandTimeoutSeconds <= 0 {
		c.Coordination.CommandTimeoutSeconds = 25
	}
@ -56,6 +56,14 @@ func TestValidateRejectsUnknownRole(t *testing.T) {
	}
}

func TestValidateRejectsEmptyPeerHostEntry(t *testing.T) {
	cfg := defaults()
	cfg.Coordination.PeerHosts = []string{"titan-24", " "}
	if err := cfg.Validate(); err == nil {
		t.Fatalf("expected validation error for empty peer_hosts entry")
	}
}

func TestValidateRejectsUnknownEtcdRestoreControlPlane(t *testing.T) {
	cfg := defaults()
	cfg.Startup.EtcdRestoreControlPlane = "titan-missing"
@ -204,6 +204,8 @@ migrate_hecate_config() {
  fi

  local changed=0
  local role_hint
  role_hint="$(read_hecate_role)"
  if grep -Eq 'default_budget_seconds:[[:space:]]*300' "${CONF_DIR}/hecate.yaml"; then
    sed -Ei 's/(default_budget_seconds:[[:space:]]*)300/\11380/' "${CONF_DIR}/hecate.yaml"
    echo "[install] migrated default_budget_seconds 300 -> 1380 in ${CONF_DIR}/hecate.yaml"
@ -226,6 +228,25 @@ migrate_hecate_config() {
    echo "[install] added coordination.startup_guard_max_age_seconds=900"
    changed=1
  fi
  if ! grep -Eq '^ peer_hosts:' "${CONF_DIR}/hecate.yaml"; then
    if [[ "${role_hint}" == "peer" ]] && grep -Eq '^ forward_shutdown_host:[[:space:]]*[A-Za-z0-9._-]+' "${CONF_DIR}/hecate.yaml"; then
      local peer_host
      peer_host="$(awk -F': *' '/^ forward_shutdown_host:[[:space:]]*/ {print $2; exit}' "${CONF_DIR}/hecate.yaml" 2>/dev/null || true)"
      if [[ -n "${peer_host}" ]]; then
        sed -Ei '/^ forward_shutdown_config:[[:space:]]*.*$/a\ peer_hosts:\n - '"${peer_host}"'' "${CONF_DIR}/hecate.yaml"
        echo "[install] added coordination.peer_hosts from forward_shutdown_host (${peer_host})"
        changed=1
      fi
    elif [[ "${role_hint}" == "coordinator" ]] && grep -Eq '^ titan-24:[[:space:]]*[0-9]+\.[0-9]+\.[0-9]+\.[0-9]+' "${CONF_DIR}/hecate.yaml"; then
      sed -Ei '/^ forward_shutdown_config:[[:space:]]*.*$/a\ peer_hosts:\n - titan-24' "${CONF_DIR}/hecate.yaml"
      echo "[install] added coordination.peer_hosts default (titan-24) for coordinator role"
      changed=1
    else
      sed -Ei '/^ forward_shutdown_config:[[:space:]]*.*$/a\ peer_hosts: []' "${CONF_DIR}/hecate.yaml"
      echo "[install] added coordination.peer_hosts empty default"
      changed=1
    fi
  fi
  local default_restore_cp
  default_restore_cp="$(first_control_plane_name)"
  if [[ -z "${default_restore_cp}" ]]; then