From 1935c5eb3fd0e7001b56ab1c3729e23ebabcca95 Mon Sep 17 00:00:00 2001 From: Brad Stein Date: Sun, 5 Apr 2026 13:18:34 -0300 Subject: [PATCH] harden startup guards and etcd restore validation --- README.md | 4 + configs/hecate.example.yaml | 1 + configs/hecate.tethys.yaml | 2 + configs/hecate.titan-db.yaml | 2 + internal/cluster/orchestrator.go | 164 ++++++++++++++++++++++++++++++- internal/config/config.go | 26 +++-- internal/config/config_test.go | 8 ++ scripts/install.sh | 21 ++++ 8 files changed, 219 insertions(+), 9 deletions(-) diff --git a/README.md b/README.md index df1cc20..bdaa491 100644 --- a/README.md +++ b/README.md @@ -26,6 +26,8 @@ Key startup guards: - `--auto-peer-failover` makes peer hosts hand off startup to the coordinator first, then run local startup only if the coordinator is unreachable. - Startup is blocked while UPS is on battery by default (unless `--allow-on-battery` or `coordination.allow_startup_on_battery: true` is set). - Startup is blocked when a shutdown intent is active (`/var/lib/hecate/intent.json`). +- Stale shutdown intents are auto-cleared after `coordination.startup_guard_max_age_seconds`, so old outage residue cannot permanently deadlock startup. +- Startup checks configured `coordination.peer_hosts` intents to avoid peer/coordinator split-brain startup races. - Startup waits for time sync in `strict` or `quorum` mode (`startup.time_sync_mode`, `startup.time_sync_quorum`). - Startup can block until storage is healthy (`startup.require_storage_ready` + critical PVC checks). - Startup can block until external probes pass (`startup.require_post_start_probes` + `startup.post_start_probes`). @@ -117,6 +119,7 @@ Power metrics: - Default behavior for `startup` and `shutdown` is dry-run unless `--execute` is set. - Hecate tracks intent in `/var/lib/hecate/intent.json` (`normal`, `startup_in_progress`, `shutting_down`, `shutdown_complete`) to avoid startup/shutdown fighting each other. 
+- In multi-instance setups, set `coordination.peer_hosts` on each host (for example `titan-db` <-> `titan-24`) so startup guards account for remote intent too. - `hecate-bootstrap.service` is enabled to run at host boot and perform staged startup automatically. - `HECATE_ENABLE_BOOTSTRAP=1` forces bootstrap on, `HECATE_ENABLE_BOOTSTRAP=0` forces it off, and `auto` enables by default. - `hecate-update.timer` runs on boot and periodically to pull latest `main` and reinstall Hecate declaratively. @@ -131,6 +134,7 @@ Power metrics: - `startup.auto_etcd_restore_on_api_failure: true` - `startup.etcd_restore_control_plane: ` - If control planes are configured with `--datastore-endpoint` (external DB), Hecate will skip etcd restore and retry control-plane startup instead. +- Etcd restore now checks snapshot existence, minimum size (1 MiB), and presence in `k3s etcd-snapshot ls`, and computes a SHA-256 digest of the snapshot (logged, not compared against a known-good value) before reset starts. ## Disruptive startup drills diff --git a/configs/hecate.example.yaml b/configs/hecate.example.yaml index 7a8cd5b..98d5f82 100644 --- a/configs/hecate.example.yaml +++ b/configs/hecate.example.yaml @@ -104,6 +104,7 @@ coordination: forward_shutdown_host: "" forward_shutdown_user: atlas forward_shutdown_config: /etc/hecate/hecate.yaml + peer_hosts: [] fallback_local_shutdown: true command_timeout_seconds: 25 startup_guard_max_age_seconds: 900 diff --git a/configs/hecate.tethys.yaml b/configs/hecate.tethys.yaml index fda4ad2..079035a 100644 --- a/configs/hecate.tethys.yaml +++ b/configs/hecate.tethys.yaml @@ -168,6 +168,8 @@ coordination: forward_shutdown_host: titan-db forward_shutdown_user: atlas forward_shutdown_config: /etc/hecate/hecate.yaml + peer_hosts: + - titan-db fallback_local_shutdown: false command_timeout_seconds: 25 startup_guard_max_age_seconds: 900 diff --git a/configs/hecate.titan-db.yaml b/configs/hecate.titan-db.yaml index ff61fca..8ef16d2 100644 --- a/configs/hecate.titan-db.yaml +++ b/configs/hecate.titan-db.yaml @@ -169,6 +169,8 @@ coordination: forward_shutdown_host: "" 
forward_shutdown_user: atlas forward_shutdown_config: /etc/hecate/hecate.yaml + peer_hosts: + - titan-24 fallback_local_shutdown: true command_timeout_seconds: 25 startup_guard_max_age_seconds: 900 diff --git a/internal/cluster/orchestrator.go b/internal/cluster/orchestrator.go index a2bbd91..940cf50 100644 --- a/internal/cluster/orchestrator.go +++ b/internal/cluster/orchestrator.go @@ -116,7 +116,21 @@ func (o *Orchestrator) Startup(ctx context.Context, opts StartupOptions) (err er currentIntent = state.Intent{State: state.IntentNormal} } if currentIntent.State == state.IntentShuttingDown { - return fmt.Errorf("startup blocked: shutdown intent is active (%s)", currentIntent.Reason) + if intentFresh(currentIntent, o.startupGuardAge()) { + return fmt.Errorf("startup blocked: shutdown intent is active (%s)", currentIntent.Reason) + } + o.log.Printf("warning: local shutdown intent appears stale (updated_at=%s reason=%q); auto-clearing to continue startup", + currentIntent.UpdatedAt.Format(time.RFC3339), currentIntent.Reason) + if clearErr := state.MustWriteIntent(o.cfg.State.IntentPath, state.IntentNormal, "auto-clear stale shutdown intent", "startup"); clearErr != nil { + return fmt.Errorf("clear stale shutdown intent: %w", clearErr) + } + currentIntent = state.Intent{State: state.IntentNormal} + } + if currentIntent.State == state.IntentShutdownComplete && intentFresh(currentIntent, 45*time.Second) { + return fmt.Errorf("startup blocked: shutdown completed too recently (%s ago)", intentAge(currentIntent).Round(time.Second)) + } + if err := o.guardPeerStartupIntents(ctx); err != nil { + return err } if writeErr := state.MustWriteIntent(o.cfg.State.IntentPath, state.IntentStartupInProgress, opts.Reason, "startup"); writeErr != nil { return fmt.Errorf("set startup intent: %w", writeErr) @@ -312,6 +326,9 @@ func (o *Orchestrator) EtcdRestore(ctx context.Context, opts EtcdRestoreOptions) } snapshotPath = resolved } + if err := o.verifyEtcdSnapshot(ctx, controlPlane, 
snapshotPath); err != nil { + return err + } o.log.Printf("etcd restore target=%s snapshot=%s", controlPlane, snapshotPath) for _, cp := range o.cfg.ControlPlanes { @@ -920,6 +937,151 @@ func parseSnapshotPathFromEtcdSnapshotList(out string) string { return "" } +func intentAge(in state.Intent) time.Duration { + if in.UpdatedAt.IsZero() { + return 0 + } + return time.Since(in.UpdatedAt) +} + +func intentFresh(in state.Intent, maxAge time.Duration) bool { + if in.UpdatedAt.IsZero() { + return true + } + return intentAge(in) <= maxAge +} + +func (o *Orchestrator) startupGuardAge() time.Duration { + seconds := o.cfg.Coordination.StartupGuardMaxAgeSec + if seconds <= 0 { + seconds = 900 + } + return time.Duration(seconds) * time.Second +} + +func (o *Orchestrator) coordinationPeers() []string { + seen := map[string]struct{}{} + out := make([]string, 0, len(o.cfg.Coordination.PeerHosts)+1) + add := func(node string) { + node = strings.TrimSpace(node) + if node == "" { + return + } + if _, ok := seen[node]; ok { + return + } + seen[node] = struct{}{} + out = append(out, node) + } + for _, node := range o.cfg.Coordination.PeerHosts { + add(node) + } + if strings.TrimSpace(o.cfg.Coordination.ForwardShutdownHost) != "" { + add(o.cfg.Coordination.ForwardShutdownHost) + } + return out +} + +func (o *Orchestrator) guardPeerStartupIntents(ctx context.Context) error { + peers := o.coordinationPeers() + if len(peers) == 0 { + return nil + } + guardAge := o.startupGuardAge() + for _, peer := range peers { + intent, err := o.readRemoteIntent(ctx, peer) + if err != nil { + o.log.Printf("warning: peer startup guard skipped intent check for %s: %v", peer, err) + continue + } + switch intent.State { + case "", state.IntentNormal: + continue + case state.IntentShuttingDown: + if intentFresh(intent, guardAge) { + return fmt.Errorf("startup blocked: peer %s has active shutdown intent (reason=%q age=%s)", peer, intent.Reason, intentAge(intent).Round(time.Second)) + } + 
o.log.Printf("warning: peer %s shutdown intent appears stale; allowing startup", peer) + case state.IntentStartupInProgress: + if intentFresh(intent, guardAge) { + return fmt.Errorf("startup blocked: peer %s reports startup_in_progress (reason=%q age=%s)", peer, intent.Reason, intentAge(intent).Round(time.Second)) + } + o.log.Printf("warning: peer %s startup intent appears stale; allowing startup", peer) + case state.IntentShutdownComplete: + if intentFresh(intent, 45*time.Second) { + return fmt.Errorf("startup blocked: peer %s completed shutdown too recently (age=%s)", peer, intentAge(intent).Round(time.Second)) + } + default: + o.log.Printf("warning: peer %s intent state %q is unknown; ignoring", peer, intent.State) + } + } + return nil +} + +func (o *Orchestrator) readRemoteIntent(ctx context.Context, node string) (state.Intent, error) { + if !o.sshManaged(node) { + return state.Intent{}, fmt.Errorf("%s is not in ssh_managed_nodes", node) + } + out, err := o.ssh(ctx, node, "sudo -n sh -lc 'if [ -s /var/lib/hecate/intent.json ]; then cat /var/lib/hecate/intent.json; else echo \"{}\"; fi'") + if err != nil { + return state.Intent{}, err + } + start := strings.Index(out, "{") + end := strings.LastIndex(out, "}") + if start < 0 || end < start { + return state.Intent{}, fmt.Errorf("remote intent payload missing json object") + } + var in state.Intent + if err := json.Unmarshal([]byte(out[start:end+1]), &in); err != nil { + return state.Intent{}, fmt.Errorf("decode remote intent json: %w", err) + } + return in, nil +} + +func shellQuote(v string) string { + return "'" + strings.ReplaceAll(v, "'", `'"'"'`) + "'" +} + +func (o *Orchestrator) verifyEtcdSnapshot(ctx context.Context, node string, snapshotPath string) error { + if o.runner.DryRun { + return nil + } + path := strings.TrimSpace(snapshotPath) + if path == "" { + return fmt.Errorf("etcd snapshot verification failed: snapshot path is empty") + } + quoted := shellQuote(path) + sizeOut, err := o.ssh(ctx, node, 
fmt.Sprintf("sudo -n sh -lc 'test -s %s && stat -c %%s %s'", quoted, quoted)) + if err != nil { + return fmt.Errorf("etcd snapshot verification failed for %s on %s: %w", path, node, err) + } + size, convErr := strconv.ParseInt(strings.TrimSpace(sizeOut), 10, 64) + if convErr != nil { + return fmt.Errorf("etcd snapshot verification failed for %s on %s: parse size %q: %w", path, node, strings.TrimSpace(sizeOut), convErr) + } + const minSnapshotBytes = int64(1 << 20) // 1 MiB sanity floor. + if size < minSnapshotBytes { + return fmt.Errorf("etcd snapshot verification failed for %s on %s: snapshot too small (%d bytes)", path, node, size) + } + lsOut, err := o.runSudoK3S(ctx, node, "etcd-snapshot", "ls") + if err != nil { + return fmt.Errorf("etcd snapshot verification failed for %s on %s: list snapshots: %w", path, node, err) + } + if !strings.Contains(lsOut, path) && !strings.Contains(lsOut, filepath.Base(path)) { + return fmt.Errorf("etcd snapshot verification failed for %s on %s: snapshot is not present in k3s etcd-snapshot ls output", path, node) + } + sumOut, err := o.ssh(ctx, node, fmt.Sprintf("sudo -n sh -lc 'sha256sum %s | awk \"{print \\$1}\"'", quoted)) + if err != nil { + return fmt.Errorf("etcd snapshot verification failed for %s on %s: sha256: %w", path, node, err) + } + hash := strings.TrimSpace(sumOut) + if len(hash) != 64 { + return fmt.Errorf("etcd snapshot verification failed for %s on %s: invalid sha256 %q", path, node, hash) + } + o.log.Printf("etcd snapshot verified path=%s size_bytes=%d sha256=%s", path, size, hash[:12]) + return nil +} + func (o *Orchestrator) runSudoK3S(ctx context.Context, node string, args ...string) (string, error) { k3sPaths := []string{ "/usr/local/bin/k3s", diff --git a/internal/config/config.go b/internal/config/config.go index f99cc8a..c23b241 100644 --- a/internal/config/config.go +++ b/internal/config/config.go @@ -93,14 +93,15 @@ type UPSTarget struct { } type Coordination struct { - ForwardShutdownHost string 
`yaml:"forward_shutdown_host"` - ForwardShutdownUser string `yaml:"forward_shutdown_user"` - ForwardShutdownConfig string `yaml:"forward_shutdown_config"` - FallbackLocalShutdown bool `yaml:"fallback_local_shutdown"` - CommandTimeoutSeconds int `yaml:"command_timeout_seconds"` - StartupGuardMaxAgeSec int `yaml:"startup_guard_max_age_seconds"` - Role string `yaml:"role"` - AllowStartupOnBattery bool `yaml:"allow_startup_on_battery"` + ForwardShutdownHost string `yaml:"forward_shutdown_host"` + ForwardShutdownUser string `yaml:"forward_shutdown_user"` + ForwardShutdownConfig string `yaml:"forward_shutdown_config"` + PeerHosts []string `yaml:"peer_hosts"` + FallbackLocalShutdown bool `yaml:"fallback_local_shutdown"` + CommandTimeoutSeconds int `yaml:"command_timeout_seconds"` + StartupGuardMaxAgeSec int `yaml:"startup_guard_max_age_seconds"` + Role string `yaml:"role"` + AllowStartupOnBattery bool `yaml:"allow_startup_on_battery"` } type Metrics struct { @@ -247,6 +248,11 @@ func (c Config) Validate() error { return fmt.Errorf("config.coordination.forward_shutdown_config must not be empty when forward_shutdown_host is set") } } + for _, peer := range c.Coordination.PeerHosts { + if strings.TrimSpace(peer) == "" { + return fmt.Errorf("config.coordination.peer_hosts entries must not be empty") + } + } if c.Coordination.StartupGuardMaxAgeSec <= 0 { return fmt.Errorf("config.coordination.startup_guard_max_age_seconds must be > 0") } @@ -352,6 +358,7 @@ func defaults() Config { }, Coordination: Coordination{ ForwardShutdownConfig: "/etc/hecate/hecate.yaml", + PeerHosts: []string{}, FallbackLocalShutdown: true, CommandTimeoutSeconds: 25, StartupGuardMaxAgeSec: 900, @@ -483,6 +490,9 @@ func (c *Config) applyDefaults() { if c.Coordination.ForwardShutdownConfig == "" { c.Coordination.ForwardShutdownConfig = "/etc/hecate/hecate.yaml" } + if c.Coordination.PeerHosts == nil { + c.Coordination.PeerHosts = []string{} + } if c.Coordination.CommandTimeoutSeconds <= 0 { 
c.Coordination.CommandTimeoutSeconds = 25 } diff --git a/internal/config/config_test.go b/internal/config/config_test.go index 383e034..2487aeb 100644 --- a/internal/config/config_test.go +++ b/internal/config/config_test.go @@ -56,6 +56,14 @@ func TestValidateRejectsUnknownRole(t *testing.T) { } } +func TestValidateRejectsEmptyPeerHostEntry(t *testing.T) { + cfg := defaults() + cfg.Coordination.PeerHosts = []string{"titan-24", " "} + if err := cfg.Validate(); err == nil { + t.Fatalf("expected validation error for empty peer_hosts entry") + } +} + func TestValidateRejectsUnknownEtcdRestoreControlPlane(t *testing.T) { cfg := defaults() cfg.Startup.EtcdRestoreControlPlane = "titan-missing" diff --git a/scripts/install.sh b/scripts/install.sh index 111b3f5..4903860 100755 --- a/scripts/install.sh +++ b/scripts/install.sh @@ -204,6 +204,8 @@ migrate_hecate_config() { fi local changed=0 + local role_hint + role_hint="$(read_hecate_role)" if grep -Eq 'default_budget_seconds:[[:space:]]*300' "${CONF_DIR}/hecate.yaml"; then sed -Ei 's/(default_budget_seconds:[[:space:]]*)300/\11380/' "${CONF_DIR}/hecate.yaml" echo "[install] migrated default_budget_seconds 300 -> 1380 in ${CONF_DIR}/hecate.yaml" @@ -226,6 +228,25 @@ migrate_hecate_config() { echo "[install] added coordination.startup_guard_max_age_seconds=900" changed=1 fi + if ! 
grep -Eq '^ peer_hosts:' "${CONF_DIR}/hecate.yaml"; then + if [[ "${role_hint}" == "peer" ]] && grep -Eq '^ forward_shutdown_host:[[:space:]]*[A-Za-z0-9._-]+' "${CONF_DIR}/hecate.yaml"; then + local peer_host + peer_host="$(awk -F': *' '/^ forward_shutdown_host:[[:space:]]*/ {print $2; exit}' "${CONF_DIR}/hecate.yaml" 2>/dev/null || true)" + if [[ -n "${peer_host}" ]]; then + sed -Ei '/^ forward_shutdown_config:[[:space:]]*.*$/a\ peer_hosts:\n - '"${peer_host}"'' "${CONF_DIR}/hecate.yaml" + echo "[install] added coordination.peer_hosts from forward_shutdown_host (${peer_host})" + changed=1 + fi + elif [[ "${role_hint}" == "coordinator" ]] && grep -Eq '^ titan-24:[[:space:]]*[0-9]+\.[0-9]+\.[0-9]+\.[0-9]+' "${CONF_DIR}/hecate.yaml"; then + sed -Ei '/^ forward_shutdown_config:[[:space:]]*.*$/a\ peer_hosts:\n - titan-24' "${CONF_DIR}/hecate.yaml" + echo "[install] added coordination.peer_hosts default (titan-24) for coordinator role" + changed=1 + else + sed -Ei '/^ forward_shutdown_config:[[:space:]]*.*$/a\ peer_hosts: []' "${CONF_DIR}/hecate.yaml" + echo "[install] added coordination.peer_hosts empty default" + changed=1 + fi + fi local default_restore_cp default_restore_cp="$(first_control_plane_name)" if [[ -z "${default_restore_cp}" ]]; then