harden startup guards and etcd restore validation

This commit is contained in:
Brad Stein 2026-04-05 13:18:34 -03:00
parent 437a6b62cd
commit 1935c5eb3f
8 changed files with 219 additions and 9 deletions

View File

@ -26,6 +26,8 @@ Key startup guards:
- `--auto-peer-failover` makes peer hosts hand off startup to the coordinator first, then run local startup only if the coordinator is unreachable.
- Startup is blocked while UPS is on battery by default (unless `--allow-on-battery` or `coordination.allow_startup_on_battery: true` is set).
- Startup is blocked when a shutdown intent is active (`/var/lib/hecate/intent.json`).
- Stale shutdown intents are auto-cleared after `coordination.startup_guard_max_age_seconds`, so old outage residue cannot permanently deadlock startup.
- Startup checks the intents of each configured `coordination.peer_hosts` entry to avoid peer/coordinator split-brain startup races.
- Startup waits for time sync in `strict` or `quorum` mode (`startup.time_sync_mode`, `startup.time_sync_quorum`).
- Startup can block until storage is healthy (`startup.require_storage_ready` + critical PVC checks).
- Startup can block until external probes pass (`startup.require_post_start_probes` + `startup.post_start_probes`).
@ -117,6 +119,7 @@ Power metrics:
- Default behavior for `startup` and `shutdown` is dry-run unless `--execute` is set.
- Hecate tracks intent in `/var/lib/hecate/intent.json` (`normal`, `startup_in_progress`, `shutting_down`, `shutdown_complete`) to avoid startup/shutdown fighting each other.
- In multi-instance setups, set `coordination.peer_hosts` on each host (for example `titan-db` <-> `titan-24`) so startup guards account for remote intent too.
- `hecate-bootstrap.service` is enabled to run at host boot and perform staged startup automatically.
- `HECATE_ENABLE_BOOTSTRAP=1` forces bootstrap on, `HECATE_ENABLE_BOOTSTRAP=0` forces it off, and `auto` enables by default.
- `hecate-update.timer` runs on boot and periodically to pull latest `main` and reinstall Hecate declaratively.
@ -131,6 +134,7 @@ Power metrics:
- `startup.auto_etcd_restore_on_api_failure: true`
- `startup.etcd_restore_control_plane: <control-plane-node>`
- If control planes are configured with `--datastore-endpoint` (external DB), Hecate will skip etcd restore and retry control-plane startup instead.
- Etcd restore now verifies snapshot existence, minimum size, listing presence, and SHA-256 before reset starts.
## Disruptive startup drills

View File

@ -104,6 +104,7 @@ coordination:
forward_shutdown_host: ""
forward_shutdown_user: atlas
forward_shutdown_config: /etc/hecate/hecate.yaml
peer_hosts: []
fallback_local_shutdown: true
command_timeout_seconds: 25
startup_guard_max_age_seconds: 900

View File

@ -168,6 +168,8 @@ coordination:
forward_shutdown_host: titan-db
forward_shutdown_user: atlas
forward_shutdown_config: /etc/hecate/hecate.yaml
peer_hosts:
- titan-db
fallback_local_shutdown: false
command_timeout_seconds: 25
startup_guard_max_age_seconds: 900

View File

@ -169,6 +169,8 @@ coordination:
forward_shutdown_host: ""
forward_shutdown_user: atlas
forward_shutdown_config: /etc/hecate/hecate.yaml
peer_hosts:
- titan-24
fallback_local_shutdown: true
command_timeout_seconds: 25
startup_guard_max_age_seconds: 900

View File

@ -116,7 +116,21 @@ func (o *Orchestrator) Startup(ctx context.Context, opts StartupOptions) (err er
currentIntent = state.Intent{State: state.IntentNormal}
}
if currentIntent.State == state.IntentShuttingDown {
return fmt.Errorf("startup blocked: shutdown intent is active (%s)", currentIntent.Reason)
if intentFresh(currentIntent, o.startupGuardAge()) {
return fmt.Errorf("startup blocked: shutdown intent is active (%s)", currentIntent.Reason)
}
o.log.Printf("warning: local shutdown intent appears stale (updated_at=%s reason=%q); auto-clearing to continue startup",
currentIntent.UpdatedAt.Format(time.RFC3339), currentIntent.Reason)
if clearErr := state.MustWriteIntent(o.cfg.State.IntentPath, state.IntentNormal, "auto-clear stale shutdown intent", "startup"); clearErr != nil {
return fmt.Errorf("clear stale shutdown intent: %w", clearErr)
}
currentIntent = state.Intent{State: state.IntentNormal}
}
if currentIntent.State == state.IntentShutdownComplete && intentFresh(currentIntent, 45*time.Second) {
return fmt.Errorf("startup blocked: shutdown completed too recently (%s ago)", intentAge(currentIntent).Round(time.Second))
}
if err := o.guardPeerStartupIntents(ctx); err != nil {
return err
}
if writeErr := state.MustWriteIntent(o.cfg.State.IntentPath, state.IntentStartupInProgress, opts.Reason, "startup"); writeErr != nil {
return fmt.Errorf("set startup intent: %w", writeErr)
@ -312,6 +326,9 @@ func (o *Orchestrator) EtcdRestore(ctx context.Context, opts EtcdRestoreOptions)
}
snapshotPath = resolved
}
if err := o.verifyEtcdSnapshot(ctx, controlPlane, snapshotPath); err != nil {
return err
}
o.log.Printf("etcd restore target=%s snapshot=%s", controlPlane, snapshotPath)
for _, cp := range o.cfg.ControlPlanes {
@ -920,6 +937,151 @@ func parseSnapshotPathFromEtcdSnapshotList(out string) string {
return ""
}
// intentAge reports how long ago the intent record was last updated.
// A zero UpdatedAt timestamp yields an age of 0 rather than the huge
// duration time.Since would produce for the zero time.
func intentAge(in state.Intent) time.Duration {
	updated := in.UpdatedAt
	if !updated.IsZero() {
		return time.Since(updated)
	}
	return 0
}
// intentFresh reports whether the intent was updated within maxAge.
// A record without a timestamp counts as fresh, so guards err on the
// side of honoring the intent rather than discarding it as stale.
func intentFresh(in state.Intent, maxAge time.Duration) bool {
	return in.UpdatedAt.IsZero() || intentAge(in) <= maxAge
}
// startupGuardAge returns the configured freshness window for startup
// guard checks, falling back to 15 minutes when the config value is
// missing or non-positive.
func (o *Orchestrator) startupGuardAge() time.Duration {
	const fallbackSeconds = 900 // Matches the documented config default.
	if s := o.cfg.Coordination.StartupGuardMaxAgeSec; s > 0 {
		return time.Duration(s) * time.Second
	}
	return fallbackSeconds * time.Second
}
// coordinationPeers returns the deduplicated, whitespace-trimmed set of
// peer hostnames to consult during startup guards: every entry of
// coordination.peer_hosts plus the forward_shutdown_host, if set.
// Order of first appearance is preserved.
func (o *Orchestrator) coordinationPeers() []string {
	candidates := make([]string, 0, len(o.cfg.Coordination.PeerHosts)+1)
	candidates = append(candidates, o.cfg.Coordination.PeerHosts...)
	candidates = append(candidates, o.cfg.Coordination.ForwardShutdownHost)

	seen := make(map[string]struct{}, len(candidates))
	peers := make([]string, 0, len(candidates))
	for _, raw := range candidates {
		host := strings.TrimSpace(raw)
		if host == "" {
			continue
		}
		if _, dup := seen[host]; dup {
			continue
		}
		seen[host] = struct{}{}
		peers = append(peers, host)
	}
	return peers
}
// guardPeerStartupIntents blocks local startup while any configured peer
// reports a fresh conflicting intent, to avoid split-brain startup races
// between coordinator and peer hosts. Peers whose intent cannot be read
// are logged and skipped (best-effort) rather than blocking startup.
// Returns an error only when startup must be blocked.
func (o *Orchestrator) guardPeerStartupIntents(ctx context.Context) error {
	peers := o.coordinationPeers()
	if len(peers) == 0 {
		return nil
	}
	guardAge := o.startupGuardAge()
	for _, peer := range peers {
		intent, err := o.readRemoteIntent(ctx, peer)
		if err != nil {
			// Best-effort: an unreachable or unreadable peer must not
			// deadlock startup forever.
			o.log.Printf("warning: peer startup guard skipped intent check for %s: %v", peer, err)
			continue
		}
		switch intent.State {
		case "", state.IntentNormal:
			continue
		case state.IntentShuttingDown:
			// Block only while the shutdown intent is still fresh;
			// stale intents are outage residue and are warned past.
			if intentFresh(intent, guardAge) {
				return fmt.Errorf("startup blocked: peer %s has active shutdown intent (reason=%q age=%s)", peer, intent.Reason, intentAge(intent).Round(time.Second))
			}
			o.log.Printf("warning: peer %s shutdown intent appears stale; allowing startup", peer)
		case state.IntentStartupInProgress:
			// Only one host should drive startup at a time.
			if intentFresh(intent, guardAge) {
				return fmt.Errorf("startup blocked: peer %s reports startup_in_progress (reason=%q age=%s)", peer, intent.Reason, intentAge(intent).Round(time.Second))
			}
			o.log.Printf("warning: peer %s startup intent appears stale; allowing startup", peer)
		case state.IntentShutdownComplete:
			// Short cooldown (45s) after a peer finishes shutting down,
			// mirroring the local shutdown_complete guard in Startup.
			if intentFresh(intent, 45*time.Second) {
				return fmt.Errorf("startup blocked: peer %s completed shutdown too recently (age=%s)", peer, intentAge(intent).Round(time.Second))
			}
		default:
			// Unknown states (e.g. from a newer peer version) are ignored.
			o.log.Printf("warning: peer %s intent state %q is unknown; ignoring", peer, intent.State)
		}
	}
	return nil
}
// readRemoteIntent fetches and decodes /var/lib/hecate/intent.json from
// a peer host over SSH. The node must be listed in ssh_managed_nodes.
// A missing or empty remote intent file is reported by the remote shell
// as "{}", which decodes to the zero Intent.
func (o *Orchestrator) readRemoteIntent(ctx context.Context, node string) (state.Intent, error) {
	if !o.sshManaged(node) {
		return state.Intent{}, fmt.Errorf("%s is not in ssh_managed_nodes", node)
	}
	out, err := o.ssh(ctx, node, "sudo -n sh -lc 'if [ -s /var/lib/hecate/intent.json ]; then cat /var/lib/hecate/intent.json; else echo \"{}\"; fi'")
	if err != nil {
		return state.Intent{}, err
	}
	// Slice from the first "{" to the last "}" so banner/MOTD noise
	// around the JSON payload does not break decoding.
	start := strings.Index(out, "{")
	end := strings.LastIndex(out, "}")
	if start < 0 || end < start {
		return state.Intent{}, fmt.Errorf("remote intent payload missing json object")
	}
	var in state.Intent
	if err := json.Unmarshal([]byte(out[start:end+1]), &in); err != nil {
		return state.Intent{}, fmt.Errorf("decode remote intent json: %w", err)
	}
	return in, nil
}
// shellQuote wraps v in single quotes for safe interpolation into a
// POSIX shell command line, escaping embedded single quotes with the
// standard '"'"' sequence (close quote, double-quoted quote, reopen).
func shellQuote(v string) string {
	const escapedQuote = `'"'"'`
	var b strings.Builder
	b.Grow(len(v) + 2)
	b.WriteByte('\'')
	for _, r := range v {
		if r == '\'' {
			b.WriteString(escapedQuote)
			continue
		}
		b.WriteRune(r)
	}
	b.WriteByte('\'')
	return b.String()
}
// verifyEtcdSnapshot sanity-checks a snapshot on the target control
// plane before a destructive etcd reset: the file must exist and be
// non-empty, meet a minimum size floor, appear in `k3s etcd-snapshot ls`
// output, and yield a well-formed SHA-256 digest. Returns nil in dry-run
// mode without contacting the node.
func (o *Orchestrator) verifyEtcdSnapshot(ctx context.Context, node string, snapshotPath string) error {
	if o.runner.DryRun {
		// Dry runs never reset etcd, so skip the remote checks.
		return nil
	}
	path := strings.TrimSpace(snapshotPath)
	if path == "" {
		return fmt.Errorf("etcd snapshot verification failed: snapshot path is empty")
	}
	quoted := shellQuote(path)
	// `test -s` rejects missing or zero-byte files; stat reports bytes.
	sizeOut, err := o.ssh(ctx, node, fmt.Sprintf("sudo -n sh -lc 'test -s %s && stat -c %%s %s'", quoted, quoted))
	if err != nil {
		return fmt.Errorf("etcd snapshot verification failed for %s on %s: %w", path, node, err)
	}
	size, convErr := strconv.ParseInt(strings.TrimSpace(sizeOut), 10, 64)
	if convErr != nil {
		return fmt.Errorf("etcd snapshot verification failed for %s on %s: parse size %q: %w", path, node, strings.TrimSpace(sizeOut), convErr)
	}
	const minSnapshotBytes = int64(1 << 20) // 1 MiB sanity floor.
	if size < minSnapshotBytes {
		return fmt.Errorf("etcd snapshot verification failed for %s on %s: snapshot too small (%d bytes)", path, node, size)
	}
	// The snapshot must be known to k3s itself, not merely on disk.
	lsOut, err := o.runSudoK3S(ctx, node, "etcd-snapshot", "ls")
	if err != nil {
		return fmt.Errorf("etcd snapshot verification failed for %s on %s: list snapshots: %w", path, node, err)
	}
	if !strings.Contains(lsOut, path) && !strings.Contains(lsOut, filepath.Base(path)) {
		return fmt.Errorf("etcd snapshot verification failed for %s on %s: snapshot is not present in k3s etcd-snapshot ls output", path, node)
	}
	sumOut, err := o.ssh(ctx, node, fmt.Sprintf("sudo -n sh -lc 'sha256sum %s | awk \"{print \\$1}\"'", quoted))
	if err != nil {
		return fmt.Errorf("etcd snapshot verification failed for %s on %s: sha256: %w", path, node, err)
	}
	hash := strings.TrimSpace(sumOut)
	// Require a well-formed hex digest, not just any 64 characters:
	// a 64-char error line from the remote shell must not pass as a hash.
	if !isHexSHA256(hash) {
		return fmt.Errorf("etcd snapshot verification failed for %s on %s: invalid sha256 %q", path, node, hash)
	}
	o.log.Printf("etcd snapshot verified path=%s size_bytes=%d sha256=%s", path, size, hash[:12])
	return nil
}

// isHexSHA256 reports whether s is exactly 64 hexadecimal digits
// (sha256sum emits lowercase; uppercase is accepted for safety).
func isHexSHA256(s string) bool {
	if len(s) != 64 {
		return false
	}
	for i := 0; i < len(s); i++ {
		c := s[i]
		switch {
		case c >= '0' && c <= '9':
		case c >= 'a' && c <= 'f':
		case c >= 'A' && c <= 'F':
		default:
			return false
		}
	}
	return true
}
func (o *Orchestrator) runSudoK3S(ctx context.Context, node string, args ...string) (string, error) {
k3sPaths := []string{
"/usr/local/bin/k3s",

View File

@ -93,14 +93,15 @@ type UPSTarget struct {
}
type Coordination struct {
ForwardShutdownHost string `yaml:"forward_shutdown_host"`
ForwardShutdownUser string `yaml:"forward_shutdown_user"`
ForwardShutdownConfig string `yaml:"forward_shutdown_config"`
FallbackLocalShutdown bool `yaml:"fallback_local_shutdown"`
CommandTimeoutSeconds int `yaml:"command_timeout_seconds"`
StartupGuardMaxAgeSec int `yaml:"startup_guard_max_age_seconds"`
Role string `yaml:"role"`
AllowStartupOnBattery bool `yaml:"allow_startup_on_battery"`
ForwardShutdownHost string `yaml:"forward_shutdown_host"`
ForwardShutdownUser string `yaml:"forward_shutdown_user"`
ForwardShutdownConfig string `yaml:"forward_shutdown_config"`
PeerHosts []string `yaml:"peer_hosts"`
FallbackLocalShutdown bool `yaml:"fallback_local_shutdown"`
CommandTimeoutSeconds int `yaml:"command_timeout_seconds"`
StartupGuardMaxAgeSec int `yaml:"startup_guard_max_age_seconds"`
Role string `yaml:"role"`
AllowStartupOnBattery bool `yaml:"allow_startup_on_battery"`
}
type Metrics struct {
@ -247,6 +248,11 @@ func (c Config) Validate() error {
return fmt.Errorf("config.coordination.forward_shutdown_config must not be empty when forward_shutdown_host is set")
}
}
for _, peer := range c.Coordination.PeerHosts {
if strings.TrimSpace(peer) == "" {
return fmt.Errorf("config.coordination.peer_hosts entries must not be empty")
}
}
if c.Coordination.StartupGuardMaxAgeSec <= 0 {
return fmt.Errorf("config.coordination.startup_guard_max_age_seconds must be > 0")
}
@ -352,6 +358,7 @@ func defaults() Config {
},
Coordination: Coordination{
ForwardShutdownConfig: "/etc/hecate/hecate.yaml",
PeerHosts: []string{},
FallbackLocalShutdown: true,
CommandTimeoutSeconds: 25,
StartupGuardMaxAgeSec: 900,
@ -483,6 +490,9 @@ func (c *Config) applyDefaults() {
if c.Coordination.ForwardShutdownConfig == "" {
c.Coordination.ForwardShutdownConfig = "/etc/hecate/hecate.yaml"
}
if c.Coordination.PeerHosts == nil {
c.Coordination.PeerHosts = []string{}
}
if c.Coordination.CommandTimeoutSeconds <= 0 {
c.Coordination.CommandTimeoutSeconds = 25
}

View File

@ -56,6 +56,14 @@ func TestValidateRejectsUnknownRole(t *testing.T) {
}
}
func TestValidateRejectsEmptyPeerHostEntry(t *testing.T) {
cfg := defaults()
cfg.Coordination.PeerHosts = []string{"titan-24", " "}
if err := cfg.Validate(); err == nil {
t.Fatalf("expected validation error for empty peer_hosts entry")
}
}
func TestValidateRejectsUnknownEtcdRestoreControlPlane(t *testing.T) {
cfg := defaults()
cfg.Startup.EtcdRestoreControlPlane = "titan-missing"

View File

@ -204,6 +204,8 @@ migrate_hecate_config() {
fi
local changed=0
local role_hint
role_hint="$(read_hecate_role)"
if grep -Eq 'default_budget_seconds:[[:space:]]*300' "${CONF_DIR}/hecate.yaml"; then
sed -Ei 's/(default_budget_seconds:[[:space:]]*)300/\11380/' "${CONF_DIR}/hecate.yaml"
echo "[install] migrated default_budget_seconds 300 -> 1380 in ${CONF_DIR}/hecate.yaml"
@ -226,6 +228,25 @@ migrate_hecate_config() {
echo "[install] added coordination.startup_guard_max_age_seconds=900"
changed=1
fi
if ! grep -Eq '^ peer_hosts:' "${CONF_DIR}/hecate.yaml"; then
if [[ "${role_hint}" == "peer" ]] && grep -Eq '^ forward_shutdown_host:[[:space:]]*[A-Za-z0-9._-]+' "${CONF_DIR}/hecate.yaml"; then
local peer_host
peer_host="$(awk -F': *' '/^ forward_shutdown_host:[[:space:]]*/ {print $2; exit}' "${CONF_DIR}/hecate.yaml" 2>/dev/null || true)"
if [[ -n "${peer_host}" ]]; then
sed -Ei '/^ forward_shutdown_config:[[:space:]]*.*$/a\ peer_hosts:\n - '"${peer_host}"'' "${CONF_DIR}/hecate.yaml"
echo "[install] added coordination.peer_hosts from forward_shutdown_host (${peer_host})"
changed=1
fi
elif [[ "${role_hint}" == "coordinator" ]] && grep -Eq '^ titan-24:[[:space:]]*[0-9]+\.[0-9]+\.[0-9]+\.[0-9]+' "${CONF_DIR}/hecate.yaml"; then
sed -Ei '/^ forward_shutdown_config:[[:space:]]*.*$/a\ peer_hosts:\n - titan-24' "${CONF_DIR}/hecate.yaml"
echo "[install] added coordination.peer_hosts default (titan-24) for coordinator role"
changed=1
else
sed -Ei '/^ forward_shutdown_config:[[:space:]]*.*$/a\ peer_hosts: []' "${CONF_DIR}/hecate.yaml"
echo "[install] added coordination.peer_hosts empty default"
changed=1
fi
fi
local default_restore_cp
default_restore_cp="$(first_control_plane_name)"
if [[ -z "${default_restore_cp}" ]]; then