harden startup guards and etcd restore validation
parent 437a6b62cd
commit 1935c5eb3f
@ -26,6 +26,8 @@ Key startup guards:
- `--auto-peer-failover` makes peer hosts hand off startup to the coordinator first, then run local startup only if the coordinator is unreachable.
- Startup is blocked while the UPS is on battery by default, unless `--allow-on-battery` or `coordination.allow_startup_on_battery: true` is set (see the sketch after this list).
- Startup is blocked when a shutdown intent is active (`/var/lib/hecate/intent.json`).
- Stale shutdown intents are auto-cleared after `coordination.startup_guard_max_age_seconds`, so residue from an old outage cannot permanently deadlock startup.
- Startup checks the intents of the configured `coordination.peer_hosts` to avoid peer/coordinator split-brain startup races.
- Startup waits for time sync in `strict` or `quorum` mode (`startup.time_sync_mode`, `startup.time_sync_quorum`).
- Startup can block until storage is healthy (`startup.require_storage_ready` + critical PVC checks).
- Startup can block until external probes pass (`startup.require_post_start_probes` + `startup.post_start_probes`).
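A minimal sketch of the on-battery precedence described above: either the flag or the config key lifts the block, and both default to off. `guardBattery` and its boolean wiring are illustrative assumptions, not the real implementation.

```go
package main

import (
	"errors"
	"fmt"
)

// guardBattery blocks startup while on battery unless an override is set.
// flagAllow stands in for --allow-on-battery; cfgAllow stands in for
// coordination.allow_startup_on_battery.
func guardBattery(onBattery, flagAllow, cfgAllow bool) error {
	if onBattery && !flagAllow && !cfgAllow {
		return errors.New("startup blocked: UPS is on battery")
	}
	return nil
}

func main() {
	fmt.Println(guardBattery(true, false, false)) // blocked
	fmt.Println(guardBattery(true, true, false))  // allowed via flag override
}
```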
@ -117,6 +119,7 @@ Power metrics:
- Default behavior for `startup` and `shutdown` is dry-run unless `--execute` is set.
- Hecate tracks intent in `/var/lib/hecate/intent.json` (`normal`, `startup_in_progress`, `shutting_down`, `shutdown_complete`) so that startup and shutdown do not fight each other (see the sketch after this list).
- In multi-instance setups, set `coordination.peer_hosts` on each host (for example `titan-db` <-> `titan-24`) so startup guards account for remote intent too.
- `hecate-bootstrap.service` is enabled to run at host boot and perform staged startup automatically.
- `HECATE_ENABLE_BOOTSTRAP=1` forces bootstrap on, `HECATE_ENABLE_BOOTSTRAP=0` forces it off, and `auto` enables it by default.
- `hecate-update.timer` runs at boot and periodically thereafter to pull the latest `main` and reinstall Hecate declaratively.
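A minimal sketch of the intent file's shape, using the four states listed above. The struct mirrors the fields the orchestrator code below reads (`State`, `Reason`, `UpdatedAt`), but the JSON field names are assumptions; the real definitions live in the `state` package.

```go
package main

import (
	"encoding/json"
	"fmt"
	"time"
)

// The four intent states named above.
const (
	IntentNormal            = "normal"
	IntentStartupInProgress = "startup_in_progress"
	IntentShuttingDown      = "shutting_down"
	IntentShutdownComplete  = "shutdown_complete"
)

// Intent is a local stand-in for state.Intent; JSON tags are assumed.
type Intent struct {
	State     string    `json:"state"`
	Reason    string    `json:"reason"`
	UpdatedAt time.Time `json:"updated_at"`
}

func main() {
	b, _ := json.MarshalIndent(Intent{
		State:     IntentShuttingDown,
		Reason:    "ups on battery",
		UpdatedAt: time.Now(),
	}, "", "  ")
	fmt.Println(string(b)) // roughly what /var/lib/hecate/intent.json holds
}
```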
@ -131,6 +134,7 @@ Power metrics:
- `startup.auto_etcd_restore_on_api_failure: true`
- `startup.etcd_restore_control_plane: <control-plane-node>`
- If the control planes are configured with `--datastore-endpoint` (an external datastore), Hecate skips etcd restore and retries control-plane startup instead (see the sketch below).
- Etcd restore now verifies snapshot existence, minimum size, presence in the snapshot listing, and SHA-256 before the reset starts.
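A hedged sketch of how these knobs might gate auto-restore. `EtcdRestoreControlPlane` appears in the config tests below; the `autoRestore` parameter (standing in for `startup.auto_etcd_restore_on_api_failure`) and the overall decision shape are assumptions, not the real control flow.

```go
package main

import "fmt"

// shouldAutoRestore sketches the gating described above: restore only when
// the API is down, etcd is actually the datastore, auto-restore is enabled,
// and a restore target control plane is configured.
func shouldAutoRestore(apiHealthy, externalDatastore, autoRestore bool, controlPlane string) bool {
	if apiHealthy || externalDatastore {
		return false // healthy API, or --datastore-endpoint in use: retry startup instead
	}
	return autoRestore && controlPlane != ""
}

func main() {
	fmt.Println(shouldAutoRestore(false, false, true, "titan-db")) // true: restore path
	fmt.Println(shouldAutoRestore(false, true, true, "titan-db"))  // false: external DB, retry instead
}
```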
## Disruptive startup drills
@ -104,6 +104,7 @@ coordination:
  forward_shutdown_host: ""
  forward_shutdown_user: atlas
  forward_shutdown_config: /etc/hecate/hecate.yaml
  peer_hosts: []
  fallback_local_shutdown: true
  command_timeout_seconds: 25
  startup_guard_max_age_seconds: 900
@ -168,6 +168,8 @@ coordination:
  forward_shutdown_host: titan-db
  forward_shutdown_user: atlas
  forward_shutdown_config: /etc/hecate/hecate.yaml
  peer_hosts:
    - titan-db
  fallback_local_shutdown: false
  command_timeout_seconds: 25
  startup_guard_max_age_seconds: 900
@ -169,6 +169,8 @@ coordination:
  forward_shutdown_host: ""
  forward_shutdown_user: atlas
  forward_shutdown_config: /etc/hecate/hecate.yaml
  peer_hosts:
    - titan-24
  fallback_local_shutdown: true
  command_timeout_seconds: 25
  startup_guard_max_age_seconds: 900
@ -116,8 +116,22 @@ func (o *Orchestrator) Startup(ctx context.Context, opts StartupOptions) (err er
		currentIntent = state.Intent{State: state.IntentNormal}
	}
	if currentIntent.State == state.IntentShuttingDown {
		if intentFresh(currentIntent, o.startupGuardAge()) {
			return fmt.Errorf("startup blocked: shutdown intent is active (%s)", currentIntent.Reason)
		}
		o.log.Printf("warning: local shutdown intent appears stale (updated_at=%s reason=%q); auto-clearing to continue startup",
			currentIntent.UpdatedAt.Format(time.RFC3339), currentIntent.Reason)
		if clearErr := state.MustWriteIntent(o.cfg.State.IntentPath, state.IntentNormal, "auto-clear stale shutdown intent", "startup"); clearErr != nil {
			return fmt.Errorf("clear stale shutdown intent: %w", clearErr)
		}
		currentIntent = state.Intent{State: state.IntentNormal}
	}
	if currentIntent.State == state.IntentShutdownComplete && intentFresh(currentIntent, 45*time.Second) {
		return fmt.Errorf("startup blocked: shutdown completed too recently (%s ago)", intentAge(currentIntent).Round(time.Second))
	}
	if err := o.guardPeerStartupIntents(ctx); err != nil {
		return err
	}
	if writeErr := state.MustWriteIntent(o.cfg.State.IntentPath, state.IntentStartupInProgress, opts.Reason, "startup"); writeErr != nil {
		return fmt.Errorf("set startup intent: %w", writeErr)
	}
@ -312,6 +326,9 @@ func (o *Orchestrator) EtcdRestore(ctx context.Context, opts EtcdRestoreOptions)
		}
		snapshotPath = resolved
	}
	if err := o.verifyEtcdSnapshot(ctx, controlPlane, snapshotPath); err != nil {
		return err
	}
	o.log.Printf("etcd restore target=%s snapshot=%s", controlPlane, snapshotPath)

	for _, cp := range o.cfg.ControlPlanes {
@ -920,6 +937,151 @@ func parseSnapshotPathFromEtcdSnapshotList(out string) string {
	return ""
}

func intentAge(in state.Intent) time.Duration {
	if in.UpdatedAt.IsZero() {
		return 0
	}
	return time.Since(in.UpdatedAt)
}

func intentFresh(in state.Intent, maxAge time.Duration) bool {
	if in.UpdatedAt.IsZero() {
		return true
	}
	return intentAge(in) <= maxAge
}

func (o *Orchestrator) startupGuardAge() time.Duration {
	seconds := o.cfg.Coordination.StartupGuardMaxAgeSec
	if seconds <= 0 {
		seconds = 900
	}
	return time.Duration(seconds) * time.Second
}
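Worth noting: a zero `UpdatedAt` makes `intentFresh` return true, so an intent with no timestamp keeps blocking rather than being treated as stale — fail safe. A self-contained demo of that edge case, using local stand-ins for `state.Intent` and the helpers above:

```go
package main

import (
	"fmt"
	"time"
)

// Intent is a local stand-in for state.Intent.
type Intent struct{ UpdatedAt time.Time }

func intentAge(in Intent) time.Duration {
	if in.UpdatedAt.IsZero() {
		return 0
	}
	return time.Since(in.UpdatedAt)
}

func intentFresh(in Intent, maxAge time.Duration) bool {
	if in.UpdatedAt.IsZero() {
		return true // no timestamp: treat as fresh, i.e. fail safe and keep blocking
	}
	return intentAge(in) <= maxAge
}

func main() {
	guard := 15 * time.Minute // mirrors startup_guard_max_age_seconds: 900
	fmt.Println(intentFresh(Intent{}, guard))                                             // true: zero time still blocks
	fmt.Println(intentFresh(Intent{UpdatedAt: time.Now().Add(-16 * time.Minute)}, guard)) // false: stale, auto-cleared
}
```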
func (o *Orchestrator) coordinationPeers() []string {
	seen := map[string]struct{}{}
	out := make([]string, 0, len(o.cfg.Coordination.PeerHosts)+1)
	add := func(node string) {
		node = strings.TrimSpace(node)
		if node == "" {
			return
		}
		if _, ok := seen[node]; ok {
			return
		}
		seen[node] = struct{}{}
		out = append(out, node)
	}
	for _, node := range o.cfg.Coordination.PeerHosts {
		add(node)
	}
	if strings.TrimSpace(o.cfg.Coordination.ForwardShutdownHost) != "" {
		add(o.cfg.Coordination.ForwardShutdownHost)
	}
	return out
}
func (o *Orchestrator) guardPeerStartupIntents(ctx context.Context) error {
	peers := o.coordinationPeers()
	if len(peers) == 0 {
		return nil
	}
	guardAge := o.startupGuardAge()
	for _, peer := range peers {
		intent, err := o.readRemoteIntent(ctx, peer)
		if err != nil {
			o.log.Printf("warning: peer startup guard skipped intent check for %s: %v", peer, err)
			continue
		}
		switch intent.State {
		case "", state.IntentNormal:
			continue
		case state.IntentShuttingDown:
			if intentFresh(intent, guardAge) {
				return fmt.Errorf("startup blocked: peer %s has active shutdown intent (reason=%q age=%s)", peer, intent.Reason, intentAge(intent).Round(time.Second))
			}
			o.log.Printf("warning: peer %s shutdown intent appears stale; allowing startup", peer)
		case state.IntentStartupInProgress:
			if intentFresh(intent, guardAge) {
				return fmt.Errorf("startup blocked: peer %s reports startup_in_progress (reason=%q age=%s)", peer, intent.Reason, intentAge(intent).Round(time.Second))
			}
			o.log.Printf("warning: peer %s startup intent appears stale; allowing startup", peer)
		case state.IntentShutdownComplete:
			if intentFresh(intent, 45*time.Second) {
				return fmt.Errorf("startup blocked: peer %s completed shutdown too recently (age=%s)", peer, intentAge(intent).Round(time.Second))
			}
		default:
			o.log.Printf("warning: peer %s intent state %q is unknown; ignoring", peer, intent.State)
		}
	}
	return nil
}
func (o *Orchestrator) readRemoteIntent(ctx context.Context, node string) (state.Intent, error) {
	if !o.sshManaged(node) {
		return state.Intent{}, fmt.Errorf("%s is not in ssh_managed_nodes", node)
	}
	out, err := o.ssh(ctx, node, "sudo -n sh -lc 'if [ -s /var/lib/hecate/intent.json ]; then cat /var/lib/hecate/intent.json; else echo \"{}\"; fi'")
	if err != nil {
		return state.Intent{}, err
	}
	start := strings.Index(out, "{")
	end := strings.LastIndex(out, "}")
	if start < 0 || end < start {
		return state.Intent{}, fmt.Errorf("remote intent payload missing json object")
	}
	var in state.Intent
	if err := json.Unmarshal([]byte(out[start:end+1]), &in); err != nil {
		return state.Intent{}, fmt.Errorf("decode remote intent json: %w", err)
	}
	return in, nil
}
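The `strings.Index`/`strings.LastIndex` bracketing above tolerates SSH banner or motd noise around the JSON payload. A self-contained demo of that extraction (the `Intent` stand-in and its JSON field names are assumptions):

```go
package main

import (
	"encoding/json"
	"fmt"
	"strings"
)

// Intent is a local stand-in for state.Intent; JSON tags are assumed.
type Intent struct {
	State  string `json:"state"`
	Reason string `json:"reason"`
}

func main() {
	// Simulated remote output: banner noise wrapped around the intent JSON.
	out := "Warning: scheduled maintenance\n{\"state\":\"shutting_down\",\"reason\":\"ups on battery\"}\nConnection closed.\n"
	start := strings.Index(out, "{")
	end := strings.LastIndex(out, "}")
	if start < 0 || end < start {
		panic("remote intent payload missing json object")
	}
	var in Intent
	if err := json.Unmarshal([]byte(out[start:end+1]), &in); err != nil {
		panic(err)
	}
	fmt.Printf("state=%s reason=%q\n", in.State, in.Reason)
}
```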
func shellQuote(v string) string {
	return "'" + strings.ReplaceAll(v, "'", `'"'"'`) + "'"
}
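`shellQuote` uses the classic POSIX close-quote/double-quote/reopen idiom for embedded single quotes; for example:

```go
package main

import (
	"fmt"
	"strings"
)

func shellQuote(v string) string {
	return "'" + strings.ReplaceAll(v, "'", `'"'"'`) + "'"
}

func main() {
	// An embedded ' becomes '"'"': close the single-quoted string, emit a '
	// inside double quotes, then reopen the single-quoted string.
	fmt.Println(shellQuote(`/snapshots/on-demand'test.db`))
	// Output: '/snapshots/on-demand'"'"'test.db'
}
```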
func (o *Orchestrator) verifyEtcdSnapshot(ctx context.Context, node string, snapshotPath string) error {
	if o.runner.DryRun {
		return nil
	}
	path := strings.TrimSpace(snapshotPath)
	if path == "" {
		return fmt.Errorf("etcd snapshot verification failed: snapshot path is empty")
	}
	quoted := shellQuote(path)
	sizeOut, err := o.ssh(ctx, node, fmt.Sprintf("sudo -n sh -lc 'test -s %s && stat -c %%s %s'", quoted, quoted))
	if err != nil {
		return fmt.Errorf("etcd snapshot verification failed for %s on %s: %w", path, node, err)
	}
	size, convErr := strconv.ParseInt(strings.TrimSpace(sizeOut), 10, 64)
	if convErr != nil {
		return fmt.Errorf("etcd snapshot verification failed for %s on %s: parse size %q: %w", path, node, strings.TrimSpace(sizeOut), convErr)
	}
	const minSnapshotBytes = int64(1 << 20) // 1 MiB sanity floor.
	if size < minSnapshotBytes {
		return fmt.Errorf("etcd snapshot verification failed for %s on %s: snapshot too small (%d bytes)", path, node, size)
	}
	lsOut, err := o.runSudoK3S(ctx, node, "etcd-snapshot", "ls")
	if err != nil {
		return fmt.Errorf("etcd snapshot verification failed for %s on %s: list snapshots: %w", path, node, err)
	}
	if !strings.Contains(lsOut, path) && !strings.Contains(lsOut, filepath.Base(path)) {
		return fmt.Errorf("etcd snapshot verification failed for %s on %s: snapshot is not present in k3s etcd-snapshot ls output", path, node)
	}
	sumOut, err := o.ssh(ctx, node, fmt.Sprintf("sudo -n sh -lc 'sha256sum %s | awk \"{print \\$1}\"'", quoted))
	if err != nil {
		return fmt.Errorf("etcd snapshot verification failed for %s on %s: sha256: %w", path, node, err)
	}
	hash := strings.TrimSpace(sumOut)
	if len(hash) != 64 {
		return fmt.Errorf("etcd snapshot verification failed for %s on %s: invalid sha256 %q", path, node, hash)
	}
	o.log.Printf("etcd snapshot verified path=%s size_bytes=%d sha256=%s", path, size, hash[:12])
	return nil
}
func (o *Orchestrator) runSudoK3S(ctx context.Context, node string, args ...string) (string, error) {
	k3sPaths := []string{
		"/usr/local/bin/k3s",
@ -96,6 +96,7 @@ type Coordination struct {
	ForwardShutdownHost   string   `yaml:"forward_shutdown_host"`
	ForwardShutdownUser   string   `yaml:"forward_shutdown_user"`
	ForwardShutdownConfig string   `yaml:"forward_shutdown_config"`
	PeerHosts             []string `yaml:"peer_hosts"`
	FallbackLocalShutdown bool     `yaml:"fallback_local_shutdown"`
	CommandTimeoutSeconds int      `yaml:"command_timeout_seconds"`
	StartupGuardMaxAgeSec int      `yaml:"startup_guard_max_age_seconds"`
@ -247,6 +248,11 @@ func (c Config) Validate() error {
			return fmt.Errorf("config.coordination.forward_shutdown_config must not be empty when forward_shutdown_host is set")
		}
	}
	for _, peer := range c.Coordination.PeerHosts {
		if strings.TrimSpace(peer) == "" {
			return fmt.Errorf("config.coordination.peer_hosts entries must not be empty")
		}
	}
	if c.Coordination.StartupGuardMaxAgeSec <= 0 {
		return fmt.Errorf("config.coordination.startup_guard_max_age_seconds must be > 0")
	}
@ -352,6 +358,7 @@ func defaults() Config {
		},
		Coordination: Coordination{
			ForwardShutdownConfig: "/etc/hecate/hecate.yaml",
			PeerHosts:             []string{},
			FallbackLocalShutdown: true,
			CommandTimeoutSeconds: 25,
			StartupGuardMaxAgeSec: 900,
@ -483,6 +490,9 @@ func (c *Config) applyDefaults() {
	if c.Coordination.ForwardShutdownConfig == "" {
		c.Coordination.ForwardShutdownConfig = "/etc/hecate/hecate.yaml"
	}
	if c.Coordination.PeerHosts == nil {
		c.Coordination.PeerHosts = []string{}
	}
	if c.Coordination.CommandTimeoutSeconds <= 0 {
		c.Coordination.CommandTimeoutSeconds = 25
	}
@ -56,6 +56,14 @@ func TestValidateRejectsUnknownRole(t *testing.T) {
	}
}

func TestValidateRejectsEmptyPeerHostEntry(t *testing.T) {
	cfg := defaults()
	cfg.Coordination.PeerHosts = []string{"titan-24", " "}
	if err := cfg.Validate(); err == nil {
		t.Fatalf("expected validation error for empty peer_hosts entry")
	}
}

func TestValidateRejectsUnknownEtcdRestoreControlPlane(t *testing.T) {
	cfg := defaults()
	cfg.Startup.EtcdRestoreControlPlane = "titan-missing"
@ -204,6 +204,8 @@ migrate_hecate_config() {
  fi

  local changed=0
  local role_hint
  role_hint="$(read_hecate_role)"
  if grep -Eq 'default_budget_seconds:[[:space:]]*300' "${CONF_DIR}/hecate.yaml"; then
    sed -Ei 's/(default_budget_seconds:[[:space:]]*)300/\11380/' "${CONF_DIR}/hecate.yaml"
    echo "[install] migrated default_budget_seconds 300 -> 1380 in ${CONF_DIR}/hecate.yaml"
@ -226,6 +228,25 @@ migrate_hecate_config() {
    echo "[install] added coordination.startup_guard_max_age_seconds=900"
    changed=1
  fi
  if ! grep -Eq '^ peer_hosts:' "${CONF_DIR}/hecate.yaml"; then
    if [[ "${role_hint}" == "peer" ]] && grep -Eq '^ forward_shutdown_host:[[:space:]]*[A-Za-z0-9._-]+' "${CONF_DIR}/hecate.yaml"; then
      local peer_host
      peer_host="$(awk -F': *' '/^ forward_shutdown_host:[[:space:]]*/ {print $2; exit}' "${CONF_DIR}/hecate.yaml" 2>/dev/null || true)"
      if [[ -n "${peer_host}" ]]; then
        sed -Ei '/^ forward_shutdown_config:[[:space:]]*.*$/a\ peer_hosts:\n - '"${peer_host}"'' "${CONF_DIR}/hecate.yaml"
        echo "[install] added coordination.peer_hosts from forward_shutdown_host (${peer_host})"
        changed=1
      fi
    elif [[ "${role_hint}" == "coordinator" ]] && grep -Eq '^ titan-24:[[:space:]]*[0-9]+\.[0-9]+\.[0-9]+\.[0-9]+' "${CONF_DIR}/hecate.yaml"; then
      sed -Ei '/^ forward_shutdown_config:[[:space:]]*.*$/a\ peer_hosts:\n - titan-24' "${CONF_DIR}/hecate.yaml"
      echo "[install] added coordination.peer_hosts default (titan-24) for coordinator role"
      changed=1
    else
      sed -Ei '/^ forward_shutdown_config:[[:space:]]*.*$/a\ peer_hosts: []' "${CONF_DIR}/hecate.yaml"
      echo "[install] added coordination.peer_hosts empty default"
      changed=1
    fi
  fi
  local default_restore_cp
  default_restore_cp="$(first_control_plane_name)"
  if [[ -z "${default_restore_cp}" ]]; then