harden startup guards and etcd restore validation
This commit is contained in:
parent
437a6b62cd
commit
1935c5eb3f
@ -26,6 +26,8 @@ Key startup guards:
|
|||||||
- `--auto-peer-failover` makes peer hosts hand off startup to the coordinator first, then run local startup only if the coordinator is unreachable.
|
- `--auto-peer-failover` makes peer hosts hand off startup to the coordinator first, then run local startup only if the coordinator is unreachable.
|
||||||
- Startup is blocked while UPS is on battery by default (unless `--allow-on-battery` or `coordination.allow_startup_on_battery: true` is set).
|
- Startup is blocked while UPS is on battery by default (unless `--allow-on-battery` or `coordination.allow_startup_on_battery: true` is set).
|
||||||
- Startup is blocked when a shutdown intent is active (`/var/lib/hecate/intent.json`).
|
- Startup is blocked when a shutdown intent is active (`/var/lib/hecate/intent.json`).
|
||||||
|
- Stale shutdown intents are auto-cleared after `coordination.startup_guard_max_age_seconds`, so old outage residue cannot permanently deadlock startup.
|
||||||
|
- Startup checks configured `coordination.peer_hosts` intents to avoid peer/coordinator split-brain startup races.
|
||||||
- Startup waits for time sync in `strict` or `quorum` mode (`startup.time_sync_mode`, `startup.time_sync_quorum`).
|
- Startup waits for time sync in `strict` or `quorum` mode (`startup.time_sync_mode`, `startup.time_sync_quorum`).
|
||||||
- Startup can block until storage is healthy (`startup.require_storage_ready` + critical PVC checks).
|
- Startup can block until storage is healthy (`startup.require_storage_ready` + critical PVC checks).
|
||||||
- Startup can block until external probes pass (`startup.require_post_start_probes` + `startup.post_start_probes`).
|
- Startup can block until external probes pass (`startup.require_post_start_probes` + `startup.post_start_probes`).
|
||||||
@ -117,6 +119,7 @@ Power metrics:
|
|||||||
|
|
||||||
- Default behavior for `startup` and `shutdown` is dry-run unless `--execute` is set.
|
- Default behavior for `startup` and `shutdown` is dry-run unless `--execute` is set.
|
||||||
- Hecate tracks intent in `/var/lib/hecate/intent.json` (`normal`, `startup_in_progress`, `shutting_down`, `shutdown_complete`) to avoid startup/shutdown fighting each other.
|
- Hecate tracks intent in `/var/lib/hecate/intent.json` (`normal`, `startup_in_progress`, `shutting_down`, `shutdown_complete`) to avoid startup/shutdown fighting each other.
|
||||||
|
- In multi-instance setups, set `coordination.peer_hosts` on each host (for example `titan-db` <-> `titan-24`) so startup guards account for remote intent too.
|
||||||
- `hecate-bootstrap.service` is enabled to run at host boot and perform staged startup automatically.
|
- `hecate-bootstrap.service` is enabled to run at host boot and perform staged startup automatically.
|
||||||
- `HECATE_ENABLE_BOOTSTRAP=1` forces bootstrap on, `HECATE_ENABLE_BOOTSTRAP=0` forces it off, and `auto` enables by default.
|
- `HECATE_ENABLE_BOOTSTRAP=1` forces bootstrap on, `HECATE_ENABLE_BOOTSTRAP=0` forces it off, and `auto` enables by default.
|
||||||
- `hecate-update.timer` runs on boot and periodically to pull latest `main` and reinstall Hecate declaratively.
|
- `hecate-update.timer` runs on boot and periodically to pull latest `main` and reinstall Hecate declaratively.
|
||||||
@ -131,6 +134,7 @@ Power metrics:
|
|||||||
- `startup.auto_etcd_restore_on_api_failure: true`
|
- `startup.auto_etcd_restore_on_api_failure: true`
|
||||||
- `startup.etcd_restore_control_plane: <control-plane-node>`
|
- `startup.etcd_restore_control_plane: <control-plane-node>`
|
||||||
- If control planes are configured with `--datastore-endpoint` (external DB), Hecate will skip etcd restore and retry control-plane startup instead.
|
- If control planes are configured with `--datastore-endpoint` (external DB), Hecate will skip etcd restore and retry control-plane startup instead.
|
||||||
|
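- Before any reset starts, etcd restore checks that the snapshot file exists, is at least 1 MiB, and appears in the `k3s etcd-snapshot ls` output; it also computes and logs the snapshot's SHA-256 (the digest is recorded, not compared against a stored value).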
|
||||||
|
|
||||||
## Disruptive startup drills
|
## Disruptive startup drills
|
||||||
|
|
||||||
|
|||||||
@ -104,6 +104,7 @@ coordination:
|
|||||||
forward_shutdown_host: ""
|
forward_shutdown_host: ""
|
||||||
forward_shutdown_user: atlas
|
forward_shutdown_user: atlas
|
||||||
forward_shutdown_config: /etc/hecate/hecate.yaml
|
forward_shutdown_config: /etc/hecate/hecate.yaml
|
||||||
|
peer_hosts: []
|
||||||
fallback_local_shutdown: true
|
fallback_local_shutdown: true
|
||||||
command_timeout_seconds: 25
|
command_timeout_seconds: 25
|
||||||
startup_guard_max_age_seconds: 900
|
startup_guard_max_age_seconds: 900
|
||||||
|
|||||||
@ -168,6 +168,8 @@ coordination:
|
|||||||
forward_shutdown_host: titan-db
|
forward_shutdown_host: titan-db
|
||||||
forward_shutdown_user: atlas
|
forward_shutdown_user: atlas
|
||||||
forward_shutdown_config: /etc/hecate/hecate.yaml
|
forward_shutdown_config: /etc/hecate/hecate.yaml
|
||||||
|
peer_hosts:
|
||||||
|
- titan-db
|
||||||
fallback_local_shutdown: false
|
fallback_local_shutdown: false
|
||||||
command_timeout_seconds: 25
|
command_timeout_seconds: 25
|
||||||
startup_guard_max_age_seconds: 900
|
startup_guard_max_age_seconds: 900
|
||||||
|
|||||||
@ -169,6 +169,8 @@ coordination:
|
|||||||
forward_shutdown_host: ""
|
forward_shutdown_host: ""
|
||||||
forward_shutdown_user: atlas
|
forward_shutdown_user: atlas
|
||||||
forward_shutdown_config: /etc/hecate/hecate.yaml
|
forward_shutdown_config: /etc/hecate/hecate.yaml
|
||||||
|
peer_hosts:
|
||||||
|
- titan-24
|
||||||
fallback_local_shutdown: true
|
fallback_local_shutdown: true
|
||||||
command_timeout_seconds: 25
|
command_timeout_seconds: 25
|
||||||
startup_guard_max_age_seconds: 900
|
startup_guard_max_age_seconds: 900
|
||||||
|
|||||||
@ -116,7 +116,21 @@ func (o *Orchestrator) Startup(ctx context.Context, opts StartupOptions) (err er
|
|||||||
currentIntent = state.Intent{State: state.IntentNormal}
|
currentIntent = state.Intent{State: state.IntentNormal}
|
||||||
}
|
}
|
||||||
if currentIntent.State == state.IntentShuttingDown {
|
if currentIntent.State == state.IntentShuttingDown {
|
||||||
return fmt.Errorf("startup blocked: shutdown intent is active (%s)", currentIntent.Reason)
|
if intentFresh(currentIntent, o.startupGuardAge()) {
|
||||||
|
return fmt.Errorf("startup blocked: shutdown intent is active (%s)", currentIntent.Reason)
|
||||||
|
}
|
||||||
|
o.log.Printf("warning: local shutdown intent appears stale (updated_at=%s reason=%q); auto-clearing to continue startup",
|
||||||
|
currentIntent.UpdatedAt.Format(time.RFC3339), currentIntent.Reason)
|
||||||
|
if clearErr := state.MustWriteIntent(o.cfg.State.IntentPath, state.IntentNormal, "auto-clear stale shutdown intent", "startup"); clearErr != nil {
|
||||||
|
return fmt.Errorf("clear stale shutdown intent: %w", clearErr)
|
||||||
|
}
|
||||||
|
currentIntent = state.Intent{State: state.IntentNormal}
|
||||||
|
}
|
||||||
|
if currentIntent.State == state.IntentShutdownComplete && intentFresh(currentIntent, 45*time.Second) {
|
||||||
|
return fmt.Errorf("startup blocked: shutdown completed too recently (%s ago)", intentAge(currentIntent).Round(time.Second))
|
||||||
|
}
|
||||||
|
if err := o.guardPeerStartupIntents(ctx); err != nil {
|
||||||
|
return err
|
||||||
}
|
}
|
||||||
if writeErr := state.MustWriteIntent(o.cfg.State.IntentPath, state.IntentStartupInProgress, opts.Reason, "startup"); writeErr != nil {
|
if writeErr := state.MustWriteIntent(o.cfg.State.IntentPath, state.IntentStartupInProgress, opts.Reason, "startup"); writeErr != nil {
|
||||||
return fmt.Errorf("set startup intent: %w", writeErr)
|
return fmt.Errorf("set startup intent: %w", writeErr)
|
||||||
@ -312,6 +326,9 @@ func (o *Orchestrator) EtcdRestore(ctx context.Context, opts EtcdRestoreOptions)
|
|||||||
}
|
}
|
||||||
snapshotPath = resolved
|
snapshotPath = resolved
|
||||||
}
|
}
|
||||||
|
if err := o.verifyEtcdSnapshot(ctx, controlPlane, snapshotPath); err != nil {
|
||||||
|
return err
|
||||||
|
}
|
||||||
o.log.Printf("etcd restore target=%s snapshot=%s", controlPlane, snapshotPath)
|
o.log.Printf("etcd restore target=%s snapshot=%s", controlPlane, snapshotPath)
|
||||||
|
|
||||||
for _, cp := range o.cfg.ControlPlanes {
|
for _, cp := range o.cfg.ControlPlanes {
|
||||||
@ -920,6 +937,151 @@ func parseSnapshotPathFromEtcdSnapshotList(out string) string {
|
|||||||
return ""
|
return ""
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func intentAge(in state.Intent) time.Duration {
|
||||||
|
if in.UpdatedAt.IsZero() {
|
||||||
|
return 0
|
||||||
|
}
|
||||||
|
return time.Since(in.UpdatedAt)
|
||||||
|
}
|
||||||
|
|
||||||
|
func intentFresh(in state.Intent, maxAge time.Duration) bool {
|
||||||
|
if in.UpdatedAt.IsZero() {
|
||||||
|
return true
|
||||||
|
}
|
||||||
|
return intentAge(in) <= maxAge
|
||||||
|
}
|
||||||
|
|
||||||
|
func (o *Orchestrator) startupGuardAge() time.Duration {
|
||||||
|
seconds := o.cfg.Coordination.StartupGuardMaxAgeSec
|
||||||
|
if seconds <= 0 {
|
||||||
|
seconds = 900
|
||||||
|
}
|
||||||
|
return time.Duration(seconds) * time.Second
|
||||||
|
}
|
||||||
|
|
||||||
|
func (o *Orchestrator) coordinationPeers() []string {
|
||||||
|
seen := map[string]struct{}{}
|
||||||
|
out := make([]string, 0, len(o.cfg.Coordination.PeerHosts)+1)
|
||||||
|
add := func(node string) {
|
||||||
|
node = strings.TrimSpace(node)
|
||||||
|
if node == "" {
|
||||||
|
return
|
||||||
|
}
|
||||||
|
if _, ok := seen[node]; ok {
|
||||||
|
return
|
||||||
|
}
|
||||||
|
seen[node] = struct{}{}
|
||||||
|
out = append(out, node)
|
||||||
|
}
|
||||||
|
for _, node := range o.cfg.Coordination.PeerHosts {
|
||||||
|
add(node)
|
||||||
|
}
|
||||||
|
if strings.TrimSpace(o.cfg.Coordination.ForwardShutdownHost) != "" {
|
||||||
|
add(o.cfg.Coordination.ForwardShutdownHost)
|
||||||
|
}
|
||||||
|
return out
|
||||||
|
}
|
||||||
|
|
||||||
|
func (o *Orchestrator) guardPeerStartupIntents(ctx context.Context) error {
|
||||||
|
peers := o.coordinationPeers()
|
||||||
|
if len(peers) == 0 {
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
guardAge := o.startupGuardAge()
|
||||||
|
for _, peer := range peers {
|
||||||
|
intent, err := o.readRemoteIntent(ctx, peer)
|
||||||
|
if err != nil {
|
||||||
|
o.log.Printf("warning: peer startup guard skipped intent check for %s: %v", peer, err)
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
switch intent.State {
|
||||||
|
case "", state.IntentNormal:
|
||||||
|
continue
|
||||||
|
case state.IntentShuttingDown:
|
||||||
|
if intentFresh(intent, guardAge) {
|
||||||
|
return fmt.Errorf("startup blocked: peer %s has active shutdown intent (reason=%q age=%s)", peer, intent.Reason, intentAge(intent).Round(time.Second))
|
||||||
|
}
|
||||||
|
o.log.Printf("warning: peer %s shutdown intent appears stale; allowing startup", peer)
|
||||||
|
case state.IntentStartupInProgress:
|
||||||
|
if intentFresh(intent, guardAge) {
|
||||||
|
return fmt.Errorf("startup blocked: peer %s reports startup_in_progress (reason=%q age=%s)", peer, intent.Reason, intentAge(intent).Round(time.Second))
|
||||||
|
}
|
||||||
|
o.log.Printf("warning: peer %s startup intent appears stale; allowing startup", peer)
|
||||||
|
case state.IntentShutdownComplete:
|
||||||
|
if intentFresh(intent, 45*time.Second) {
|
||||||
|
return fmt.Errorf("startup blocked: peer %s completed shutdown too recently (age=%s)", peer, intentAge(intent).Round(time.Second))
|
||||||
|
}
|
||||||
|
default:
|
||||||
|
o.log.Printf("warning: peer %s intent state %q is unknown; ignoring", peer, intent.State)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
|
||||||
|
func (o *Orchestrator) readRemoteIntent(ctx context.Context, node string) (state.Intent, error) {
|
||||||
|
if !o.sshManaged(node) {
|
||||||
|
return state.Intent{}, fmt.Errorf("%s is not in ssh_managed_nodes", node)
|
||||||
|
}
|
||||||
|
out, err := o.ssh(ctx, node, "sudo -n sh -lc 'if [ -s /var/lib/hecate/intent.json ]; then cat /var/lib/hecate/intent.json; else echo \"{}\"; fi'")
|
||||||
|
if err != nil {
|
||||||
|
return state.Intent{}, err
|
||||||
|
}
|
||||||
|
start := strings.Index(out, "{")
|
||||||
|
end := strings.LastIndex(out, "}")
|
||||||
|
if start < 0 || end < start {
|
||||||
|
return state.Intent{}, fmt.Errorf("remote intent payload missing json object")
|
||||||
|
}
|
||||||
|
var in state.Intent
|
||||||
|
if err := json.Unmarshal([]byte(out[start:end+1]), &in); err != nil {
|
||||||
|
return state.Intent{}, fmt.Errorf("decode remote intent json: %w", err)
|
||||||
|
}
|
||||||
|
return in, nil
|
||||||
|
}
|
||||||
|
|
||||||
|
// shellQuote wraps v in single quotes for use as one POSIX shell word.
// Each embedded single quote is rewritten as '"'"' (close the quote, emit
// a double-quoted single quote, reopen the quote).
func shellQuote(v string) string {
	const escapedQuote = `'"'"'`
	pieces := strings.Split(v, "'")
	return "'" + strings.Join(pieces, escapedQuote) + "'"
}
|
||||||
|
|
||||||
|
func (o *Orchestrator) verifyEtcdSnapshot(ctx context.Context, node string, snapshotPath string) error {
|
||||||
|
if o.runner.DryRun {
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
path := strings.TrimSpace(snapshotPath)
|
||||||
|
if path == "" {
|
||||||
|
return fmt.Errorf("etcd snapshot verification failed: snapshot path is empty")
|
||||||
|
}
|
||||||
|
quoted := shellQuote(path)
|
||||||
|
sizeOut, err := o.ssh(ctx, node, fmt.Sprintf("sudo -n sh -lc 'test -s %s && stat -c %%s %s'", quoted, quoted))
|
||||||
|
if err != nil {
|
||||||
|
return fmt.Errorf("etcd snapshot verification failed for %s on %s: %w", path, node, err)
|
||||||
|
}
|
||||||
|
size, convErr := strconv.ParseInt(strings.TrimSpace(sizeOut), 10, 64)
|
||||||
|
if convErr != nil {
|
||||||
|
return fmt.Errorf("etcd snapshot verification failed for %s on %s: parse size %q: %w", path, node, strings.TrimSpace(sizeOut), convErr)
|
||||||
|
}
|
||||||
|
const minSnapshotBytes = int64(1 << 20) // 1 MiB sanity floor.
|
||||||
|
if size < minSnapshotBytes {
|
||||||
|
return fmt.Errorf("etcd snapshot verification failed for %s on %s: snapshot too small (%d bytes)", path, node, size)
|
||||||
|
}
|
||||||
|
lsOut, err := o.runSudoK3S(ctx, node, "etcd-snapshot", "ls")
|
||||||
|
if err != nil {
|
||||||
|
return fmt.Errorf("etcd snapshot verification failed for %s on %s: list snapshots: %w", path, node, err)
|
||||||
|
}
|
||||||
|
if !strings.Contains(lsOut, path) && !strings.Contains(lsOut, filepath.Base(path)) {
|
||||||
|
return fmt.Errorf("etcd snapshot verification failed for %s on %s: snapshot is not present in k3s etcd-snapshot ls output", path, node)
|
||||||
|
}
|
||||||
|
sumOut, err := o.ssh(ctx, node, fmt.Sprintf("sudo -n sh -lc 'sha256sum %s | awk \"{print \\$1}\"'", quoted))
|
||||||
|
if err != nil {
|
||||||
|
return fmt.Errorf("etcd snapshot verification failed for %s on %s: sha256: %w", path, node, err)
|
||||||
|
}
|
||||||
|
hash := strings.TrimSpace(sumOut)
|
||||||
|
if len(hash) != 64 {
|
||||||
|
return fmt.Errorf("etcd snapshot verification failed for %s on %s: invalid sha256 %q", path, node, hash)
|
||||||
|
}
|
||||||
|
o.log.Printf("etcd snapshot verified path=%s size_bytes=%d sha256=%s", path, size, hash[:12])
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
|
||||||
func (o *Orchestrator) runSudoK3S(ctx context.Context, node string, args ...string) (string, error) {
|
func (o *Orchestrator) runSudoK3S(ctx context.Context, node string, args ...string) (string, error) {
|
||||||
k3sPaths := []string{
|
k3sPaths := []string{
|
||||||
"/usr/local/bin/k3s",
|
"/usr/local/bin/k3s",
|
||||||
|
|||||||
@ -93,14 +93,15 @@ type UPSTarget struct {
|
|||||||
}
|
}
|
||||||
|
|
||||||
type Coordination struct {
|
type Coordination struct {
|
||||||
ForwardShutdownHost string `yaml:"forward_shutdown_host"`
|
ForwardShutdownHost string `yaml:"forward_shutdown_host"`
|
||||||
ForwardShutdownUser string `yaml:"forward_shutdown_user"`
|
ForwardShutdownUser string `yaml:"forward_shutdown_user"`
|
||||||
ForwardShutdownConfig string `yaml:"forward_shutdown_config"`
|
ForwardShutdownConfig string `yaml:"forward_shutdown_config"`
|
||||||
FallbackLocalShutdown bool `yaml:"fallback_local_shutdown"`
|
PeerHosts []string `yaml:"peer_hosts"`
|
||||||
CommandTimeoutSeconds int `yaml:"command_timeout_seconds"`
|
FallbackLocalShutdown bool `yaml:"fallback_local_shutdown"`
|
||||||
StartupGuardMaxAgeSec int `yaml:"startup_guard_max_age_seconds"`
|
CommandTimeoutSeconds int `yaml:"command_timeout_seconds"`
|
||||||
Role string `yaml:"role"`
|
StartupGuardMaxAgeSec int `yaml:"startup_guard_max_age_seconds"`
|
||||||
AllowStartupOnBattery bool `yaml:"allow_startup_on_battery"`
|
Role string `yaml:"role"`
|
||||||
|
AllowStartupOnBattery bool `yaml:"allow_startup_on_battery"`
|
||||||
}
|
}
|
||||||
|
|
||||||
type Metrics struct {
|
type Metrics struct {
|
||||||
@ -247,6 +248,11 @@ func (c Config) Validate() error {
|
|||||||
return fmt.Errorf("config.coordination.forward_shutdown_config must not be empty when forward_shutdown_host is set")
|
return fmt.Errorf("config.coordination.forward_shutdown_config must not be empty when forward_shutdown_host is set")
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
for _, peer := range c.Coordination.PeerHosts {
|
||||||
|
if strings.TrimSpace(peer) == "" {
|
||||||
|
return fmt.Errorf("config.coordination.peer_hosts entries must not be empty")
|
||||||
|
}
|
||||||
|
}
|
||||||
if c.Coordination.StartupGuardMaxAgeSec <= 0 {
|
if c.Coordination.StartupGuardMaxAgeSec <= 0 {
|
||||||
return fmt.Errorf("config.coordination.startup_guard_max_age_seconds must be > 0")
|
return fmt.Errorf("config.coordination.startup_guard_max_age_seconds must be > 0")
|
||||||
}
|
}
|
||||||
@ -352,6 +358,7 @@ func defaults() Config {
|
|||||||
},
|
},
|
||||||
Coordination: Coordination{
|
Coordination: Coordination{
|
||||||
ForwardShutdownConfig: "/etc/hecate/hecate.yaml",
|
ForwardShutdownConfig: "/etc/hecate/hecate.yaml",
|
||||||
|
PeerHosts: []string{},
|
||||||
FallbackLocalShutdown: true,
|
FallbackLocalShutdown: true,
|
||||||
CommandTimeoutSeconds: 25,
|
CommandTimeoutSeconds: 25,
|
||||||
StartupGuardMaxAgeSec: 900,
|
StartupGuardMaxAgeSec: 900,
|
||||||
@ -483,6 +490,9 @@ func (c *Config) applyDefaults() {
|
|||||||
if c.Coordination.ForwardShutdownConfig == "" {
|
if c.Coordination.ForwardShutdownConfig == "" {
|
||||||
c.Coordination.ForwardShutdownConfig = "/etc/hecate/hecate.yaml"
|
c.Coordination.ForwardShutdownConfig = "/etc/hecate/hecate.yaml"
|
||||||
}
|
}
|
||||||
|
if c.Coordination.PeerHosts == nil {
|
||||||
|
c.Coordination.PeerHosts = []string{}
|
||||||
|
}
|
||||||
if c.Coordination.CommandTimeoutSeconds <= 0 {
|
if c.Coordination.CommandTimeoutSeconds <= 0 {
|
||||||
c.Coordination.CommandTimeoutSeconds = 25
|
c.Coordination.CommandTimeoutSeconds = 25
|
||||||
}
|
}
|
||||||
|
|||||||
@ -56,6 +56,14 @@ func TestValidateRejectsUnknownRole(t *testing.T) {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func TestValidateRejectsEmptyPeerHostEntry(t *testing.T) {
|
||||||
|
cfg := defaults()
|
||||||
|
cfg.Coordination.PeerHosts = []string{"titan-24", " "}
|
||||||
|
if err := cfg.Validate(); err == nil {
|
||||||
|
t.Fatalf("expected validation error for empty peer_hosts entry")
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
func TestValidateRejectsUnknownEtcdRestoreControlPlane(t *testing.T) {
|
func TestValidateRejectsUnknownEtcdRestoreControlPlane(t *testing.T) {
|
||||||
cfg := defaults()
|
cfg := defaults()
|
||||||
cfg.Startup.EtcdRestoreControlPlane = "titan-missing"
|
cfg.Startup.EtcdRestoreControlPlane = "titan-missing"
|
||||||
|
|||||||
@ -204,6 +204,8 @@ migrate_hecate_config() {
|
|||||||
fi
|
fi
|
||||||
|
|
||||||
local changed=0
|
local changed=0
|
||||||
|
local role_hint
|
||||||
|
role_hint="$(read_hecate_role)"
|
||||||
if grep -Eq 'default_budget_seconds:[[:space:]]*300' "${CONF_DIR}/hecate.yaml"; then
|
if grep -Eq 'default_budget_seconds:[[:space:]]*300' "${CONF_DIR}/hecate.yaml"; then
|
||||||
sed -Ei 's/(default_budget_seconds:[[:space:]]*)300/\11380/' "${CONF_DIR}/hecate.yaml"
|
sed -Ei 's/(default_budget_seconds:[[:space:]]*)300/\11380/' "${CONF_DIR}/hecate.yaml"
|
||||||
echo "[install] migrated default_budget_seconds 300 -> 1380 in ${CONF_DIR}/hecate.yaml"
|
echo "[install] migrated default_budget_seconds 300 -> 1380 in ${CONF_DIR}/hecate.yaml"
|
||||||
@ -226,6 +228,25 @@ migrate_hecate_config() {
|
|||||||
echo "[install] added coordination.startup_guard_max_age_seconds=900"
|
echo "[install] added coordination.startup_guard_max_age_seconds=900"
|
||||||
changed=1
|
changed=1
|
||||||
fi
|
fi
|
||||||
|
if ! grep -Eq '^ peer_hosts:' "${CONF_DIR}/hecate.yaml"; then
|
||||||
|
if [[ "${role_hint}" == "peer" ]] && grep -Eq '^ forward_shutdown_host:[[:space:]]*[A-Za-z0-9._-]+' "${CONF_DIR}/hecate.yaml"; then
|
||||||
|
local peer_host
|
||||||
|
peer_host="$(awk -F': *' '/^ forward_shutdown_host:[[:space:]]*/ {print $2; exit}' "${CONF_DIR}/hecate.yaml" 2>/dev/null || true)"
|
||||||
|
if [[ -n "${peer_host}" ]]; then
|
||||||
|
sed -Ei '/^ forward_shutdown_config:[[:space:]]*.*$/a\ peer_hosts:\n - '"${peer_host}"'' "${CONF_DIR}/hecate.yaml"
|
||||||
|
echo "[install] added coordination.peer_hosts from forward_shutdown_host (${peer_host})"
|
||||||
|
changed=1
|
||||||
|
fi
|
||||||
|
elif [[ "${role_hint}" == "coordinator" ]] && grep -Eq '^ titan-24:[[:space:]]*[0-9]+\.[0-9]+\.[0-9]+\.[0-9]+' "${CONF_DIR}/hecate.yaml"; then
|
||||||
|
sed -Ei '/^ forward_shutdown_config:[[:space:]]*.*$/a\ peer_hosts:\n - titan-24' "${CONF_DIR}/hecate.yaml"
|
||||||
|
echo "[install] added coordination.peer_hosts default (titan-24) for coordinator role"
|
||||||
|
changed=1
|
||||||
|
else
|
||||||
|
sed -Ei '/^ forward_shutdown_config:[[:space:]]*.*$/a\ peer_hosts: []' "${CONF_DIR}/hecate.yaml"
|
||||||
|
echo "[install] added coordination.peer_hosts empty default"
|
||||||
|
changed=1
|
||||||
|
fi
|
||||||
|
fi
|
||||||
local default_restore_cp
|
local default_restore_cp
|
||||||
default_restore_cp="$(first_control_plane_name)"
|
default_restore_cp="$(first_control_plane_name)"
|
||||||
if [[ -z "${default_restore_cp}" ]]; then
|
if [[ -z "${default_restore_cp}" ]]; then
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user