hecate: harden outage recovery startup and etcd restore

This commit is contained in:
Brad Stein 2026-04-04 20:50:58 -03:00
parent 19562d77f7
commit 5d8bfd5de6
9 changed files with 484 additions and 42 deletions

View File

@ -90,7 +90,7 @@ See `configs/hecate.example.yaml`.
UPS auto-shutdown trigger uses:
- runtime threshold = `runtime_safety_factor * estimated_shutdown_budget`
- default safety factor `1.10`
- default safety factor `1.25`
- debounce across multiple polls to avoid noise
Estimated shutdown budget is derived from historical successful shutdown runs (`/var/lib/hecate/runs.json`) with default fallback from config.
@ -106,6 +106,16 @@ Power metrics:
- `hecate-bootstrap.service` is enabled to run at host boot and perform staged startup automatically.
- `HECATE_ENABLE_BOOTSTRAP=1` forces bootstrap on, `HECATE_ENABLE_BOOTSTRAP=0` forces it off, and `auto` enables by default.
- `hecate-update.timer` runs on boot and periodically to pull latest `main` and reinstall Hecate declaratively.
- Peer startup fallback now checks coordinator intent/bootstrap activity before allowing local startup.
- Automatic etcd recovery can run during startup if API never becomes reachable (`startup.auto_etcd_restore_on_api_failure`).
## Etcd Recovery
- Manual: `hecate etcd-restore --config /etc/hecate/hecate.yaml --execute`
- Optional snapshot override: `--snapshot /var/lib/rancher/k3s/server/db/snapshots/<name>`
- Startup can automatically invoke the same restore path after API timeout using:
- `startup.auto_etcd_restore_on_api_failure: true`
- `startup.etcd_restore_control_plane: <control-plane-node>`
## Disruptive startup drills

View File

@ -2,6 +2,7 @@ package main
import (
"context"
"encoding/json"
"errors"
"flag"
"fmt"
@ -41,6 +42,11 @@ func main() {
logger.Printf("shutdown failed: %v", err)
os.Exit(1)
}
case "etcd-restore":
if err := runEtcdRestore(logger, os.Args[2:]); err != nil {
logger.Printf("etcd-restore failed: %v", err)
os.Exit(1)
}
case "daemon":
if err := runDaemon(logger, os.Args[2:]); err != nil {
logger.Printf("daemon failed: %v", err)
@ -96,6 +102,15 @@ func runStartup(logger *log.Logger, args []string) error {
logger.Printf("peer startup handoff complete; skipping local startup")
return nil
}
guardCtx, guardCancel := context.WithTimeout(context.Background(), time.Duration(maxInt(cfg.Coordination.CommandTimeoutSeconds, 15))*time.Second)
defer guardCancel()
allowed, guardReason, guardErr := coordinatorAllowsPeerFallbackStartup(guardCtx, cfg, logger)
if guardErr != nil {
return fmt.Errorf("startup blocked: unable to evaluate coordinator startup guard: %w", guardErr)
}
if !allowed {
return fmt.Errorf("startup blocked: coordinator guard disallowed peer fallback (%s)", guardReason)
}
logger.Printf("peer startup handoff unavailable; proceeding with local peer startup fallback")
allowPeer = true
} else {
@ -174,6 +189,26 @@ func runDaemon(logger *log.Logger, args []string) error {
return nil
}
// runEtcdRestore implements the `hecate etcd-restore` subcommand: it restores
// etcd from a snapshot on a selected control plane. The command is a dry run
// unless --execute is supplied; control plane and snapshot default to the
// configured restore control plane and the newest snapshot respectively.
func runEtcdRestore(logger *log.Logger, args []string) error {
	fs := flag.NewFlagSet("etcd-restore", flag.ExitOnError)
	configPath := fs.String("config", "/etc/hecate/hecate.yaml", "Path to config file")
	execute := fs.Bool("execute", false, "Actually execute restore (default dry-run)")
	controlPlane := fs.String("control-plane", "", "Control plane to run restore on (defaults to startup.etcd_restore_control_plane)")
	snapshotPath := fs.String("snapshot", "", "Explicit snapshot path (defaults to latest on selected control plane)")
	// ExitOnError makes Parse terminate the process on bad flags, so the
	// returned error is intentionally discarded.
	_ = fs.Parse(args)

	dryRun := !*execute
	_, orch, err := buildOrchestrator(logger, *configPath, dryRun)
	if err != nil {
		return err
	}

	ctx, cancel := context.WithCancel(context.Background())
	defer cancel()

	opts := cluster.EtcdRestoreOptions{
		ControlPlane: *controlPlane,
		SnapshotPath: *snapshotPath,
	}
	return orch.EtcdRestore(ctx, opts)
}
func runStatus(logger *log.Logger, args []string) error {
fs := flag.NewFlagSet("status", flag.ExitOnError)
configPath := fs.String("config", "/etc/hecate/hecate.yaml", "Path to config file")
@ -279,20 +314,61 @@ func buildUPSTargets(cfg config.Config) ([]service.Target, error) {
}
func ensureStartupPowerSafe(ctx context.Context, targets []service.Target) error {
onBatteryTargets := []string{}
type targetState struct {
seenGood bool
lastErr error
}
states := make(map[string]*targetState, len(targets))
for _, t := range targets {
sample, err := t.Provider.Read(ctx)
if err != nil {
return fmt.Errorf("startup blocked: unable to verify UPS target %s (%s): %w", t.Name, t.Target, err)
key := t.Name + "|" + t.Target
states[key] = &targetState{}
}
const pollInterval = 3 * time.Second
for {
onBatteryTargets := []string{}
allSeen := true
for _, t := range targets {
key := t.Name + "|" + t.Target
st := states[key]
sample, err := t.Provider.Read(ctx)
if err != nil {
st.lastErr = err
if !st.seenGood {
allSeen = false
}
continue
}
st.seenGood = true
st.lastErr = nil
if sample.OnBattery {
onBatteryTargets = append(onBatteryTargets, fmt.Sprintf("%s(status=%s runtime_s=%d)", t.Name, sample.RawStatus, sample.RuntimeSeconds))
}
}
if sample.OnBattery {
onBatteryTargets = append(onBatteryTargets, fmt.Sprintf("%s(status=%s runtime_s=%d)", t.Name, sample.RawStatus, sample.RuntimeSeconds))
if len(onBatteryTargets) > 0 {
return fmt.Errorf("startup blocked: UPS is on battery for %s", strings.Join(onBatteryTargets, ", "))
}
if allSeen {
return nil
}
select {
case <-ctx.Done():
unverified := make([]string, 0, len(targets))
for _, t := range targets {
key := t.Name + "|" + t.Target
st := states[key]
if st.seenGood {
continue
}
if st.lastErr != nil {
unverified = append(unverified, fmt.Sprintf("%s(%s): %v", t.Name, t.Target, st.lastErr))
} else {
unverified = append(unverified, fmt.Sprintf("%s(%s): no telemetry sample yet", t.Name, t.Target))
}
}
return fmt.Errorf("startup blocked: unable to verify UPS telemetry before timeout: %s", strings.Join(unverified, " | "))
case <-time.After(pollInterval):
}
}
if len(onBatteryTargets) > 0 {
return fmt.Errorf("startup blocked: UPS is on battery for %s", strings.Join(onBatteryTargets, ", "))
}
return nil
}
func buildOrchestrator(logger *log.Logger, cfgPath string, dryRun bool) (config.Config, *cluster.Orchestrator, error) {
@ -322,6 +398,7 @@ Usage:
Commands:
startup Perform staged cluster startup
shutdown Perform graceful cluster shutdown
etcd-restore Restore etcd from snapshot on a control plane
daemon Monitor UPS and auto-trigger shutdown
status Print current hecate status and estimates
intent Read or manually set intent state
@ -329,6 +406,7 @@ Commands:
Examples:
hecate startup --config /etc/hecate/hecate.yaml --execute --force-flux-branch main
hecate shutdown --config /etc/hecate/hecate.yaml --execute --reason "manual-maintenance"
hecate etcd-restore --config /etc/hecate/hecate.yaml --execute
hecate daemon --config /etc/hecate/hecate.yaml
hecate status --config /etc/hecate/hecate.yaml
hecate intent --config /etc/hecate/hecate.yaml --set normal --reason "manual-clear" --execute
@ -358,30 +436,7 @@ func tryPeerBootstrapHandoff(ctx context.Context, cfg config.Config, logger *log
target = user + "@" + host
}
args := []string{
"-o", "BatchMode=yes",
"-o", "ConnectTimeout=8",
"-o", "StrictHostKeyChecking=accept-new",
}
if cfgPath := resolveSSHConfigFile(cfg); cfgPath != "" {
args = append(args, "-F", cfgPath)
}
if idPath := resolveSSHIdentityFile(cfg); idPath != "" {
args = append(args, "-i", idPath)
}
if cfg.SSHPort > 0 {
args = append(args, "-p", strconv.Itoa(cfg.SSHPort))
}
if cfg.SSHJumpHost != "" {
jump := cfg.SSHJumpHost
if cfg.SSHJumpUser != "" {
jump = cfg.SSHJumpUser + "@" + jump
}
if cfg.SSHPort > 0 && !strings.Contains(jump, ":") {
jump = fmt.Sprintf("%s:%d", jump, cfg.SSHPort)
}
args = append(args, "-J", jump)
}
args := buildSSHBaseArgs(cfg)
remote := "sudo -n systemctl start hecate-bootstrap.service"
attempt := 1
@ -409,6 +464,116 @@ func tryPeerBootstrapHandoff(ctx context.Context, cfg config.Config, logger *log
}
}
// coordinatorAllowsPeerFallbackStartup consults the configured coordinator
// before this peer performs a local fallback startup. It returns
// (allowed, reason, err): allowed=false blocks startup with the given reason;
// a non-nil err is returned only when the coordinator's intent payload cannot
// be decoded. An unreachable coordinator is treated as fail-open (allowed)
// so a dead coordinator cannot wedge peer recovery.
func coordinatorAllowsPeerFallbackStartup(ctx context.Context, cfg config.Config, logger *log.Logger) (bool, string, error) {
// No coordinator configured: nothing to guard against.
coordinator := strings.TrimSpace(cfg.Coordination.ForwardShutdownHost)
if coordinator == "" {
return true, "no coordinator configured", nil
}
// Resolve the SSH user: explicit forward-shutdown user, then a per-node
// override, then the global SSH user.
user := strings.TrimSpace(cfg.Coordination.ForwardShutdownUser)
if user == "" {
if override, ok := cfg.SSHNodeUsers[coordinator]; ok && strings.TrimSpace(override) != "" {
user = strings.TrimSpace(override)
} else {
user = strings.TrimSpace(cfg.SSHUser)
}
}
// Map the logical coordinator name to a concrete host when an override
// exists in ssh_node_hosts.
host := coordinator
if mapped, ok := cfg.SSHNodeHosts[coordinator]; ok && strings.TrimSpace(mapped) != "" {
host = strings.TrimSpace(mapped)
}
target := host
if user != "" {
target = user + "@" + host
}
// One SSH round trip: print a bootstrap-activity marker, then the
// coordinator's intent JSON (or "{}" when the intent file is absent/empty).
remoteCmd := "sudo -n sh -lc 'if systemctl is-active --quiet hecate-bootstrap.service; then echo __HECATE_BOOTSTRAP_ACTIVE__; else echo __HECATE_BOOTSTRAP_IDLE__; fi; if [ -s /var/lib/hecate/intent.json ]; then cat /var/lib/hecate/intent.json; else echo \"{}\"; fi'"
args := append(buildSSHBaseArgs(cfg), target, remoteCmd)
cmd := exec.CommandContext(ctx, "ssh", args...)
out, err := cmd.CombinedOutput()
if err != nil {
// Fail open: log the failure (with any captured output) and allow the
// peer fallback rather than blocking startup on an unreachable node.
trimmed := strings.TrimSpace(string(out))
if trimmed == "" {
logger.Printf("warning: coordinator guard check unavailable on %s: %v; allowing peer fallback startup", coordinator, err)
} else {
logger.Printf("warning: coordinator guard check unavailable on %s: %v: %s; allowing peer fallback startup", coordinator, err, trimmed)
}
return true, "coordinator unreachable", nil
}
trimmed := strings.TrimSpace(string(out))
// If the coordinator's own bootstrap is running, defer to it.
if strings.Contains(trimmed, "__HECATE_BOOTSTRAP_ACTIVE__") {
return false, "coordinator bootstrap service is active", nil
}
// Extract the JSON object between the first '{' and the last '}' to
// tolerate sudo/motd/marker noise around the payload.
start := strings.Index(trimmed, "{")
end := strings.LastIndex(trimmed, "}")
if start < 0 || end < start {
return false, "coordinator intent payload missing", nil
}
rawIntent := trimmed[start : end+1]
var remoteIntent state.Intent
if err := json.Unmarshal([]byte(rawIntent), &remoteIntent); err != nil {
return false, "", fmt.Errorf("decode coordinator intent: %w", err)
}
// Empty or normal intent means no operation is in flight.
if remoteIntent.State == "" || remoteIntent.State == state.IntentNormal {
return true, "coordinator intent is normal", nil
}
// Staleness window for blocking intents; configured value with a 60s floor.
guardAge := time.Duration(maxInt(cfg.Coordination.StartupGuardMaxAgeSec, 60)) * time.Second
intentAge := time.Duration(0)
if !remoteIntent.UpdatedAt.IsZero() {
intentAge = time.Since(remoteIntent.UpdatedAt)
}
switch remoteIntent.State {
case state.IntentShuttingDown:
// Block on a fresh (or unaged) shutdown intent; treat old ones as stale.
if remoteIntent.UpdatedAt.IsZero() || intentAge <= guardAge {
return false, fmt.Sprintf("coordinator intent=%s age=%s reason=%q", remoteIntent.State, intentAge.Round(time.Second), remoteIntent.Reason), nil
}
logger.Printf("warning: coordinator shutdown intent appears stale (age=%s > guard=%s); allowing peer fallback startup", intentAge.Round(time.Second), guardAge)
return true, "coordinator shutdown intent stale", nil
case state.IntentStartupInProgress:
// Same freshness rule for an in-progress coordinator startup.
if remoteIntent.UpdatedAt.IsZero() || intentAge <= guardAge {
return false, fmt.Sprintf("coordinator intent=%s age=%s reason=%q", remoteIntent.State, intentAge.Round(time.Second), remoteIntent.Reason), nil
}
logger.Printf("warning: coordinator startup intent appears stale (age=%s > guard=%s); allowing peer fallback startup", intentAge.Round(time.Second), guardAge)
return true, "coordinator startup intent stale", nil
case state.IntentShutdownComplete:
// Short fixed cooldown after a completed shutdown; unknown age blocks.
if remoteIntent.UpdatedAt.IsZero() {
return false, "coordinator reported shutdown_complete with unknown age", nil
}
if intentAge <= 45*time.Second {
return false, fmt.Sprintf("coordinator recently completed shutdown (%s ago)", intentAge.Round(time.Second)), nil
}
return true, "coordinator shutdown_complete is old enough", nil
default:
// Unknown states block conservatively.
return false, fmt.Sprintf("coordinator intent state %q is unknown", remoteIntent.State), nil
}
}
// buildSSHBaseArgs returns the ssh CLI arguments shared by every remote
// invocation: non-interactive batch mode plus any configured ssh config
// file, identity file, port, and jump host.
func buildSSHBaseArgs(cfg config.Config) []string {
	base := []string{
		"-o", "BatchMode=yes",
		"-o", "ConnectTimeout=8",
		"-o", "StrictHostKeyChecking=accept-new",
	}
	if sshConfig := resolveSSHConfigFile(cfg); sshConfig != "" {
		base = append(base, "-F", sshConfig)
	}
	if identity := resolveSSHIdentityFile(cfg); identity != "" {
		base = append(base, "-i", identity)
	}
	if cfg.SSHPort > 0 {
		base = append(base, "-p", strconv.Itoa(cfg.SSHPort))
	}
	if cfg.SSHJumpHost != "" {
		// Build the ProxyJump spec: optional user@ prefix, and carry the
		// configured port unless the spec already contains one.
		spec := cfg.SSHJumpHost
		if cfg.SSHJumpUser != "" {
			spec = cfg.SSHJumpUser + "@" + spec
		}
		if cfg.SSHPort > 0 && !strings.Contains(spec, ":") {
			spec = fmt.Sprintf("%s:%d", spec, cfg.SSHPort)
		}
		base = append(base, "-J", spec)
	}
	return base
}
func resolveSSHConfigFile(cfg config.Config) string {
if strings.TrimSpace(cfg.SSHConfigFile) != "" {
return strings.TrimSpace(cfg.SSHConfigFile)

View File

@ -42,6 +42,8 @@ excluded_namespaces:
startup:
api_wait_seconds: 1200
api_poll_seconds: 2
auto_etcd_restore_on_api_failure: true
etcd_restore_control_plane: titan-0a
shutdown:
default_budget_seconds: 1380
skip_etcd_snapshot: false
@ -71,6 +73,7 @@ coordination:
forward_shutdown_config: /etc/hecate/hecate.yaml
fallback_local_shutdown: true
command_timeout_seconds: 25
startup_guard_max_age_seconds: 900
role: coordinator
allow_startup_on_battery: false
metrics:

View File

@ -35,11 +35,23 @@ ssh_managed_nodes:
- titan-0a
- titan-0b
- titan-0c
- titan-04
- titan-05
- titan-06
- titan-07
- titan-08
- titan-09
- titan-10
- titan-11
- titan-12
- titan-13
- titan-14
- titan-15
- titan-17
- titan-18
- titan-19
- titan-20
- titan-21
- titan-22
- titan-24
ssh_jump_host: ""
@ -53,6 +65,15 @@ control_planes:
workers: []
local_bootstrap_paths:
- infrastructure/core
- clusters/atlas/flux-system
- infrastructure/sources/helm
- infrastructure/metallb
- infrastructure/traefik
- infrastructure/vault-csi
- infrastructure/vault-injector
- services/vault
- infrastructure/postgres
- services/gitea
excluded_namespaces:
- kube-system
- kube-public
@ -68,6 +89,8 @@ excluded_namespaces:
startup:
api_wait_seconds: 1200
api_poll_seconds: 2
auto_etcd_restore_on_api_failure: true
etcd_restore_control_plane: titan-0a
shutdown:
default_budget_seconds: 1380
skip_etcd_snapshot: false
@ -95,6 +118,7 @@ coordination:
forward_shutdown_config: /etc/hecate/hecate.yaml
fallback_local_shutdown: false
command_timeout_seconds: 25
startup_guard_max_age_seconds: 900
role: peer
allow_startup_on_battery: false
metrics:

View File

@ -35,12 +35,25 @@ ssh_managed_nodes:
- titan-0a
- titan-0b
- titan-0c
- titan-04
- titan-05
- titan-06
- titan-07
- titan-08
- titan-09
- titan-10
- titan-11
- titan-12
- titan-13
- titan-14
- titan-15
- titan-17
- titan-18
- titan-19
- titan-20
- titan-21
- titan-22
- titan-24
ssh_jump_host: ""
ssh_jump_user: ""
iac_repo_path: /opt/titan-iac
@ -76,6 +89,8 @@ excluded_namespaces:
startup:
api_wait_seconds: 1200
api_poll_seconds: 2
auto_etcd_restore_on_api_failure: true
etcd_restore_control_plane: titan-0a
shutdown:
default_budget_seconds: 1380
skip_etcd_snapshot: false
@ -104,6 +119,7 @@ coordination:
forward_shutdown_config: /etc/hecate/hecate.yaml
fallback_local_shutdown: true
command_timeout_seconds: 25
startup_guard_max_age_seconds: 900
role: coordinator
allow_startup_on_battery: false
metrics:

View File

@ -39,6 +39,11 @@ type ShutdownOptions struct {
Reason string
}
// EtcdRestoreOptions selects where and from what snapshot
// Orchestrator.EtcdRestore runs.
type EtcdRestoreOptions struct {
// ControlPlane is the node to run the restore on; empty means the first
// configured control plane is used.
ControlPlane string
// SnapshotPath is an explicit snapshot file path; empty means the latest
// snapshot found on the selected control plane is used.
SnapshotPath string
}
type startupWorkload struct {
Namespace string
Kind string
@ -121,7 +126,20 @@ func (o *Orchestrator) Startup(ctx context.Context, opts StartupOptions) (err er
apiAttempts = 1
}
if err := o.waitForAPI(ctx, apiAttempts, apiPoll); err != nil {
return err
if !o.cfg.Startup.AutoEtcdRestoreOnAPIFailure {
return err
}
cp := strings.TrimSpace(o.cfg.Startup.EtcdRestoreControlPlane)
if cp == "" && len(o.cfg.ControlPlanes) > 0 {
cp = o.cfg.ControlPlanes[0]
}
o.log.Printf("warning: initial API wait failed (%v); attempting automatic etcd restore on %s", err, cp)
if restoreErr := o.EtcdRestore(ctx, EtcdRestoreOptions{ControlPlane: cp}); restoreErr != nil {
return fmt.Errorf("kubernetes API did not become reachable and automatic etcd restore failed: %w", restoreErr)
}
if err := o.waitForAPI(ctx, apiAttempts, apiPoll); err != nil {
return fmt.Errorf("kubernetes API did not become reachable after automatic etcd restore: %w", err)
}
}
workers, err := o.effectiveWorkers(ctx)
@ -200,6 +218,72 @@ func (o *Orchestrator) Startup(ctx context.Context, opts StartupOptions) (err er
return nil
}
// EtcdRestore restores etcd from a snapshot by stopping k3s on all control
// planes, running `k3s server --cluster-reset --cluster-reset-restore-path`
// on one selected control plane, restarting k3s there, and then best-effort
// restarting k3s on the remaining control planes. The target defaults to the
// first configured control plane and the snapshot to the newest one on that
// node. In dry-run mode only the resolved target/snapshot are logged.
//
// Fix: the post-restore settle delay was an unconditional time.Sleep that
// ignored ctx cancellation; it now aborts promptly when ctx is done.
func (o *Orchestrator) EtcdRestore(ctx context.Context, opts EtcdRestoreOptions) error {
	// Pick the target control plane: explicit option first, else the first
	// configured control plane.
	controlPlane := strings.TrimSpace(opts.ControlPlane)
	if controlPlane == "" {
		if len(o.cfg.ControlPlanes) == 0 {
			return fmt.Errorf("cannot restore etcd: no control planes configured")
		}
		controlPlane = o.cfg.ControlPlanes[0]
	}
	// The target must be a configured control plane and SSH-managed.
	found := false
	for _, cp := range o.cfg.ControlPlanes {
		if cp == controlPlane {
			found = true
			break
		}
	}
	if !found {
		return fmt.Errorf("cannot restore etcd: control plane %s is not in configured control_planes", controlPlane)
	}
	if !o.sshManaged(controlPlane) {
		return fmt.Errorf("cannot restore etcd on %s: node not in ssh_managed_nodes", controlPlane)
	}
	// Resolve the snapshot: explicit path wins, otherwise the newest
	// snapshot present on the target node.
	snapshotPath := strings.TrimSpace(opts.SnapshotPath)
	if snapshotPath == "" {
		resolved, err := o.latestEtcdSnapshotPath(ctx, controlPlane)
		if err != nil {
			return err
		}
		snapshotPath = resolved
	}
	o.log.Printf("etcd restore target=%s snapshot=%s", controlPlane, snapshotPath)
	if o.runner.DryRun {
		return nil
	}
	// Quiesce etcd: best-effort stop of k3s on every control plane before
	// the cluster reset.
	for _, cp := range o.cfg.ControlPlanes {
		cp := cp // capture for the closure (pre-Go 1.22 loop semantics)
		o.bestEffort("stop k3s before etcd restore on "+cp, func() error {
			_, err := o.ssh(ctx, cp, "sudo systemctl stop k3s || true")
			return err
		})
	}
	restoreCmd := fmt.Sprintf("sudo k3s server --cluster-reset --cluster-reset-restore-path %q", snapshotPath)
	if _, err := o.ssh(ctx, controlPlane, restoreCmd); err != nil {
		return fmt.Errorf("etcd restore command failed on %s: %w", controlPlane, err)
	}
	o.log.Printf("etcd restore command completed on %s", controlPlane)
	if _, err := o.ssh(ctx, controlPlane, "sudo systemctl start k3s || true"); err != nil {
		return fmt.Errorf("failed to start k3s on restore control plane %s: %w", controlPlane, err)
	}
	// Give the restored server a moment to come up before rejoining peers,
	// but honor cancellation instead of sleeping unconditionally.
	select {
	case <-ctx.Done():
		return ctx.Err()
	case <-time.After(10 * time.Second):
	}
	// Best-effort restart of the remaining control planes.
	for _, cp := range o.cfg.ControlPlanes {
		cp := cp
		if cp == controlPlane {
			continue
		}
		o.bestEffort("start k3s after etcd restore on "+cp, func() error {
			_, err := o.ssh(ctx, cp, "sudo systemctl start k3s || true")
			return err
		})
	}
	return nil
}
func (o *Orchestrator) Shutdown(ctx context.Context, opts ShutdownOptions) (err error) {
unlock, err := state.AcquireLock(o.cfg.State.LockPath)
if err != nil {
@ -731,6 +815,22 @@ func (o *Orchestrator) takeEtcdSnapshot(ctx context.Context, node string) error
return err
}
// latestEtcdSnapshotPath returns the newest etcd snapshot file present on the
// given node's k3s snapshot directory, or an error when the node is not
// SSH-managed, the remote listing fails, or no snapshots exist.
func (o *Orchestrator) latestEtcdSnapshotPath(ctx context.Context, node string) (string, error) {
	if !o.sshManaged(node) {
		return "", fmt.Errorf("cannot resolve etcd snapshot on %s: node not in ssh_managed_nodes", node)
	}
	// Newest-first listing; head -n 1 picks the most recent snapshot.
	const listCmd = `sudo sh -lc 'ls -1t /var/lib/rancher/k3s/server/db/snapshots/* 2>/dev/null | head -n 1'`
	raw, err := o.ssh(ctx, node, listCmd)
	if err != nil {
		return "", fmt.Errorf("resolve latest etcd snapshot on %s: %w", node, err)
	}
	latest := strings.TrimSpace(raw)
	if latest == "" {
		return "", fmt.Errorf("no etcd snapshots found on %s under /var/lib/rancher/k3s/server/db/snapshots", node)
	}
	return latest, nil
}
func (o *Orchestrator) waitForAPI(ctx context.Context, attempts int, sleep time.Duration) error {
if o.runner.DryRun {
return nil

View File

@ -33,8 +33,10 @@ type Config struct {
}
type Startup struct {
APIWaitSeconds int `yaml:"api_wait_seconds"`
APIPollSeconds int `yaml:"api_poll_seconds"`
APIWaitSeconds int `yaml:"api_wait_seconds"`
APIPollSeconds int `yaml:"api_poll_seconds"`
AutoEtcdRestoreOnAPIFailure bool `yaml:"auto_etcd_restore_on_api_failure"`
EtcdRestoreControlPlane string `yaml:"etcd_restore_control_plane"`
}
type Shutdown struct {
@ -72,6 +74,7 @@ type Coordination struct {
ForwardShutdownConfig string `yaml:"forward_shutdown_config"`
FallbackLocalShutdown bool `yaml:"fallback_local_shutdown"`
CommandTimeoutSeconds int `yaml:"command_timeout_seconds"`
StartupGuardMaxAgeSec int `yaml:"startup_guard_max_age_seconds"`
Role string `yaml:"role"`
AllowStartupOnBattery bool `yaml:"allow_startup_on_battery"`
}
@ -135,6 +138,18 @@ func (c Config) Validate() error {
if c.Startup.APIPollSeconds <= 0 {
return fmt.Errorf("config.startup.api_poll_seconds must be > 0")
}
if c.Startup.EtcdRestoreControlPlane != "" {
found := false
for _, cp := range c.ControlPlanes {
if cp == c.Startup.EtcdRestoreControlPlane {
found = true
break
}
}
if !found {
return fmt.Errorf("config.startup.etcd_restore_control_plane must be one of config.control_planes when set")
}
}
if c.SSHPort <= 0 || c.SSHPort > 65535 {
return fmt.Errorf("config.ssh_port must be in range 1-65535")
}
@ -156,6 +171,9 @@ func (c Config) Validate() error {
return fmt.Errorf("config.coordination.forward_shutdown_config must not be empty when forward_shutdown_host is set")
}
}
if c.Coordination.StartupGuardMaxAgeSec <= 0 {
return fmt.Errorf("config.coordination.startup_guard_max_age_seconds must be > 0")
}
if c.Coordination.Role != "coordinator" && c.Coordination.Role != "peer" {
return fmt.Errorf("config.coordination.role must be coordinator or peer")
}
@ -200,8 +218,10 @@ func defaults() Config {
"maintenance",
},
Startup: Startup{
APIWaitSeconds: 1200,
APIPollSeconds: 2,
APIWaitSeconds: 1200,
APIPollSeconds: 2,
AutoEtcdRestoreOnAPIFailure: true,
EtcdRestoreControlPlane: "titan-0a",
},
Shutdown: Shutdown{
DefaultBudgetSeconds: 1380,
@ -224,6 +244,7 @@ func defaults() Config {
ForwardShutdownConfig: "/etc/hecate/hecate.yaml",
FallbackLocalShutdown: true,
CommandTimeoutSeconds: 25,
StartupGuardMaxAgeSec: 900,
Role: "coordinator",
AllowStartupOnBattery: false,
},
@ -256,6 +277,9 @@ func (c *Config) applyDefaults() {
if c.Startup.APIPollSeconds <= 0 {
c.Startup.APIPollSeconds = 2
}
if c.Startup.EtcdRestoreControlPlane == "" && len(c.ControlPlanes) > 0 {
c.Startup.EtcdRestoreControlPlane = c.ControlPlanes[0]
}
if c.SSHPort <= 0 {
c.SSHPort = 2277
}
@ -292,6 +316,9 @@ func (c *Config) applyDefaults() {
if c.Coordination.CommandTimeoutSeconds <= 0 {
c.Coordination.CommandTimeoutSeconds = 25
}
if c.Coordination.StartupGuardMaxAgeSec <= 0 {
c.Coordination.StartupGuardMaxAgeSec = 900
}
if c.Coordination.Role == "" {
c.Coordination.Role = "coordinator"
}

View File

@ -55,3 +55,41 @@ func TestValidateRejectsUnknownRole(t *testing.T) {
t.Fatalf("expected validation error for unknown coordination role")
}
}
// TestValidateRejectsUnknownEtcdRestoreControlPlane ensures Validate fails
// when startup.etcd_restore_control_plane names a node that is not listed in
// control_planes.
func TestValidateRejectsUnknownEtcdRestoreControlPlane(t *testing.T) {
	cfg := defaults()
	cfg.Startup.EtcdRestoreControlPlane = "titan-missing"
	err := cfg.Validate()
	if err == nil {
		t.Fatalf("expected validation error for unknown etcd restore control plane")
	}
}
// TestLoadSetsCoordinationGuardDefaults verifies that Load applies defaults
// when a config file omits coordination.startup_guard_max_age_seconds and
// startup.etcd_restore_control_plane: the guard age must default to a
// positive value and the restore control plane must be populated (from
// control_planes).
func TestLoadSetsCoordinationGuardDefaults(t *testing.T) {
tmp := t.TempDir()
cfgPath := filepath.Join(tmp, "hecate.yaml")
raw := `
control_planes: [titan-0a, titan-0b, titan-0c]
expected_flux_branch: main
iac_repo_path: /opt/titan-iac
coordination:
role: coordinator
ups:
enabled: false
state:
run_history_path: /tmp/runs.json
lock_path: /tmp/hecate.lock
`
if err := os.WriteFile(cfgPath, []byte(strings.TrimSpace(raw)), 0o644); err != nil {
t.Fatalf("write config: %v", err)
}
cfg, err := Load(cfgPath)
if err != nil {
t.Fatalf("load config: %v", err)
}
// Defaults applied by Load/applyDefaults, not present in the raw YAML.
if cfg.Coordination.StartupGuardMaxAgeSec <= 0 {
t.Fatalf("expected startup guard max age default > 0, got %d", cfg.Coordination.StartupGuardMaxAgeSec)
}
if cfg.Startup.EtcdRestoreControlPlane == "" {
t.Fatalf("expected startup etcd restore control plane default to be set")
}
}

View File

@ -186,6 +186,23 @@ migrate_hecate_config() {
echo "[install] migrated ssh_node_users titan-24 override to atlas"
changed=1
fi
if grep -Eq '^ command_timeout_seconds:[[:space:]]*[0-9]+' "${CONF_DIR}/hecate.yaml" \
&& ! grep -Eq '^ startup_guard_max_age_seconds:[[:space:]]*[0-9]+' "${CONF_DIR}/hecate.yaml"; then
sed -Ei '/^ command_timeout_seconds:[[:space:]]*[0-9]+/a\ startup_guard_max_age_seconds: 900' "${CONF_DIR}/hecate.yaml"
echo "[install] added coordination.startup_guard_max_age_seconds=900"
changed=1
fi
local default_restore_cp
default_restore_cp="$(first_control_plane_name)"
if [[ -z "${default_restore_cp}" ]]; then
default_restore_cp="titan-0a"
fi
if grep -Eq '^ api_poll_seconds:[[:space:]]*[0-9]+' "${CONF_DIR}/hecate.yaml" \
&& ! grep -Eq '^ auto_etcd_restore_on_api_failure:[[:space:]]*(true|false)' "${CONF_DIR}/hecate.yaml"; then
sed -Ei '/^ api_poll_seconds:[[:space:]]*[0-9]+/a\ auto_etcd_restore_on_api_failure: true\n etcd_restore_control_plane: '"${default_restore_cp}"'' "${CONF_DIR}/hecate.yaml"
echo "[install] added startup.auto_etcd_restore_on_api_failure + startup.etcd_restore_control_plane defaults"
changed=1
fi
local role
role="$(read_hecate_role)"
@ -221,12 +238,25 @@ migrate_hecate_config() {
- titan-0a
- titan-0b
- titan-0c
- titan-04
- titan-05
- titan-06
- titan-07
- titan-08
- titan-09
- titan-10
- titan-11
- titan-12
- titan-13
- titan-14
- titan-15
- titan-17
- titan-18
- titan-22'
- titan-19
- titan-20
- titan-21
- titan-22
- titan-24'
elif [[ "${role}" == "peer" ]]; then
inventory_block='ssh_node_hosts:
titan-db: 192.168.22.10
@ -257,11 +287,23 @@ migrate_hecate_config() {
- titan-0a
- titan-0b
- titan-0c
- titan-04
- titan-05
- titan-06
- titan-07
- titan-08
- titan-09
- titan-10
- titan-11
- titan-12
- titan-13
- titan-14
- titan-15
- titan-17
- titan-18
- titan-19
- titan-20
- titan-21
- titan-22
- titan-24'
fi
@ -280,6 +322,11 @@ migrate_hecate_config() {
echo "[install] hydrated ssh_managed_nodes inventory for role=${role}"
changed=1
fi
if ! grep -Eq '^ - titan-04$' "${CONF_DIR}/hecate.yaml" || ! grep -Eq '^ - titan-21$' "${CONF_DIR}/hecate.yaml"; then
perl -0pi -e 's#ssh_managed_nodes:\n(?: - .*\n)*#'"${managed_block}"'\n#s' "${CONF_DIR}/hecate.yaml"
echo "[install] refreshed ssh_managed_nodes coverage for role=${role}"
changed=1
fi
fi
if [[ "${role}" == "peer" ]]; then
@ -287,10 +334,22 @@ migrate_hecate_config() {
&& grep -Eq '^ - titan-db$' "${CONF_DIR}/hecate.yaml" \
&& grep -Eq '^ - titan-24$' "${CONF_DIR}/hecate.yaml" \
&& ! grep -Eq '^ - titan-0a$' "${CONF_DIR}/hecate.yaml"; then
perl -0pi -e 's#ssh_managed_nodes:\n - titan-db\n - titan-24\n#ssh_managed_nodes:\n - titan-db\n - titan-0a\n - titan-0b\n - titan-0c\n - titan-12\n - titan-14\n - titan-15\n - titan-17\n - titan-18\n - titan-22\n - titan-24\n#s' "${CONF_DIR}/hecate.yaml"
perl -0pi -e 's#ssh_managed_nodes:\n - titan-db\n - titan-24\n#ssh_managed_nodes:\n - titan-db\n - titan-0a\n - titan-0b\n - titan-0c\n - titan-04\n - titan-05\n - titan-06\n - titan-07\n - titan-08\n - titan-09\n - titan-10\n - titan-11\n - titan-12\n - titan-13\n - titan-14\n - titan-15\n - titan-17\n - titan-18\n - titan-19\n - titan-20\n - titan-21\n - titan-22\n - titan-24\n#s' "${CONF_DIR}/hecate.yaml"
echo "[install] expanded peer ssh_managed_nodes for bootstrap fallback coverage"
changed=1
fi
if ! grep -Eq '^ - services/gitea$' "${CONF_DIR}/hecate.yaml"; then
perl -0pi -e 's#local_bootstrap_paths:\n(?: - .*\n)*#local_bootstrap_paths:\n - infrastructure/core\n - clusters/atlas/flux-system\n - infrastructure/sources/helm\n - infrastructure/metallb\n - infrastructure/traefik\n - infrastructure/vault-csi\n - infrastructure/vault-injector\n - services/vault\n - infrastructure/postgres\n - services/gitea\n#s' "${CONF_DIR}/hecate.yaml"
echo "[install] refreshed peer local_bootstrap_paths for full fallback bootstrap parity"
changed=1
fi
if perl -0777 -ne 'exit(!(/local_bootstrap_paths:\n - infrastructure\/core\n/s))' "${CONF_DIR}/hecate.yaml"; then
perl -0pi -e 's#local_bootstrap_paths:\n - infrastructure/core\n#local_bootstrap_paths:\n - infrastructure/core\n - clusters/atlas/flux-system\n - infrastructure/sources/helm\n - infrastructure/metallb\n - infrastructure/traefik\n - infrastructure/vault-csi\n - infrastructure/vault-injector\n - services/vault\n - infrastructure/postgres\n - services/gitea\n#s' "${CONF_DIR}/hecate.yaml"
echo "[install] expanded peer local_bootstrap_paths for full fallback bootstrap parity"
changed=1
fi
fi
if [[ "${changed}" -eq 1 ]]; then