diff --git a/README.md b/README.md
index ddf222f..87a8560 100644
--- a/README.md
+++ b/README.md
@@ -90,7 +90,7 @@ See `configs/hecate.example.yaml`.
 UPS auto-shutdown trigger uses:
 
 - runtime threshold = `runtime_safety_factor * estimated_shutdown_budget`
-- default safety factor `1.10`
+- default safety factor `1.25`
 - debounce across multiple polls to avoid noise
 
 Estimated shutdown budget is derived from historical successful shutdown runs (`/var/lib/hecate/runs.json`) with default fallback from config.
@@ -106,6 +106,16 @@ Power metrics:
 - `hecate-bootstrap.service` is enabled to run at host boot and perform staged startup automatically.
 - `HECATE_ENABLE_BOOTSTRAP=1` forces bootstrap on, `HECATE_ENABLE_BOOTSTRAP=0` forces it off, and `auto` enables by default.
 - `hecate-update.timer` runs on boot and periodically to pull latest `main` and reinstall Hecate declaratively.
+- Peer startup fallback now checks coordinator intent/bootstrap activity before allowing local startup.
+- Automatic etcd recovery can run during startup if the API never becomes reachable (`startup.auto_etcd_restore_on_api_failure`).
+
+## Etcd Recovery
+
+- Manual: `hecate etcd-restore --config /etc/hecate/hecate.yaml --execute`
+- Optional snapshot override: `--snapshot <path>` (defaults to the latest snapshot under `/var/lib/rancher/k3s/server/db/snapshots/` on the selected control plane)
+- Startup can automatically invoke the same restore path after API timeout using:
+  - `startup.auto_etcd_restore_on_api_failure: true`
+  - `startup.etcd_restore_control_plane: titan-0a`
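+
+A minimal `startup` stanza enabling automatic recovery (values mirror `configs/hecate.example.yaml`):
+
+```yaml
+startup:
+  api_wait_seconds: 1200
+  api_poll_seconds: 2
+  auto_etcd_restore_on_api_failure: true
+  etcd_restore_control_plane: titan-0a
+```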
 
 ## Disruptive startup drills
 
diff --git a/cmd/hecate/main.go b/cmd/hecate/main.go
index 70ebabd..eae3341 100644
--- a/cmd/hecate/main.go
+++ b/cmd/hecate/main.go
@@ -2,6 +2,7 @@ package main
 
 import (
 	"context"
+	"encoding/json"
 	"errors"
 	"flag"
 	"fmt"
@@ -41,6 +42,11 @@ func main() {
 			logger.Printf("shutdown failed: %v", err)
 			os.Exit(1)
 		}
+	case "etcd-restore":
+		if err := runEtcdRestore(logger, os.Args[2:]); err != nil {
+			logger.Printf("etcd-restore failed: %v", err)
+			os.Exit(1)
+		}
 	case "daemon":
 		if err := runDaemon(logger, os.Args[2:]); err != nil {
 			logger.Printf("daemon failed: %v", err)
@@ -96,6 +102,15 @@ func runStartup(logger *log.Logger, args []string) error {
 			logger.Printf("peer startup handoff complete; skipping local startup")
 			return nil
 		}
+		guardCtx, guardCancel := context.WithTimeout(context.Background(), time.Duration(maxInt(cfg.Coordination.CommandTimeoutSeconds, 15))*time.Second)
+		defer guardCancel()
+		allowed, guardReason, guardErr := coordinatorAllowsPeerFallbackStartup(guardCtx, cfg, logger)
+		if guardErr != nil {
+			return fmt.Errorf("startup blocked: unable to evaluate coordinator startup guard: %w", guardErr)
+		}
+		if !allowed {
+			return fmt.Errorf("startup blocked: coordinator guard disallowed peer fallback (%s)", guardReason)
+		}
 		logger.Printf("peer startup handoff unavailable; proceeding with local peer startup fallback")
 		allowPeer = true
 	} else {
@@ -174,6 +189,26 @@ func runDaemon(logger *log.Logger, args []string) error {
 	return nil
 }
 
+func runEtcdRestore(logger *log.Logger, args []string) error {
+	fs := flag.NewFlagSet("etcd-restore", flag.ExitOnError)
+	configPath := fs.String("config", "/etc/hecate/hecate.yaml", "Path to config file")
+	execute := fs.Bool("execute", false, "Actually execute restore (default dry-run)")
+	controlPlane := fs.String("control-plane", "", "Control plane to run restore on (defaults to startup.etcd_restore_control_plane)")
+	snapshotPath := fs.String("snapshot", "", "Explicit snapshot path (defaults to latest on selected control plane)")
+	_ = fs.Parse(args)
+
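+	// Dry-run by default: the orchestrator is built with dryRun = !*execute,
+	// so restore actions are only logged until --execute is passed.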
"manual-clear" --execute @@ -358,30 +436,7 @@ func tryPeerBootstrapHandoff(ctx context.Context, cfg config.Config, logger *log target = user + "@" + host } - args := []string{ - "-o", "BatchMode=yes", - "-o", "ConnectTimeout=8", - "-o", "StrictHostKeyChecking=accept-new", - } - if cfgPath := resolveSSHConfigFile(cfg); cfgPath != "" { - args = append(args, "-F", cfgPath) - } - if idPath := resolveSSHIdentityFile(cfg); idPath != "" { - args = append(args, "-i", idPath) - } - if cfg.SSHPort > 0 { - args = append(args, "-p", strconv.Itoa(cfg.SSHPort)) - } - if cfg.SSHJumpHost != "" { - jump := cfg.SSHJumpHost - if cfg.SSHJumpUser != "" { - jump = cfg.SSHJumpUser + "@" + jump - } - if cfg.SSHPort > 0 && !strings.Contains(jump, ":") { - jump = fmt.Sprintf("%s:%d", jump, cfg.SSHPort) - } - args = append(args, "-J", jump) - } + args := buildSSHBaseArgs(cfg) remote := "sudo -n systemctl start hecate-bootstrap.service" attempt := 1 @@ -409,6 +464,116 @@ func tryPeerBootstrapHandoff(ctx context.Context, cfg config.Config, logger *log } } +func coordinatorAllowsPeerFallbackStartup(ctx context.Context, cfg config.Config, logger *log.Logger) (bool, string, error) { + coordinator := strings.TrimSpace(cfg.Coordination.ForwardShutdownHost) + if coordinator == "" { + return true, "no coordinator configured", nil + } + user := strings.TrimSpace(cfg.Coordination.ForwardShutdownUser) + if user == "" { + if override, ok := cfg.SSHNodeUsers[coordinator]; ok && strings.TrimSpace(override) != "" { + user = strings.TrimSpace(override) + } else { + user = strings.TrimSpace(cfg.SSHUser) + } + } + host := coordinator + if mapped, ok := cfg.SSHNodeHosts[coordinator]; ok && strings.TrimSpace(mapped) != "" { + host = strings.TrimSpace(mapped) + } + target := host + if user != "" { + target = user + "@" + host + } + remoteCmd := "sudo -n sh -lc 'if systemctl is-active --quiet hecate-bootstrap.service; then echo __HECATE_BOOTSTRAP_ACTIVE__; else echo __HECATE_BOOTSTRAP_IDLE__; fi; if [ -s /var/lib/hecate/intent.json ]; then cat /var/lib/hecate/intent.json; else echo \"{}\"; fi'" + args := append(buildSSHBaseArgs(cfg), target, remoteCmd) + cmd := exec.CommandContext(ctx, "ssh", args...) 
+	out, err := cmd.CombinedOutput()
+	if err != nil {
+		trimmed := strings.TrimSpace(string(out))
+		if trimmed == "" {
+			logger.Printf("warning: coordinator guard check unavailable on %s: %v; allowing peer fallback startup", coordinator, err)
+		} else {
+			logger.Printf("warning: coordinator guard check unavailable on %s: %v: %s; allowing peer fallback startup", coordinator, err, trimmed)
+		}
+		return true, "coordinator unreachable", nil
+	}
+	trimmed := strings.TrimSpace(string(out))
+	if strings.Contains(trimmed, "__HECATE_BOOTSTRAP_ACTIVE__") {
+		return false, "coordinator bootstrap service is active", nil
+	}
+	start := strings.Index(trimmed, "{")
+	end := strings.LastIndex(trimmed, "}")
+	if start < 0 || end < start {
+		return false, "coordinator intent payload missing", nil
+	}
+	rawIntent := trimmed[start : end+1]
+	var remoteIntent state.Intent
+	if err := json.Unmarshal([]byte(rawIntent), &remoteIntent); err != nil {
+		return false, "", fmt.Errorf("decode coordinator intent: %w", err)
+	}
+	if remoteIntent.State == "" || remoteIntent.State == state.IntentNormal {
+		return true, "coordinator intent is normal", nil
+	}
+	guardAge := time.Duration(maxInt(cfg.Coordination.StartupGuardMaxAgeSec, 60)) * time.Second
+	intentAge := time.Duration(0)
+	if !remoteIntent.UpdatedAt.IsZero() {
+		intentAge = time.Since(remoteIntent.UpdatedAt)
+	}
+	switch remoteIntent.State {
+	case state.IntentShuttingDown:
+		if remoteIntent.UpdatedAt.IsZero() || intentAge <= guardAge {
+			return false, fmt.Sprintf("coordinator intent=%s age=%s reason=%q", remoteIntent.State, intentAge.Round(time.Second), remoteIntent.Reason), nil
+		}
+		logger.Printf("warning: coordinator shutdown intent appears stale (age=%s > guard=%s); allowing peer fallback startup", intentAge.Round(time.Second), guardAge)
+		return true, "coordinator shutdown intent stale", nil
+	case state.IntentStartupInProgress:
+		if remoteIntent.UpdatedAt.IsZero() || intentAge <= guardAge {
+			return false, fmt.Sprintf("coordinator intent=%s age=%s reason=%q", remoteIntent.State, intentAge.Round(time.Second), remoteIntent.Reason), nil
+		}
+		logger.Printf("warning: coordinator startup intent appears stale (age=%s > guard=%s); allowing peer fallback startup", intentAge.Round(time.Second), guardAge)
+		return true, "coordinator startup intent stale", nil
+	case state.IntentShutdownComplete:
+		if remoteIntent.UpdatedAt.IsZero() {
+			return false, "coordinator reported shutdown_complete with unknown age", nil
+		}
+		if intentAge <= 45*time.Second {
+			return false, fmt.Sprintf("coordinator recently completed shutdown (%s ago)", intentAge.Round(time.Second)), nil
+		}
+		return true, "coordinator shutdown_complete is old enough", nil
+	default:
+		return false, fmt.Sprintf("coordinator intent state %q is unknown", remoteIntent.State), nil
+	}
+}
+
+func buildSSHBaseArgs(cfg config.Config) []string {
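+	// Non-interactive SSH base: BatchMode never prompts, ConnectTimeout bounds
+	// probes, and accept-new pins previously unseen host keys; config file,
+	// identity, port, and jump host are appended only when configured.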
+	args := []string{
+		"-o", "BatchMode=yes",
+		"-o", "ConnectTimeout=8",
+		"-o", "StrictHostKeyChecking=accept-new",
+	}
+	if cfgPath := resolveSSHConfigFile(cfg); cfgPath != "" {
+		args = append(args, "-F", cfgPath)
+	}
+	if idPath := resolveSSHIdentityFile(cfg); idPath != "" {
+		args = append(args, "-i", idPath)
+	}
+	if cfg.SSHPort > 0 {
+		args = append(args, "-p", strconv.Itoa(cfg.SSHPort))
+	}
+	if cfg.SSHJumpHost != "" {
+		jump := cfg.SSHJumpHost
+		if cfg.SSHJumpUser != "" {
+			jump = cfg.SSHJumpUser + "@" + jump
+		}
+		if cfg.SSHPort > 0 && !strings.Contains(jump, ":") {
+			jump = fmt.Sprintf("%s:%d", jump, cfg.SSHPort)
+		}
+		args = append(args, "-J", jump)
+	}
+	return args
+}
+
 func resolveSSHConfigFile(cfg config.Config) string {
 	if strings.TrimSpace(cfg.SSHConfigFile) != "" {
 		return strings.TrimSpace(cfg.SSHConfigFile)
diff --git a/configs/hecate.example.yaml b/configs/hecate.example.yaml
index 947abf0..382cb41 100644
--- a/configs/hecate.example.yaml
+++ b/configs/hecate.example.yaml
@@ -42,6 +42,8 @@ excluded_namespaces:
 startup:
   api_wait_seconds: 1200
   api_poll_seconds: 2
+  auto_etcd_restore_on_api_failure: true
+  etcd_restore_control_plane: titan-0a
 shutdown:
   default_budget_seconds: 1380
   skip_etcd_snapshot: false
@@ -71,6 +73,7 @@ coordination:
   forward_shutdown_config: /etc/hecate/hecate.yaml
   fallback_local_shutdown: true
   command_timeout_seconds: 25
+  startup_guard_max_age_seconds: 900
   role: coordinator
   allow_startup_on_battery: false
 metrics:
diff --git a/configs/hecate.tethys.yaml b/configs/hecate.tethys.yaml
index ab44955..a24b0a1 100644
--- a/configs/hecate.tethys.yaml
+++ b/configs/hecate.tethys.yaml
@@ -35,11 +35,23 @@ ssh_managed_nodes:
   - titan-0a
   - titan-0b
   - titan-0c
+  - titan-04
+  - titan-05
+  - titan-06
+  - titan-07
+  - titan-08
+  - titan-09
+  - titan-10
+  - titan-11
   - titan-12
+  - titan-13
   - titan-14
   - titan-15
   - titan-17
   - titan-18
+  - titan-19
+  - titan-20
+  - titan-21
   - titan-22
   - titan-24
 ssh_jump_host: ""
@@ -53,6 +65,15 @@ control_planes:
 workers: []
 local_bootstrap_paths:
   - infrastructure/core
+  - clusters/atlas/flux-system
+  - infrastructure/sources/helm
+  - infrastructure/metallb
+  - infrastructure/traefik
+  - infrastructure/vault-csi
+  - infrastructure/vault-injector
+  - services/vault
+  - infrastructure/postgres
+  - services/gitea
 excluded_namespaces:
   - kube-system
   - kube-public
@@ -68,6 +89,8 @@ excluded_namespaces:
 startup:
   api_wait_seconds: 1200
   api_poll_seconds: 2
+  auto_etcd_restore_on_api_failure: true
+  etcd_restore_control_plane: titan-0a
 shutdown:
   default_budget_seconds: 1380
   skip_etcd_snapshot: false
@@ -95,6 +118,7 @@ coordination:
   forward_shutdown_config: /etc/hecate/hecate.yaml
   fallback_local_shutdown: false
   command_timeout_seconds: 25
+  startup_guard_max_age_seconds: 900
   role: peer
   allow_startup_on_battery: false
 metrics:
diff --git a/configs/hecate.titan-db.yaml b/configs/hecate.titan-db.yaml
index a0c0a7e..2e8bd68 100644
--- a/configs/hecate.titan-db.yaml
+++ b/configs/hecate.titan-db.yaml
@@ -35,12 +35,25 @@ ssh_managed_nodes:
   - titan-0a
   - titan-0b
   - titan-0c
+  - titan-04
+  - titan-05
+  - titan-06
+  - titan-07
+  - titan-08
+  - titan-09
+  - titan-10
+  - titan-11
   - titan-12
+  - titan-13
   - titan-14
   - titan-15
   - titan-17
   - titan-18
+  - titan-19
+  - titan-20
+  - titan-21
   - titan-22
+  - titan-24
 ssh_jump_host: ""
 ssh_jump_user: ""
 iac_repo_path: /opt/titan-iac
@@ -76,6 +89,8 @@ excluded_namespaces:
 startup:
   api_wait_seconds: 1200
   api_poll_seconds: 2
+  auto_etcd_restore_on_api_failure: true
+  etcd_restore_control_plane: titan-0a
 shutdown:
   default_budget_seconds: 1380
   skip_etcd_snapshot: false
@@ -104,6 +119,7 @@ coordination:
   forward_shutdown_config: /etc/hecate/hecate.yaml
   fallback_local_shutdown: true
   command_timeout_seconds: 25
+  startup_guard_max_age_seconds: 900
   role: coordinator
   allow_startup_on_battery: false
 metrics:
diff --git a/internal/cluster/orchestrator.go b/internal/cluster/orchestrator.go
index 5af1051..962a415 100644
--- a/internal/cluster/orchestrator.go
+++ b/internal/cluster/orchestrator.go
@@ -39,6 +39,11 @@ type ShutdownOptions struct {
 	Reason string
 }
 
+type EtcdRestoreOptions struct {
+	ControlPlane string
+	SnapshotPath string
+}
+
 type startupWorkload struct {
 	Namespace string
 	Kind      string
@@ -121,7 +126,20 @@ func (o *Orchestrator) Startup(ctx context.Context, opts StartupOptions) (err er
 		apiAttempts = 1
 	}
 	if err := o.waitForAPI(ctx, apiAttempts, apiPoll); err != nil {
-		return err
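+		// Last resort: when enabled, run the same restore path as
+		// `hecate etcd-restore` against the configured (or first) control
+		// plane, then wait for the API once more before failing startup.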
+		if !o.cfg.Startup.AutoEtcdRestoreOnAPIFailure {
+			return err
+		}
+		cp := strings.TrimSpace(o.cfg.Startup.EtcdRestoreControlPlane)
+		if cp == "" && len(o.cfg.ControlPlanes) > 0 {
+			cp = o.cfg.ControlPlanes[0]
+		}
+		o.log.Printf("warning: initial API wait failed (%v); attempting automatic etcd restore on %s", err, cp)
+		if restoreErr := o.EtcdRestore(ctx, EtcdRestoreOptions{ControlPlane: cp}); restoreErr != nil {
+			return fmt.Errorf("kubernetes API did not become reachable and automatic etcd restore failed: %w", restoreErr)
+		}
+		if err := o.waitForAPI(ctx, apiAttempts, apiPoll); err != nil {
+			return fmt.Errorf("kubernetes API did not become reachable after automatic etcd restore: %w", err)
+		}
 	}
 
 	workers, err := o.effectiveWorkers(ctx)
@@ -200,6 +218,72 @@ func (o *Orchestrator) Startup(ctx context.Context, opts StartupOptions) (err er
 	return nil
 }
 
+func (o *Orchestrator) EtcdRestore(ctx context.Context, opts EtcdRestoreOptions) error {
+	controlPlane := strings.TrimSpace(opts.ControlPlane)
+	if controlPlane == "" {
+		if len(o.cfg.ControlPlanes) == 0 {
+			return fmt.Errorf("cannot restore etcd: no control planes configured")
+		}
+		controlPlane = o.cfg.ControlPlanes[0]
+	}
+	found := false
+	for _, cp := range o.cfg.ControlPlanes {
+		if cp == controlPlane {
+			found = true
+			break
+		}
+	}
+	if !found {
+		return fmt.Errorf("cannot restore etcd: control plane %s is not in configured control_planes", controlPlane)
+	}
+	if !o.sshManaged(controlPlane) {
+		return fmt.Errorf("cannot restore etcd on %s: node not in ssh_managed_nodes", controlPlane)
+	}
+
+	snapshotPath := strings.TrimSpace(opts.SnapshotPath)
+	if snapshotPath == "" {
+		resolved, err := o.latestEtcdSnapshotPath(ctx, controlPlane)
+		if err != nil {
+			return err
+		}
+		snapshotPath = resolved
+	}
+	o.log.Printf("etcd restore target=%s snapshot=%s", controlPlane, snapshotPath)
+	if o.runner.DryRun {
+		return nil
+	}
+
+	for _, cp := range o.cfg.ControlPlanes {
+		cp := cp
+		o.bestEffort("stop k3s before etcd restore on "+cp, func() error {
+			_, err := o.ssh(ctx, cp, "sudo systemctl stop k3s || true")
+			return err
+		})
+	}
+
+	restoreCmd := fmt.Sprintf("sudo k3s server --cluster-reset --cluster-reset-restore-path %q", snapshotPath)
+	if _, err := o.ssh(ctx, controlPlane, restoreCmd); err != nil {
+		return fmt.Errorf("etcd restore command failed on %s: %w", controlPlane, err)
+	}
+	o.log.Printf("etcd restore command completed on %s", controlPlane)
+
+	if _, err := o.ssh(ctx, controlPlane, "sudo systemctl start k3s || true"); err != nil {
+		return fmt.Errorf("failed to start k3s on restore control plane %s: %w", controlPlane, err)
+	}
+	time.Sleep(10 * time.Second)
+	for _, cp := range o.cfg.ControlPlanes {
+		cp := cp
+		if cp == controlPlane {
+			continue
+		}
+		o.bestEffort("start k3s after etcd restore on "+cp, func() error {
+			_, err := o.ssh(ctx, cp, "sudo systemctl start k3s || true")
+			return err
+		})
+	}
+	return nil
+}
+
 func (o *Orchestrator) Shutdown(ctx context.Context, opts ShutdownOptions) (err error) {
 	unlock, err := state.AcquireLock(o.cfg.State.LockPath)
 	if err != nil {
@@ -731,6 +815,22 @@ func (o *Orchestrator) takeEtcdSnapshot(ctx context.Context, node string) error
 	return err
 }
 
+func (o *Orchestrator) latestEtcdSnapshotPath(ctx context.Context, node string) (string, error) {
+	if !o.sshManaged(node) {
+		return "", fmt.Errorf("cannot resolve etcd snapshot on %s: node not in ssh_managed_nodes", node)
+	}
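+	// Pick the most recently modified snapshot on the node
+	// (ls -1t | head -n 1) from the default k3s snapshot directory.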
+	cmd := `sudo sh -lc 'ls -1t /var/lib/rancher/k3s/server/db/snapshots/* 2>/dev/null | head -n 1'`
+	out, err := o.ssh(ctx, node, cmd)
+	if err != nil {
+		return "", fmt.Errorf("resolve latest etcd snapshot on %s: %w", node, err)
+	}
+	snapshot := strings.TrimSpace(out)
+	if snapshot == "" {
+		return "", fmt.Errorf("no etcd snapshots found on %s under /var/lib/rancher/k3s/server/db/snapshots", node)
+	}
+	return snapshot, nil
+}
+
 func (o *Orchestrator) waitForAPI(ctx context.Context, attempts int, sleep time.Duration) error {
 	if o.runner.DryRun {
 		return nil
diff --git a/internal/config/config.go b/internal/config/config.go
index 4077f25..40c1344 100644
--- a/internal/config/config.go
+++ b/internal/config/config.go
@@ -33,8 +33,10 @@ type Config struct {
 }
 
 type Startup struct {
-	APIWaitSeconds int `yaml:"api_wait_seconds"`
-	APIPollSeconds int `yaml:"api_poll_seconds"`
+	APIWaitSeconds              int    `yaml:"api_wait_seconds"`
+	APIPollSeconds              int    `yaml:"api_poll_seconds"`
+	AutoEtcdRestoreOnAPIFailure bool   `yaml:"auto_etcd_restore_on_api_failure"`
+	EtcdRestoreControlPlane     string `yaml:"etcd_restore_control_plane"`
 }
 
 type Shutdown struct {
@@ -72,6 +74,7 @@ type Coordination struct {
 	ForwardShutdownConfig string `yaml:"forward_shutdown_config"`
 	FallbackLocalShutdown bool   `yaml:"fallback_local_shutdown"`
 	CommandTimeoutSeconds int    `yaml:"command_timeout_seconds"`
+	StartupGuardMaxAgeSec int    `yaml:"startup_guard_max_age_seconds"`
 	Role                  string `yaml:"role"`
 	AllowStartupOnBattery bool   `yaml:"allow_startup_on_battery"`
 }
@@ -135,6 +138,18 @@ func (c Config) Validate() error {
 	if c.Startup.APIPollSeconds <= 0 {
 		return fmt.Errorf("config.startup.api_poll_seconds must be > 0")
 	}
+	if c.Startup.EtcdRestoreControlPlane != "" {
+		found := false
+		for _, cp := range c.ControlPlanes {
+			if cp == c.Startup.EtcdRestoreControlPlane {
+				found = true
+				break
+			}
+		}
+		if !found {
+			return fmt.Errorf("config.startup.etcd_restore_control_plane must be one of config.control_planes when set")
+		}
+	}
 	if c.SSHPort <= 0 || c.SSHPort > 65535 {
 		return fmt.Errorf("config.ssh_port must be in range 1-65535")
 	}
@@ -156,6 +171,9 @@ func (c Config) Validate() error {
 			return fmt.Errorf("config.coordination.forward_shutdown_config must not be empty when forward_shutdown_host is set")
 		}
 	}
+	if c.Coordination.StartupGuardMaxAgeSec <= 0 {
+		return fmt.Errorf("config.coordination.startup_guard_max_age_seconds must be > 0")
+	}
 	if c.Coordination.Role != "coordinator" && c.Coordination.Role != "peer" {
 		return fmt.Errorf("config.coordination.role must be coordinator or peer")
 	}
@@ -200,8 +218,10 @@ func defaults() Config {
 			"maintenance",
 		},
 		Startup: Startup{
-			APIWaitSeconds: 1200,
-			APIPollSeconds: 2,
+			APIWaitSeconds:              1200,
+			APIPollSeconds:              2,
+			AutoEtcdRestoreOnAPIFailure: true,
+			EtcdRestoreControlPlane:     "titan-0a",
 		},
 		Shutdown: Shutdown{
 			DefaultBudgetSeconds: 1380,
@@ -224,6 +244,7 @@ func defaults() Config {
 			ForwardShutdownConfig: "/etc/hecate/hecate.yaml",
 			FallbackLocalShutdown: true,
 			CommandTimeoutSeconds: 25,
+			StartupGuardMaxAgeSec: 900,
 			Role:                  "coordinator",
 			AllowStartupOnBattery: false,
 		},
@@ -256,6 +277,9 @@ func (c *Config) applyDefaults() {
 	if c.Startup.APIPollSeconds <= 0 {
 		c.Startup.APIPollSeconds = 2
 	}
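+	// Fall back to the first configured control plane when no explicit
+	// startup.etcd_restore_control_plane is set.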
+	if c.Startup.EtcdRestoreControlPlane == "" && len(c.ControlPlanes) > 0 {
+		c.Startup.EtcdRestoreControlPlane = c.ControlPlanes[0]
+	}
 	if c.SSHPort <= 0 {
 		c.SSHPort = 2277
 	}
@@ -292,6 +316,9 @@
 	if c.Coordination.CommandTimeoutSeconds <= 0 {
 		c.Coordination.CommandTimeoutSeconds = 25
 	}
+	if c.Coordination.StartupGuardMaxAgeSec <= 0 {
+		c.Coordination.StartupGuardMaxAgeSec = 900
+	}
 	if c.Coordination.Role == "" {
 		c.Coordination.Role = "coordinator"
 	}
diff --git a/internal/config/config_test.go b/internal/config/config_test.go
index bce7fd9..d08dfb7 100644
--- a/internal/config/config_test.go
+++ b/internal/config/config_test.go
@@ -55,3 +55,41 @@ func TestValidateRejectsUnknownRole(t *testing.T) {
 		t.Fatalf("expected validation error for unknown coordination role")
 	}
 }
+
+func TestValidateRejectsUnknownEtcdRestoreControlPlane(t *testing.T) {
+	cfg := defaults()
+	cfg.Startup.EtcdRestoreControlPlane = "titan-missing"
+	if err := cfg.Validate(); err == nil {
+		t.Fatalf("expected validation error for unknown etcd restore control plane")
+	}
+}
+
+func TestLoadSetsCoordinationGuardDefaults(t *testing.T) {
+	tmp := t.TempDir()
+	cfgPath := filepath.Join(tmp, "hecate.yaml")
+	raw := `
+control_planes: [titan-0a, titan-0b, titan-0c]
+expected_flux_branch: main
+iac_repo_path: /opt/titan-iac
+coordination:
+  role: coordinator
+ups:
+  enabled: false
+state:
+  run_history_path: /tmp/runs.json
+  lock_path: /tmp/hecate.lock
+`
+	if err := os.WriteFile(cfgPath, []byte(strings.TrimSpace(raw)), 0o644); err != nil {
+		t.Fatalf("write config: %v", err)
+	}
+	cfg, err := Load(cfgPath)
+	if err != nil {
+		t.Fatalf("load config: %v", err)
+	}
+	if cfg.Coordination.StartupGuardMaxAgeSec <= 0 {
+		t.Fatalf("expected startup guard max age default > 0, got %d", cfg.Coordination.StartupGuardMaxAgeSec)
+	}
+	if cfg.Startup.EtcdRestoreControlPlane == "" {
+		t.Fatalf("expected startup etcd restore control plane default to be set")
+	}
+}
diff --git a/scripts/install.sh b/scripts/install.sh
index e7cffcd..26017fb 100755
--- a/scripts/install.sh
+++ b/scripts/install.sh
@@ -186,6 +186,23 @@ migrate_hecate_config() {
     echo "[install] migrated ssh_node_users titan-24 override to atlas"
     changed=1
   fi
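+  # One-shot config migrations: append the new coordination guard and startup
+  # etcd-restore keys only when they are not already present.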
+  if grep -Eq '^  command_timeout_seconds:[[:space:]]*[0-9]+' "${CONF_DIR}/hecate.yaml" \
+    && ! grep -Eq '^  startup_guard_max_age_seconds:[[:space:]]*[0-9]+' "${CONF_DIR}/hecate.yaml"; then
+    sed -Ei '/^  command_timeout_seconds:[[:space:]]*[0-9]+/a\  startup_guard_max_age_seconds: 900' "${CONF_DIR}/hecate.yaml"
+    echo "[install] added coordination.startup_guard_max_age_seconds=900"
+    changed=1
+  fi
+  local default_restore_cp
+  default_restore_cp="$(first_control_plane_name)"
+  if [[ -z "${default_restore_cp}" ]]; then
+    default_restore_cp="titan-0a"
+  fi
+  if grep -Eq '^  api_poll_seconds:[[:space:]]*[0-9]+' "${CONF_DIR}/hecate.yaml" \
+    && ! grep -Eq '^  auto_etcd_restore_on_api_failure:[[:space:]]*(true|false)' "${CONF_DIR}/hecate.yaml"; then
+    sed -Ei '/^  api_poll_seconds:[[:space:]]*[0-9]+/a\  auto_etcd_restore_on_api_failure: true\n  etcd_restore_control_plane: '"${default_restore_cp}"'' "${CONF_DIR}/hecate.yaml"
+    echo "[install] added startup.auto_etcd_restore_on_api_failure + startup.etcd_restore_control_plane defaults"
+    changed=1
+  fi
 
   local role
   role="$(read_hecate_role)"
@@ -221,12 +238,25 @@
   - titan-0a
   - titan-0b
   - titan-0c
+  - titan-04
+  - titan-05
+  - titan-06
+  - titan-07
+  - titan-08
+  - titan-09
+  - titan-10
+  - titan-11
   - titan-12
+  - titan-13
   - titan-14
   - titan-15
   - titan-17
   - titan-18
-  - titan-22'
+  - titan-19
+  - titan-20
+  - titan-21
+  - titan-22
+  - titan-24'
   elif [[ "${role}" == "peer" ]]; then
     inventory_block='ssh_node_hosts:
   titan-db: 192.168.22.10
@@ -257,11 +287,23 @@
   - titan-0a
   - titan-0b
   - titan-0c
+  - titan-04
+  - titan-05
+  - titan-06
+  - titan-07
+  - titan-08
+  - titan-09
+  - titan-10
+  - titan-11
   - titan-12
+  - titan-13
   - titan-14
   - titan-15
   - titan-17
   - titan-18
+  - titan-19
+  - titan-20
+  - titan-21
   - titan-22
   - titan-24'
   fi
@@ -280,6 +322,11 @@
       echo "[install] hydrated ssh_managed_nodes inventory for role=${role}"
       changed=1
     fi
+    if ! grep -Eq '^  - titan-04$' "${CONF_DIR}/hecate.yaml" || ! grep -Eq '^  - titan-21$' "${CONF_DIR}/hecate.yaml"; then
+      perl -0pi -e 's#ssh_managed_nodes:\n(?:  - .*\n)*#'"${managed_block}"'\n#s' "${CONF_DIR}/hecate.yaml"
+      echo "[install] refreshed ssh_managed_nodes coverage for role=${role}"
+      changed=1
+    fi
   fi
 
   if [[ "${role}" == "peer" ]]; then
@@ -287,10 +334,22 @@
       && grep -Eq '^  - titan-db$' "${CONF_DIR}/hecate.yaml" \
       && grep -Eq '^  - titan-24$' "${CONF_DIR}/hecate.yaml" \
       && ! grep -Eq '^  - titan-0a$' "${CONF_DIR}/hecate.yaml"; then
-      perl -0pi -e 's#ssh_managed_nodes:\n  - titan-db\n  - titan-24\n#ssh_managed_nodes:\n  - titan-db\n  - titan-0a\n  - titan-0b\n  - titan-0c\n  - titan-12\n  - titan-14\n  - titan-15\n  - titan-17\n  - titan-18\n  - titan-22\n  - titan-24\n#s' "${CONF_DIR}/hecate.yaml"
+      perl -0pi -e 's#ssh_managed_nodes:\n  - titan-db\n  - titan-24\n#ssh_managed_nodes:\n  - titan-db\n  - titan-0a\n  - titan-0b\n  - titan-0c\n  - titan-04\n  - titan-05\n  - titan-06\n  - titan-07\n  - titan-08\n  - titan-09\n  - titan-10\n  - titan-11\n  - titan-12\n  - titan-13\n  - titan-14\n  - titan-15\n  - titan-17\n  - titan-18\n  - titan-19\n  - titan-20\n  - titan-21\n  - titan-22\n  - titan-24\n#s' "${CONF_DIR}/hecate.yaml"
       echo "[install] expanded peer ssh_managed_nodes for bootstrap fallback coverage"
      changed=1
     fi
+
+    if ! grep -Eq '^  - services/gitea$' "${CONF_DIR}/hecate.yaml"; then
+      perl -0pi -e 's#local_bootstrap_paths:\n(?:  - .*\n)*#local_bootstrap_paths:\n  - infrastructure/core\n  - clusters/atlas/flux-system\n  - infrastructure/sources/helm\n  - infrastructure/metallb\n  - infrastructure/traefik\n  - infrastructure/vault-csi\n  - infrastructure/vault-injector\n  - services/vault\n  - infrastructure/postgres\n  - services/gitea\n#s' "${CONF_DIR}/hecate.yaml"
+      echo "[install] refreshed peer local_bootstrap_paths for full fallback bootstrap parity"
+      changed=1
+    fi
+
+    if ! grep -Eq '^  - services/gitea$' "${CONF_DIR}/hecate.yaml" && perl -0777 -ne 'exit(!(/local_bootstrap_paths:\n  - infrastructure\/core\n/s))' "${CONF_DIR}/hecate.yaml"; then
+      perl -0pi -e 's#local_bootstrap_paths:\n  - infrastructure/core\n#local_bootstrap_paths:\n  - infrastructure/core\n  - clusters/atlas/flux-system\n  - infrastructure/sources/helm\n  - infrastructure/metallb\n  - infrastructure/traefik\n  - infrastructure/vault-csi\n  - infrastructure/vault-injector\n  - services/vault\n  - infrastructure/postgres\n  - services/gitea\n#s' "${CONF_DIR}/hecate.yaml"
+      echo "[install] expanded peer local_bootstrap_paths for full fallback bootstrap parity"
+      changed=1
+    fi
   fi
 
   if [[ "${changed}" -eq 1 ]]; then