hecate: harden outage recovery startup and etcd restore
This commit is contained in:
parent
19562d77f7
commit
5d8bfd5de6
12
README.md
12
README.md
@ -90,7 +90,7 @@ See `configs/hecate.example.yaml`.
|
||||
|
||||
UPS auto-shutdown trigger uses:
|
||||
- runtime threshold = `runtime_safety_factor * estimated_shutdown_budget`
|
||||
- default safety factor `1.10`
|
||||
- default safety factor `1.25`
|
||||
- debounce across multiple polls to avoid noise
|
||||
|
||||
Estimated shutdown budget is derived from historical successful shutdown runs (`/var/lib/hecate/runs.json`) with default fallback from config.
|
||||
@ -106,6 +106,16 @@ Power metrics:
|
||||
- `hecate-bootstrap.service` is enabled to run at host boot and perform staged startup automatically.
|
||||
- `HECATE_ENABLE_BOOTSTRAP=1` forces bootstrap on, `HECATE_ENABLE_BOOTSTRAP=0` forces it off, and `auto` enables by default.
|
||||
- `hecate-update.timer` runs on boot and periodically to pull latest `main` and reinstall Hecate declaratively.
|
||||
- Peer startup fallback now checks coordinator intent/bootstrap activity before allowing local startup.
|
||||
- Automatic etcd recovery can run during startup if API never becomes reachable (`startup.auto_etcd_restore_on_api_failure`).
|
||||
|
||||
## Etcd Recovery
|
||||
|
||||
- Manual: `hecate etcd-restore --config /etc/hecate/hecate.yaml --execute`
|
||||
- Optional snapshot override: `--snapshot /var/lib/rancher/k3s/server/db/snapshots/<name>`
|
||||
- Startup can automatically invoke the same restore path after API timeout using:
|
||||
- `startup.auto_etcd_restore_on_api_failure: true`
|
||||
- `startup.etcd_restore_control_plane: <control-plane-node>`
|
||||
|
||||
## Disruptive startup drills
|
||||
|
||||
|
||||
@ -2,6 +2,7 @@ package main
|
||||
|
||||
import (
|
||||
"context"
|
||||
"encoding/json"
|
||||
"errors"
|
||||
"flag"
|
||||
"fmt"
|
||||
@ -41,6 +42,11 @@ func main() {
|
||||
logger.Printf("shutdown failed: %v", err)
|
||||
os.Exit(1)
|
||||
}
|
||||
case "etcd-restore":
|
||||
if err := runEtcdRestore(logger, os.Args[2:]); err != nil {
|
||||
logger.Printf("etcd-restore failed: %v", err)
|
||||
os.Exit(1)
|
||||
}
|
||||
case "daemon":
|
||||
if err := runDaemon(logger, os.Args[2:]); err != nil {
|
||||
logger.Printf("daemon failed: %v", err)
|
||||
@ -96,6 +102,15 @@ func runStartup(logger *log.Logger, args []string) error {
|
||||
logger.Printf("peer startup handoff complete; skipping local startup")
|
||||
return nil
|
||||
}
|
||||
guardCtx, guardCancel := context.WithTimeout(context.Background(), time.Duration(maxInt(cfg.Coordination.CommandTimeoutSeconds, 15))*time.Second)
|
||||
defer guardCancel()
|
||||
allowed, guardReason, guardErr := coordinatorAllowsPeerFallbackStartup(guardCtx, cfg, logger)
|
||||
if guardErr != nil {
|
||||
return fmt.Errorf("startup blocked: unable to evaluate coordinator startup guard: %w", guardErr)
|
||||
}
|
||||
if !allowed {
|
||||
return fmt.Errorf("startup blocked: coordinator guard disallowed peer fallback (%s)", guardReason)
|
||||
}
|
||||
logger.Printf("peer startup handoff unavailable; proceeding with local peer startup fallback")
|
||||
allowPeer = true
|
||||
} else {
|
||||
@ -174,6 +189,26 @@ func runDaemon(logger *log.Logger, args []string) error {
|
||||
return nil
|
||||
}
|
||||
|
||||
// runEtcdRestore implements the `hecate etcd-restore` subcommand: parse flags,
// build an orchestrator (dry-run unless --execute is given), and delegate to
// Orchestrator.EtcdRestore. Empty --control-plane / --snapshot values are
// resolved by EtcdRestore itself (config default node, latest snapshot).
func runEtcdRestore(logger *log.Logger, args []string) error {
	fs := flag.NewFlagSet("etcd-restore", flag.ExitOnError)
	configPath := fs.String("config", "/etc/hecate/hecate.yaml", "Path to config file")
	execute := fs.Bool("execute", false, "Actually execute restore (default dry-run)")
	controlPlane := fs.String("control-plane", "", "Control plane to run restore on (defaults to startup.etcd_restore_control_plane)")
	snapshotPath := fs.String("snapshot", "", "Explicit snapshot path (defaults to latest on selected control plane)")
	// flag.ExitOnError: Parse terminates the process on bad flags, so the
	// returned error can never be observed here.
	_ = fs.Parse(args)

	// Dry-run is the default; --execute flips the orchestrator into real mode.
	_, orch, err := buildOrchestrator(logger, *configPath, !*execute)
	if err != nil {
		return err
	}
	ctx, cancel := context.WithCancel(context.Background())
	defer cancel()
	return orch.EtcdRestore(ctx, cluster.EtcdRestoreOptions{
		ControlPlane: *controlPlane,
		SnapshotPath: *snapshotPath,
	})
}
|
||||
|
||||
func runStatus(logger *log.Logger, args []string) error {
|
||||
fs := flag.NewFlagSet("status", flag.ExitOnError)
|
||||
configPath := fs.String("config", "/etc/hecate/hecate.yaml", "Path to config file")
|
||||
@ -279,12 +314,32 @@ func buildUPSTargets(cfg config.Config) ([]service.Target, error) {
|
||||
}
|
||||
|
||||
func ensureStartupPowerSafe(ctx context.Context, targets []service.Target) error {
|
||||
onBatteryTargets := []string{}
|
||||
type targetState struct {
|
||||
seenGood bool
|
||||
lastErr error
|
||||
}
|
||||
states := make(map[string]*targetState, len(targets))
|
||||
for _, t := range targets {
|
||||
key := t.Name + "|" + t.Target
|
||||
states[key] = &targetState{}
|
||||
}
|
||||
const pollInterval = 3 * time.Second
|
||||
for {
|
||||
onBatteryTargets := []string{}
|
||||
allSeen := true
|
||||
for _, t := range targets {
|
||||
key := t.Name + "|" + t.Target
|
||||
st := states[key]
|
||||
sample, err := t.Provider.Read(ctx)
|
||||
if err != nil {
|
||||
return fmt.Errorf("startup blocked: unable to verify UPS target %s (%s): %w", t.Name, t.Target, err)
|
||||
st.lastErr = err
|
||||
if !st.seenGood {
|
||||
allSeen = false
|
||||
}
|
||||
continue
|
||||
}
|
||||
st.seenGood = true
|
||||
st.lastErr = nil
|
||||
if sample.OnBattery {
|
||||
onBatteryTargets = append(onBatteryTargets, fmt.Sprintf("%s(status=%s runtime_s=%d)", t.Name, sample.RawStatus, sample.RuntimeSeconds))
|
||||
}
|
||||
@ -292,8 +347,29 @@ func ensureStartupPowerSafe(ctx context.Context, targets []service.Target) error
|
||||
if len(onBatteryTargets) > 0 {
|
||||
return fmt.Errorf("startup blocked: UPS is on battery for %s", strings.Join(onBatteryTargets, ", "))
|
||||
}
|
||||
if allSeen {
|
||||
return nil
|
||||
}
|
||||
select {
|
||||
case <-ctx.Done():
|
||||
unverified := make([]string, 0, len(targets))
|
||||
for _, t := range targets {
|
||||
key := t.Name + "|" + t.Target
|
||||
st := states[key]
|
||||
if st.seenGood {
|
||||
continue
|
||||
}
|
||||
if st.lastErr != nil {
|
||||
unverified = append(unverified, fmt.Sprintf("%s(%s): %v", t.Name, t.Target, st.lastErr))
|
||||
} else {
|
||||
unverified = append(unverified, fmt.Sprintf("%s(%s): no telemetry sample yet", t.Name, t.Target))
|
||||
}
|
||||
}
|
||||
return fmt.Errorf("startup blocked: unable to verify UPS telemetry before timeout: %s", strings.Join(unverified, " | "))
|
||||
case <-time.After(pollInterval):
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
func buildOrchestrator(logger *log.Logger, cfgPath string, dryRun bool) (config.Config, *cluster.Orchestrator, error) {
|
||||
cfg, err := config.Load(cfgPath)
|
||||
@ -322,6 +398,7 @@ Usage:
|
||||
Commands:
|
||||
startup Perform staged cluster startup
|
||||
shutdown Perform graceful cluster shutdown
|
||||
etcd-restore Restore etcd from snapshot on a control plane
|
||||
daemon Monitor UPS and auto-trigger shutdown
|
||||
status Print current hecate status and estimates
|
||||
intent Read or manually set intent state
|
||||
@ -329,6 +406,7 @@ Commands:
|
||||
Examples:
|
||||
hecate startup --config /etc/hecate/hecate.yaml --execute --force-flux-branch main
|
||||
hecate shutdown --config /etc/hecate/hecate.yaml --execute --reason "manual-maintenance"
|
||||
hecate etcd-restore --config /etc/hecate/hecate.yaml --execute
|
||||
hecate daemon --config /etc/hecate/hecate.yaml
|
||||
hecate status --config /etc/hecate/hecate.yaml
|
||||
hecate intent --config /etc/hecate/hecate.yaml --set normal --reason "manual-clear" --execute
|
||||
@ -358,30 +436,7 @@ func tryPeerBootstrapHandoff(ctx context.Context, cfg config.Config, logger *log
|
||||
target = user + "@" + host
|
||||
}
|
||||
|
||||
args := []string{
|
||||
"-o", "BatchMode=yes",
|
||||
"-o", "ConnectTimeout=8",
|
||||
"-o", "StrictHostKeyChecking=accept-new",
|
||||
}
|
||||
if cfgPath := resolveSSHConfigFile(cfg); cfgPath != "" {
|
||||
args = append(args, "-F", cfgPath)
|
||||
}
|
||||
if idPath := resolveSSHIdentityFile(cfg); idPath != "" {
|
||||
args = append(args, "-i", idPath)
|
||||
}
|
||||
if cfg.SSHPort > 0 {
|
||||
args = append(args, "-p", strconv.Itoa(cfg.SSHPort))
|
||||
}
|
||||
if cfg.SSHJumpHost != "" {
|
||||
jump := cfg.SSHJumpHost
|
||||
if cfg.SSHJumpUser != "" {
|
||||
jump = cfg.SSHJumpUser + "@" + jump
|
||||
}
|
||||
if cfg.SSHPort > 0 && !strings.Contains(jump, ":") {
|
||||
jump = fmt.Sprintf("%s:%d", jump, cfg.SSHPort)
|
||||
}
|
||||
args = append(args, "-J", jump)
|
||||
}
|
||||
args := buildSSHBaseArgs(cfg)
|
||||
|
||||
remote := "sudo -n systemctl start hecate-bootstrap.service"
|
||||
attempt := 1
|
||||
@ -409,6 +464,116 @@ func tryPeerBootstrapHandoff(ctx context.Context, cfg config.Config, logger *log
|
||||
}
|
||||
}
|
||||
|
||||
// coordinatorAllowsPeerFallbackStartup asks the configured coordinator node
// (coordination.forward_shutdown_host) whether this peer may proceed with a
// local fallback startup. It SSHes to the coordinator and inspects two
// signals: whether hecate-bootstrap.service is active, and the coordinator's
// persisted intent file (/var/lib/hecate/intent.json).
//
// Returns (allowed, human-readable reason, error). An unreachable coordinator
// is treated as permission granted (logged as a warning) so that a dead
// coordinator cannot permanently block peer recovery; a decode failure of the
// intent payload is the only case that returns a non-nil error.
func coordinatorAllowsPeerFallbackStartup(ctx context.Context, cfg config.Config, logger *log.Logger) (bool, string, error) {
	coordinator := strings.TrimSpace(cfg.Coordination.ForwardShutdownHost)
	if coordinator == "" {
		// Nothing to consult: standalone deployments are always allowed.
		return true, "no coordinator configured", nil
	}
	// Resolve the SSH user: explicit forward_shutdown_user wins, then a
	// per-node override from ssh_node_users, then the global ssh_user.
	user := strings.TrimSpace(cfg.Coordination.ForwardShutdownUser)
	if user == "" {
		if override, ok := cfg.SSHNodeUsers[coordinator]; ok && strings.TrimSpace(override) != "" {
			user = strings.TrimSpace(override)
		} else {
			user = strings.TrimSpace(cfg.SSHUser)
		}
	}
	// Resolve the SSH host: ssh_node_hosts may map the logical node name to
	// an address; otherwise the node name is used verbatim.
	host := coordinator
	if mapped, ok := cfg.SSHNodeHosts[coordinator]; ok && strings.TrimSpace(mapped) != "" {
		host = strings.TrimSpace(mapped)
	}
	target := host
	if user != "" {
		target = user + "@" + host
	}
	// Single remote round trip: print a bootstrap-activity sentinel, then the
	// intent JSON (or "{}" when the file is missing/empty). `sudo -n` ensures
	// we fail fast instead of hanging on a password prompt.
	remoteCmd := "sudo -n sh -lc 'if systemctl is-active --quiet hecate-bootstrap.service; then echo __HECATE_BOOTSTRAP_ACTIVE__; else echo __HECATE_BOOTSTRAP_IDLE__; fi; if [ -s /var/lib/hecate/intent.json ]; then cat /var/lib/hecate/intent.json; else echo \"{}\"; fi'"
	args := append(buildSSHBaseArgs(cfg), target, remoteCmd)
	cmd := exec.CommandContext(ctx, "ssh", args...)
	out, err := cmd.CombinedOutput()
	if err != nil {
		// Fail open: an unreachable/unauthorized coordinator must not wedge
		// peer recovery. Log what we saw and allow the fallback.
		trimmed := strings.TrimSpace(string(out))
		if trimmed == "" {
			logger.Printf("warning: coordinator guard check unavailable on %s: %v; allowing peer fallback startup", coordinator, err)
		} else {
			logger.Printf("warning: coordinator guard check unavailable on %s: %v: %s; allowing peer fallback startup", coordinator, err, trimmed)
		}
		return true, "coordinator unreachable", nil
	}
	trimmed := strings.TrimSpace(string(out))
	// An actively running bootstrap on the coordinator always wins.
	if strings.Contains(trimmed, "__HECATE_BOOTSTRAP_ACTIVE__") {
		return false, "coordinator bootstrap service is active", nil
	}
	// Extract the JSON object from the combined output (sentinel lines may
	// surround it); take the outermost '{' .. '}' span.
	start := strings.Index(trimmed, "{")
	end := strings.LastIndex(trimmed, "}")
	if start < 0 || end < start {
		return false, "coordinator intent payload missing", nil
	}
	rawIntent := trimmed[start : end+1]
	var remoteIntent state.Intent
	if err := json.Unmarshal([]byte(rawIntent), &remoteIntent); err != nil {
		return false, "", fmt.Errorf("decode coordinator intent: %w", err)
	}
	// Empty or "normal" intent means the coordinator is idle: allow startup.
	if remoteIntent.State == "" || remoteIntent.State == state.IntentNormal {
		return true, "coordinator intent is normal", nil
	}
	// Guard window: an in-flight intent younger than this blocks the peer;
	// older intents are considered stale leftovers. Floor of 60s.
	guardAge := time.Duration(maxInt(cfg.Coordination.StartupGuardMaxAgeSec, 60)) * time.Second
	intentAge := time.Duration(0)
	if !remoteIntent.UpdatedAt.IsZero() {
		intentAge = time.Since(remoteIntent.UpdatedAt)
	}
	switch remoteIntent.State {
	case state.IntentShuttingDown:
		// Unknown age is treated as fresh (conservative): block.
		if remoteIntent.UpdatedAt.IsZero() || intentAge <= guardAge {
			return false, fmt.Sprintf("coordinator intent=%s age=%s reason=%q", remoteIntent.State, intentAge.Round(time.Second), remoteIntent.Reason), nil
		}
		logger.Printf("warning: coordinator shutdown intent appears stale (age=%s > guard=%s); allowing peer fallback startup", intentAge.Round(time.Second), guardAge)
		return true, "coordinator shutdown intent stale", nil
	case state.IntentStartupInProgress:
		// Same stale-vs-fresh policy as shutdown: block while fresh.
		if remoteIntent.UpdatedAt.IsZero() || intentAge <= guardAge {
			return false, fmt.Sprintf("coordinator intent=%s age=%s reason=%q", remoteIntent.State, intentAge.Round(time.Second), remoteIntent.Reason), nil
		}
		logger.Printf("warning: coordinator startup intent appears stale (age=%s > guard=%s); allowing peer fallback startup", intentAge.Round(time.Second), guardAge)
		return true, "coordinator startup intent stale", nil
	case state.IntentShutdownComplete:
		if remoteIntent.UpdatedAt.IsZero() {
			return false, "coordinator reported shutdown_complete with unknown age", nil
		}
		// Short fixed debounce so a peer does not race a coordinator that
		// just powered the cluster down.
		if intentAge <= 45*time.Second {
			return false, fmt.Sprintf("coordinator recently completed shutdown (%s ago)", intentAge.Round(time.Second)), nil
		}
		return true, "coordinator shutdown_complete is old enough", nil
	default:
		// Unknown states block: fail closed on anything we do not understand.
		return false, fmt.Sprintf("coordinator intent state %q is unknown", remoteIntent.State), nil
	}
}
|
||||
|
||||
func buildSSHBaseArgs(cfg config.Config) []string {
|
||||
args := []string{
|
||||
"-o", "BatchMode=yes",
|
||||
"-o", "ConnectTimeout=8",
|
||||
"-o", "StrictHostKeyChecking=accept-new",
|
||||
}
|
||||
if cfgPath := resolveSSHConfigFile(cfg); cfgPath != "" {
|
||||
args = append(args, "-F", cfgPath)
|
||||
}
|
||||
if idPath := resolveSSHIdentityFile(cfg); idPath != "" {
|
||||
args = append(args, "-i", idPath)
|
||||
}
|
||||
if cfg.SSHPort > 0 {
|
||||
args = append(args, "-p", strconv.Itoa(cfg.SSHPort))
|
||||
}
|
||||
if cfg.SSHJumpHost != "" {
|
||||
jump := cfg.SSHJumpHost
|
||||
if cfg.SSHJumpUser != "" {
|
||||
jump = cfg.SSHJumpUser + "@" + jump
|
||||
}
|
||||
if cfg.SSHPort > 0 && !strings.Contains(jump, ":") {
|
||||
jump = fmt.Sprintf("%s:%d", jump, cfg.SSHPort)
|
||||
}
|
||||
args = append(args, "-J", jump)
|
||||
}
|
||||
return args
|
||||
}
|
||||
|
||||
func resolveSSHConfigFile(cfg config.Config) string {
|
||||
if strings.TrimSpace(cfg.SSHConfigFile) != "" {
|
||||
return strings.TrimSpace(cfg.SSHConfigFile)
|
||||
|
||||
@ -42,6 +42,8 @@ excluded_namespaces:
|
||||
startup:
|
||||
api_wait_seconds: 1200
|
||||
api_poll_seconds: 2
|
||||
auto_etcd_restore_on_api_failure: true
|
||||
etcd_restore_control_plane: titan-0a
|
||||
shutdown:
|
||||
default_budget_seconds: 1380
|
||||
skip_etcd_snapshot: false
|
||||
@ -71,6 +73,7 @@ coordination:
|
||||
forward_shutdown_config: /etc/hecate/hecate.yaml
|
||||
fallback_local_shutdown: true
|
||||
command_timeout_seconds: 25
|
||||
startup_guard_max_age_seconds: 900
|
||||
role: coordinator
|
||||
allow_startup_on_battery: false
|
||||
metrics:
|
||||
|
||||
@ -35,11 +35,23 @@ ssh_managed_nodes:
|
||||
- titan-0a
|
||||
- titan-0b
|
||||
- titan-0c
|
||||
- titan-04
|
||||
- titan-05
|
||||
- titan-06
|
||||
- titan-07
|
||||
- titan-08
|
||||
- titan-09
|
||||
- titan-10
|
||||
- titan-11
|
||||
- titan-12
|
||||
- titan-13
|
||||
- titan-14
|
||||
- titan-15
|
||||
- titan-17
|
||||
- titan-18
|
||||
- titan-19
|
||||
- titan-20
|
||||
- titan-21
|
||||
- titan-22
|
||||
- titan-24
|
||||
ssh_jump_host: ""
|
||||
@ -53,6 +65,15 @@ control_planes:
|
||||
workers: []
|
||||
local_bootstrap_paths:
|
||||
- infrastructure/core
|
||||
- clusters/atlas/flux-system
|
||||
- infrastructure/sources/helm
|
||||
- infrastructure/metallb
|
||||
- infrastructure/traefik
|
||||
- infrastructure/vault-csi
|
||||
- infrastructure/vault-injector
|
||||
- services/vault
|
||||
- infrastructure/postgres
|
||||
- services/gitea
|
||||
excluded_namespaces:
|
||||
- kube-system
|
||||
- kube-public
|
||||
@ -68,6 +89,8 @@ excluded_namespaces:
|
||||
startup:
|
||||
api_wait_seconds: 1200
|
||||
api_poll_seconds: 2
|
||||
auto_etcd_restore_on_api_failure: true
|
||||
etcd_restore_control_plane: titan-0a
|
||||
shutdown:
|
||||
default_budget_seconds: 1380
|
||||
skip_etcd_snapshot: false
|
||||
@ -95,6 +118,7 @@ coordination:
|
||||
forward_shutdown_config: /etc/hecate/hecate.yaml
|
||||
fallback_local_shutdown: false
|
||||
command_timeout_seconds: 25
|
||||
startup_guard_max_age_seconds: 900
|
||||
role: peer
|
||||
allow_startup_on_battery: false
|
||||
metrics:
|
||||
|
||||
@ -35,12 +35,25 @@ ssh_managed_nodes:
|
||||
- titan-0a
|
||||
- titan-0b
|
||||
- titan-0c
|
||||
- titan-04
|
||||
- titan-05
|
||||
- titan-06
|
||||
- titan-07
|
||||
- titan-08
|
||||
- titan-09
|
||||
- titan-10
|
||||
- titan-11
|
||||
- titan-12
|
||||
- titan-13
|
||||
- titan-14
|
||||
- titan-15
|
||||
- titan-17
|
||||
- titan-18
|
||||
- titan-19
|
||||
- titan-20
|
||||
- titan-21
|
||||
- titan-22
|
||||
- titan-24
|
||||
ssh_jump_host: ""
|
||||
ssh_jump_user: ""
|
||||
iac_repo_path: /opt/titan-iac
|
||||
@ -76,6 +89,8 @@ excluded_namespaces:
|
||||
startup:
|
||||
api_wait_seconds: 1200
|
||||
api_poll_seconds: 2
|
||||
auto_etcd_restore_on_api_failure: true
|
||||
etcd_restore_control_plane: titan-0a
|
||||
shutdown:
|
||||
default_budget_seconds: 1380
|
||||
skip_etcd_snapshot: false
|
||||
@ -104,6 +119,7 @@ coordination:
|
||||
forward_shutdown_config: /etc/hecate/hecate.yaml
|
||||
fallback_local_shutdown: true
|
||||
command_timeout_seconds: 25
|
||||
startup_guard_max_age_seconds: 900
|
||||
role: coordinator
|
||||
allow_startup_on_battery: false
|
||||
metrics:
|
||||
|
||||
@ -39,6 +39,11 @@ type ShutdownOptions struct {
|
||||
Reason string
|
||||
}
|
||||
|
||||
// EtcdRestoreOptions selects where and from which snapshot an etcd restore runs.
type EtcdRestoreOptions struct {
	// ControlPlane is the node the restore command runs on; empty selects
	// the first configured control plane (resolved in Orchestrator.EtcdRestore).
	ControlPlane string
	// SnapshotPath is an explicit snapshot file path; empty resolves to the
	// newest snapshot found on the selected control plane.
	SnapshotPath string
}
|
||||
|
||||
type startupWorkload struct {
|
||||
Namespace string
|
||||
Kind string
|
||||
@ -121,8 +126,21 @@ func (o *Orchestrator) Startup(ctx context.Context, opts StartupOptions) (err er
|
||||
apiAttempts = 1
|
||||
}
|
||||
if err := o.waitForAPI(ctx, apiAttempts, apiPoll); err != nil {
|
||||
if !o.cfg.Startup.AutoEtcdRestoreOnAPIFailure {
|
||||
return err
|
||||
}
|
||||
cp := strings.TrimSpace(o.cfg.Startup.EtcdRestoreControlPlane)
|
||||
if cp == "" && len(o.cfg.ControlPlanes) > 0 {
|
||||
cp = o.cfg.ControlPlanes[0]
|
||||
}
|
||||
o.log.Printf("warning: initial API wait failed (%v); attempting automatic etcd restore on %s", err, cp)
|
||||
if restoreErr := o.EtcdRestore(ctx, EtcdRestoreOptions{ControlPlane: cp}); restoreErr != nil {
|
||||
return fmt.Errorf("kubernetes API did not become reachable and automatic etcd restore failed: %w", restoreErr)
|
||||
}
|
||||
if err := o.waitForAPI(ctx, apiAttempts, apiPoll); err != nil {
|
||||
return fmt.Errorf("kubernetes API did not become reachable after automatic etcd restore: %w", err)
|
||||
}
|
||||
}
|
||||
|
||||
workers, err := o.effectiveWorkers(ctx)
|
||||
if err != nil {
|
||||
@ -200,6 +218,72 @@ func (o *Orchestrator) Startup(ctx context.Context, opts StartupOptions) (err er
|
||||
return nil
|
||||
}
|
||||
|
||||
func (o *Orchestrator) EtcdRestore(ctx context.Context, opts EtcdRestoreOptions) error {
|
||||
controlPlane := strings.TrimSpace(opts.ControlPlane)
|
||||
if controlPlane == "" {
|
||||
if len(o.cfg.ControlPlanes) == 0 {
|
||||
return fmt.Errorf("cannot restore etcd: no control planes configured")
|
||||
}
|
||||
controlPlane = o.cfg.ControlPlanes[0]
|
||||
}
|
||||
found := false
|
||||
for _, cp := range o.cfg.ControlPlanes {
|
||||
if cp == controlPlane {
|
||||
found = true
|
||||
break
|
||||
}
|
||||
}
|
||||
if !found {
|
||||
return fmt.Errorf("cannot restore etcd: control plane %s is not in configured control_planes", controlPlane)
|
||||
}
|
||||
if !o.sshManaged(controlPlane) {
|
||||
return fmt.Errorf("cannot restore etcd on %s: node not in ssh_managed_nodes", controlPlane)
|
||||
}
|
||||
|
||||
snapshotPath := strings.TrimSpace(opts.SnapshotPath)
|
||||
if snapshotPath == "" {
|
||||
resolved, err := o.latestEtcdSnapshotPath(ctx, controlPlane)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
snapshotPath = resolved
|
||||
}
|
||||
o.log.Printf("etcd restore target=%s snapshot=%s", controlPlane, snapshotPath)
|
||||
if o.runner.DryRun {
|
||||
return nil
|
||||
}
|
||||
|
||||
for _, cp := range o.cfg.ControlPlanes {
|
||||
cp := cp
|
||||
o.bestEffort("stop k3s before etcd restore on "+cp, func() error {
|
||||
_, err := o.ssh(ctx, cp, "sudo systemctl stop k3s || true")
|
||||
return err
|
||||
})
|
||||
}
|
||||
|
||||
restoreCmd := fmt.Sprintf("sudo k3s server --cluster-reset --cluster-reset-restore-path %q", snapshotPath)
|
||||
if _, err := o.ssh(ctx, controlPlane, restoreCmd); err != nil {
|
||||
return fmt.Errorf("etcd restore command failed on %s: %w", controlPlane, err)
|
||||
}
|
||||
o.log.Printf("etcd restore command completed on %s", controlPlane)
|
||||
|
||||
if _, err := o.ssh(ctx, controlPlane, "sudo systemctl start k3s || true"); err != nil {
|
||||
return fmt.Errorf("failed to start k3s on restore control plane %s: %w", controlPlane, err)
|
||||
}
|
||||
time.Sleep(10 * time.Second)
|
||||
for _, cp := range o.cfg.ControlPlanes {
|
||||
cp := cp
|
||||
if cp == controlPlane {
|
||||
continue
|
||||
}
|
||||
o.bestEffort("start k3s after etcd restore on "+cp, func() error {
|
||||
_, err := o.ssh(ctx, cp, "sudo systemctl start k3s || true")
|
||||
return err
|
||||
})
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
func (o *Orchestrator) Shutdown(ctx context.Context, opts ShutdownOptions) (err error) {
|
||||
unlock, err := state.AcquireLock(o.cfg.State.LockPath)
|
||||
if err != nil {
|
||||
@ -731,6 +815,22 @@ func (o *Orchestrator) takeEtcdSnapshot(ctx context.Context, node string) error
|
||||
return err
|
||||
}
|
||||
|
||||
// latestEtcdSnapshotPath resolves the newest k3s etcd snapshot file on the
// given node. The node must be in ssh_managed_nodes; resolution runs remotely
// via SSH.
func (o *Orchestrator) latestEtcdSnapshotPath(ctx context.Context, node string) (string, error) {
	if !o.sshManaged(node) {
		return "", fmt.Errorf("cannot resolve etcd snapshot on %s: node not in ssh_managed_nodes", node)
	}
	// `ls -1t` sorts newest-first by mtime; `head -n 1` keeps only the most
	// recent entry. stderr is discarded so an empty directory yields "".
	cmd := `sudo sh -lc 'ls -1t /var/lib/rancher/k3s/server/db/snapshots/* 2>/dev/null | head -n 1'`
	out, err := o.ssh(ctx, node, cmd)
	if err != nil {
		return "", fmt.Errorf("resolve latest etcd snapshot on %s: %w", node, err)
	}
	snapshot := strings.TrimSpace(out)
	if snapshot == "" {
		return "", fmt.Errorf("no etcd snapshots found on %s under /var/lib/rancher/k3s/server/db/snapshots", node)
	}
	return snapshot, nil
}
|
||||
|
||||
func (o *Orchestrator) waitForAPI(ctx context.Context, attempts int, sleep time.Duration) error {
|
||||
if o.runner.DryRun {
|
||||
return nil
|
||||
|
||||
@ -35,6 +35,8 @@ type Config struct {
|
||||
// Startup configures staged cluster startup behavior.
type Startup struct {
	// APIWaitSeconds is the total budget for waiting on the Kubernetes API.
	APIWaitSeconds int `yaml:"api_wait_seconds"`
	// APIPollSeconds is the interval between API reachability probes.
	APIPollSeconds int `yaml:"api_poll_seconds"`
	// AutoEtcdRestoreOnAPIFailure enables an automatic etcd restore attempt
	// when the API never becomes reachable within the wait budget.
	AutoEtcdRestoreOnAPIFailure bool `yaml:"auto_etcd_restore_on_api_failure"`
	// EtcdRestoreControlPlane is the control-plane node the automatic restore
	// runs on; must be one of control_planes when set (see Validate).
	EtcdRestoreControlPlane string `yaml:"etcd_restore_control_plane"`
}
|
||||
|
||||
type Shutdown struct {
|
||||
@ -72,6 +74,7 @@ type Coordination struct {
|
||||
ForwardShutdownConfig string `yaml:"forward_shutdown_config"`
|
||||
FallbackLocalShutdown bool `yaml:"fallback_local_shutdown"`
|
||||
CommandTimeoutSeconds int `yaml:"command_timeout_seconds"`
|
||||
StartupGuardMaxAgeSec int `yaml:"startup_guard_max_age_seconds"`
|
||||
Role string `yaml:"role"`
|
||||
AllowStartupOnBattery bool `yaml:"allow_startup_on_battery"`
|
||||
}
|
||||
@ -135,6 +138,18 @@ func (c Config) Validate() error {
|
||||
if c.Startup.APIPollSeconds <= 0 {
|
||||
return fmt.Errorf("config.startup.api_poll_seconds must be > 0")
|
||||
}
|
||||
if c.Startup.EtcdRestoreControlPlane != "" {
|
||||
found := false
|
||||
for _, cp := range c.ControlPlanes {
|
||||
if cp == c.Startup.EtcdRestoreControlPlane {
|
||||
found = true
|
||||
break
|
||||
}
|
||||
}
|
||||
if !found {
|
||||
return fmt.Errorf("config.startup.etcd_restore_control_plane must be one of config.control_planes when set")
|
||||
}
|
||||
}
|
||||
if c.SSHPort <= 0 || c.SSHPort > 65535 {
|
||||
return fmt.Errorf("config.ssh_port must be in range 1-65535")
|
||||
}
|
||||
@ -156,6 +171,9 @@ func (c Config) Validate() error {
|
||||
return fmt.Errorf("config.coordination.forward_shutdown_config must not be empty when forward_shutdown_host is set")
|
||||
}
|
||||
}
|
||||
if c.Coordination.StartupGuardMaxAgeSec <= 0 {
|
||||
return fmt.Errorf("config.coordination.startup_guard_max_age_seconds must be > 0")
|
||||
}
|
||||
if c.Coordination.Role != "coordinator" && c.Coordination.Role != "peer" {
|
||||
return fmt.Errorf("config.coordination.role must be coordinator or peer")
|
||||
}
|
||||
@ -202,6 +220,8 @@ func defaults() Config {
|
||||
Startup: Startup{
|
||||
APIWaitSeconds: 1200,
|
||||
APIPollSeconds: 2,
|
||||
AutoEtcdRestoreOnAPIFailure: true,
|
||||
EtcdRestoreControlPlane: "titan-0a",
|
||||
},
|
||||
Shutdown: Shutdown{
|
||||
DefaultBudgetSeconds: 1380,
|
||||
@ -224,6 +244,7 @@ func defaults() Config {
|
||||
ForwardShutdownConfig: "/etc/hecate/hecate.yaml",
|
||||
FallbackLocalShutdown: true,
|
||||
CommandTimeoutSeconds: 25,
|
||||
StartupGuardMaxAgeSec: 900,
|
||||
Role: "coordinator",
|
||||
AllowStartupOnBattery: false,
|
||||
},
|
||||
@ -256,6 +277,9 @@ func (c *Config) applyDefaults() {
|
||||
if c.Startup.APIPollSeconds <= 0 {
|
||||
c.Startup.APIPollSeconds = 2
|
||||
}
|
||||
if c.Startup.EtcdRestoreControlPlane == "" && len(c.ControlPlanes) > 0 {
|
||||
c.Startup.EtcdRestoreControlPlane = c.ControlPlanes[0]
|
||||
}
|
||||
if c.SSHPort <= 0 {
|
||||
c.SSHPort = 2277
|
||||
}
|
||||
@ -292,6 +316,9 @@ func (c *Config) applyDefaults() {
|
||||
if c.Coordination.CommandTimeoutSeconds <= 0 {
|
||||
c.Coordination.CommandTimeoutSeconds = 25
|
||||
}
|
||||
if c.Coordination.StartupGuardMaxAgeSec <= 0 {
|
||||
c.Coordination.StartupGuardMaxAgeSec = 900
|
||||
}
|
||||
if c.Coordination.Role == "" {
|
||||
c.Coordination.Role = "coordinator"
|
||||
}
|
||||
|
||||
@ -55,3 +55,41 @@ func TestValidateRejectsUnknownRole(t *testing.T) {
|
||||
t.Fatalf("expected validation error for unknown coordination role")
|
||||
}
|
||||
}
|
||||
|
||||
func TestValidateRejectsUnknownEtcdRestoreControlPlane(t *testing.T) {
|
||||
cfg := defaults()
|
||||
cfg.Startup.EtcdRestoreControlPlane = "titan-missing"
|
||||
if err := cfg.Validate(); err == nil {
|
||||
t.Fatalf("expected validation error for unknown etcd restore control plane")
|
||||
}
|
||||
}
|
||||
|
||||
// TestLoadSetsCoordinationGuardDefaults loads a minimal config that omits
// coordination.startup_guard_max_age_seconds and
// startup.etcd_restore_control_plane, and asserts that Load backfills both
// defaults (a positive guard age; a non-empty restore control plane).
func TestLoadSetsCoordinationGuardDefaults(t *testing.T) {
	tmp := t.TempDir()
	cfgPath := filepath.Join(tmp, "hecate.yaml")
	// Minimal-but-valid config: coordination and startup sections are left
	// mostly empty so applyDefaults must fill them in.
	raw := `
control_planes: [titan-0a, titan-0b, titan-0c]
expected_flux_branch: main
iac_repo_path: /opt/titan-iac
coordination:
  role: coordinator
ups:
  enabled: false
state:
  run_history_path: /tmp/runs.json
  lock_path: /tmp/hecate.lock
`
	if err := os.WriteFile(cfgPath, []byte(strings.TrimSpace(raw)), 0o644); err != nil {
		t.Fatalf("write config: %v", err)
	}
	cfg, err := Load(cfgPath)
	if err != nil {
		t.Fatalf("load config: %v", err)
	}
	if cfg.Coordination.StartupGuardMaxAgeSec <= 0 {
		t.Fatalf("expected startup guard max age default > 0, got %d", cfg.Coordination.StartupGuardMaxAgeSec)
	}
	if cfg.Startup.EtcdRestoreControlPlane == "" {
		t.Fatalf("expected startup etcd restore control plane default to be set")
	}
}
|
||||
|
||||
@ -186,6 +186,23 @@ migrate_hecate_config() {
|
||||
echo "[install] migrated ssh_node_users titan-24 override to atlas"
|
||||
changed=1
|
||||
fi
|
||||
if grep -Eq '^ command_timeout_seconds:[[:space:]]*[0-9]+' "${CONF_DIR}/hecate.yaml" \
|
||||
&& ! grep -Eq '^ startup_guard_max_age_seconds:[[:space:]]*[0-9]+' "${CONF_DIR}/hecate.yaml"; then
|
||||
sed -Ei '/^ command_timeout_seconds:[[:space:]]*[0-9]+/a\ startup_guard_max_age_seconds: 900' "${CONF_DIR}/hecate.yaml"
|
||||
echo "[install] added coordination.startup_guard_max_age_seconds=900"
|
||||
changed=1
|
||||
fi
|
||||
local default_restore_cp
|
||||
default_restore_cp="$(first_control_plane_name)"
|
||||
if [[ -z "${default_restore_cp}" ]]; then
|
||||
default_restore_cp="titan-0a"
|
||||
fi
|
||||
if grep -Eq '^ api_poll_seconds:[[:space:]]*[0-9]+' "${CONF_DIR}/hecate.yaml" \
|
||||
&& ! grep -Eq '^ auto_etcd_restore_on_api_failure:[[:space:]]*(true|false)' "${CONF_DIR}/hecate.yaml"; then
|
||||
sed -Ei '/^ api_poll_seconds:[[:space:]]*[0-9]+/a\ auto_etcd_restore_on_api_failure: true\n etcd_restore_control_plane: '"${default_restore_cp}"'' "${CONF_DIR}/hecate.yaml"
|
||||
echo "[install] added startup.auto_etcd_restore_on_api_failure + startup.etcd_restore_control_plane defaults"
|
||||
changed=1
|
||||
fi
|
||||
|
||||
local role
|
||||
role="$(read_hecate_role)"
|
||||
@ -221,12 +238,25 @@ migrate_hecate_config() {
|
||||
- titan-0a
|
||||
- titan-0b
|
||||
- titan-0c
|
||||
- titan-04
|
||||
- titan-05
|
||||
- titan-06
|
||||
- titan-07
|
||||
- titan-08
|
||||
- titan-09
|
||||
- titan-10
|
||||
- titan-11
|
||||
- titan-12
|
||||
- titan-13
|
||||
- titan-14
|
||||
- titan-15
|
||||
- titan-17
|
||||
- titan-18
|
||||
- titan-22'
|
||||
- titan-19
|
||||
- titan-20
|
||||
- titan-21
|
||||
- titan-22
|
||||
- titan-24'
|
||||
elif [[ "${role}" == "peer" ]]; then
|
||||
inventory_block='ssh_node_hosts:
|
||||
titan-db: 192.168.22.10
|
||||
@ -257,11 +287,23 @@ migrate_hecate_config() {
|
||||
- titan-0a
|
||||
- titan-0b
|
||||
- titan-0c
|
||||
- titan-04
|
||||
- titan-05
|
||||
- titan-06
|
||||
- titan-07
|
||||
- titan-08
|
||||
- titan-09
|
||||
- titan-10
|
||||
- titan-11
|
||||
- titan-12
|
||||
- titan-13
|
||||
- titan-14
|
||||
- titan-15
|
||||
- titan-17
|
||||
- titan-18
|
||||
- titan-19
|
||||
- titan-20
|
||||
- titan-21
|
||||
- titan-22
|
||||
- titan-24'
|
||||
fi
|
||||
@ -280,6 +322,11 @@ migrate_hecate_config() {
|
||||
echo "[install] hydrated ssh_managed_nodes inventory for role=${role}"
|
||||
changed=1
|
||||
fi
|
||||
if ! grep -Eq '^ - titan-04$' "${CONF_DIR}/hecate.yaml" || ! grep -Eq '^ - titan-21$' "${CONF_DIR}/hecate.yaml"; then
|
||||
perl -0pi -e 's#ssh_managed_nodes:\n(?: - .*\n)*#'"${managed_block}"'\n#s' "${CONF_DIR}/hecate.yaml"
|
||||
echo "[install] refreshed ssh_managed_nodes coverage for role=${role}"
|
||||
changed=1
|
||||
fi
|
||||
fi
|
||||
|
||||
if [[ "${role}" == "peer" ]]; then
|
||||
@ -287,10 +334,22 @@ migrate_hecate_config() {
|
||||
&& grep -Eq '^ - titan-db$' "${CONF_DIR}/hecate.yaml" \
|
||||
&& grep -Eq '^ - titan-24$' "${CONF_DIR}/hecate.yaml" \
|
||||
&& ! grep -Eq '^ - titan-0a$' "${CONF_DIR}/hecate.yaml"; then
|
||||
perl -0pi -e 's#ssh_managed_nodes:\n - titan-db\n - titan-24\n#ssh_managed_nodes:\n - titan-db\n - titan-0a\n - titan-0b\n - titan-0c\n - titan-12\n - titan-14\n - titan-15\n - titan-17\n - titan-18\n - titan-22\n - titan-24\n#s' "${CONF_DIR}/hecate.yaml"
|
||||
perl -0pi -e 's#ssh_managed_nodes:\n - titan-db\n - titan-24\n#ssh_managed_nodes:\n - titan-db\n - titan-0a\n - titan-0b\n - titan-0c\n - titan-04\n - titan-05\n - titan-06\n - titan-07\n - titan-08\n - titan-09\n - titan-10\n - titan-11\n - titan-12\n - titan-13\n - titan-14\n - titan-15\n - titan-17\n - titan-18\n - titan-19\n - titan-20\n - titan-21\n - titan-22\n - titan-24\n#s' "${CONF_DIR}/hecate.yaml"
|
||||
echo "[install] expanded peer ssh_managed_nodes for bootstrap fallback coverage"
|
||||
changed=1
|
||||
fi
|
||||
|
||||
if ! grep -Eq '^ - services/gitea$' "${CONF_DIR}/hecate.yaml"; then
|
||||
perl -0pi -e 's#local_bootstrap_paths:\n(?: - .*\n)*#local_bootstrap_paths:\n - infrastructure/core\n - clusters/atlas/flux-system\n - infrastructure/sources/helm\n - infrastructure/metallb\n - infrastructure/traefik\n - infrastructure/vault-csi\n - infrastructure/vault-injector\n - services/vault\n - infrastructure/postgres\n - services/gitea\n#s' "${CONF_DIR}/hecate.yaml"
|
||||
echo "[install] refreshed peer local_bootstrap_paths for full fallback bootstrap parity"
|
||||
changed=1
|
||||
fi
|
||||
|
||||
if perl -0777 -ne 'exit(!(/local_bootstrap_paths:\n - infrastructure\/core\n/s))' "${CONF_DIR}/hecate.yaml"; then
|
||||
perl -0pi -e 's#local_bootstrap_paths:\n - infrastructure/core\n#local_bootstrap_paths:\n - infrastructure/core\n - clusters/atlas/flux-system\n - infrastructure/sources/helm\n - infrastructure/metallb\n - infrastructure/traefik\n - infrastructure/vault-csi\n - infrastructure/vault-injector\n - services/vault\n - infrastructure/postgres\n - services/gitea\n#s' "${CONF_DIR}/hecate.yaml"
|
||||
echo "[install] expanded peer local_bootstrap_paths for full fallback bootstrap parity"
|
||||
changed=1
|
||||
fi
|
||||
fi
|
||||
|
||||
if [[ "${changed}" -eq 1 ]]; then
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user