hecate: harden peer bootstrap failover and worker fallback

This commit is contained in:
Brad Stein 2026-04-04 18:34:50 -03:00
parent 4b0fffd5e2
commit 985da478c6
9 changed files with 266 additions and 14 deletions

View File

@ -23,6 +23,7 @@ Hecate runs outside the cluster under systemd, so it can always orchestrate brin
Key startup guards:
- Startup is blocked on hosts configured as `coordination.role: peer` (unless `--allow-peer-startup` is used intentionally).
- `--auto-peer-failover` makes peer hosts hand off startup to the coordinator first, then run local startup only if the coordinator is unreachable.
- By default, startup is blocked while the UPS is on battery (unless `--allow-on-battery` or `coordination.allow_startup_on_battery: true` is set).
- Startup is blocked when a shutdown intent is active (`/var/lib/hecate/intent.json`).
@ -45,7 +46,7 @@ The installer is idempotent:
Installer knobs (optional):
- `HECATE_ENABLE_BOOTSTRAP=1` enables `hecate-bootstrap.service` on this host.
- `HECATE_ENABLE_BOOTSTRAP=0` disables it; default `auto` preserves current bootstrap enablement state.
- `HECATE_ENABLE_BOOTSTRAP=0` disables it; default `auto` enables bootstrap by default.
- `HECATE_MANAGE_NUT=0` skips writing NUT/udev files.
- `HECATE_NUT_UPS_NAME` (default inferred from `/etc/hecate/hecate.yaml` target, fallback `pyrphoros`)
- `HECATE_NUT_VENDOR_ID` / `HECATE_NUT_PRODUCT_ID` (defaults `0764` / `0601`)
@ -77,6 +78,7 @@ Optional SSH jump/bastion:
Recommended:
- `titan-db` runs Hecate as the shutdown coordinator with UPS `Pyrphoros` (`pyrphoros@localhost`).
- `tethys` runs Hecate as a peer with UPS `Statera` (`statera@localhost`) and forwards shutdown triggers to `titan-db`.
- The bootstrap unit now runs on both roles; peer role uses auto-failover handoff to coordinator before local fallback startup.
- If forwarding fails, fallback local shutdown can remain enabled.
- Use `coordination.role: coordinator` on `titan-db` and `coordination.role: peer` on `tethys`.
@ -100,7 +102,7 @@ Power metrics:
- Default behavior for `startup` and `shutdown` is dry-run unless `--execute` is set.
- Hecate tracks intent in `/var/lib/hecate/intent.json` (`normal`, `startup_in_progress`, `shutting_down`, `shutdown_complete`) to avoid startup/shutdown fighting each other.
- `hecate-bootstrap.service` is enabled to run at host boot and perform staged startup automatically.
- `HECATE_ENABLE_BOOTSTRAP=1` enables `hecate-bootstrap.service` (recommended on `titan-db`; keep disabled on non-coordinator hosts).
- `HECATE_ENABLE_BOOTSTRAP=1` forces bootstrap on, `HECATE_ENABLE_BOOTSTRAP=0` forces it off, and `auto` enables by default.
- `hecate-update.timer` runs on boot and periodically to pull latest `main` and reinstall Hecate declaratively.
## Disruptive startup drills

View File

@ -7,7 +7,9 @@ import (
"fmt"
"log"
"os"
"os/exec"
"os/signal"
"strconv"
"strings"
"syscall"
"time"
@ -65,6 +67,8 @@ func runStartup(logger *log.Logger, args []string) error {
forceBranch := fs.String("force-flux-branch", "", "Patch Flux source branch before resume")
skipLocalBootstrap := fs.Bool("skip-local-bootstrap", false, "Skip local fallback bootstrap applies")
allowPeerStartup := fs.Bool("allow-peer-startup", false, "Allow startup to run on a peer instance")
autoPeerFailover := fs.Bool("auto-peer-failover", false, "On peer role, try coordinator bootstrap handoff first and only run local startup as fallback")
peerWaitSeconds := fs.Int("peer-wait-seconds", 180, "How long auto peer failover waits for coordinator handoff before local fallback startup")
allowOnBattery := fs.Bool("allow-on-battery", false, "Allow startup when UPS reports on-battery")
reason := fs.String("reason", "manual-startup", "Startup reason for run history")
_ = fs.Parse(args)
@ -73,9 +77,25 @@ func runStartup(logger *log.Logger, args []string) error {
if err != nil {
return err
}
allowPeer := *allowPeerStartup
if *execute {
if cfg.Coordination.Role == "peer" && !*allowPeerStartup {
return fmt.Errorf("startup blocked: this instance is configured as role=peer (use --allow-peer-startup to override)")
if cfg.Coordination.Role == "peer" && !allowPeer {
if *autoPeerFailover {
handoffCtx, cancel := context.WithTimeout(context.Background(), time.Duration(maxInt(*peerWaitSeconds, 1))*time.Second)
defer cancel()
handoff, handoffErr := tryPeerBootstrapHandoff(handoffCtx, cfg, logger)
if handoffErr != nil {
logger.Printf("warning: peer bootstrap handoff failed: %v", handoffErr)
}
if handoff {
logger.Printf("peer startup handoff complete; skipping local startup")
return nil
}
logger.Printf("peer startup handoff unavailable; proceeding with local peer startup fallback")
allowPeer = true
} else {
return fmt.Errorf("startup blocked: this instance is configured as role=peer (use --allow-peer-startup to override)")
}
}
if cfg.UPS.Enabled && !cfg.Coordination.AllowStartupOnBattery && !*allowOnBattery {
targets, targetErr := buildUPSTargets(cfg)
@ -270,3 +290,116 @@ Examples:
hecate status --config /etc/hecate/hecate.yaml
`)
}
// tryPeerBootstrapHandoff asks the coordinator host to start its own
// hecate-bootstrap.service over SSH so this peer does not have to perform a
// local startup. It returns (true, nil) once the remote systemctl start
// succeeds, and (false, err) when the coordinator cannot be resolved from the
// config or the context deadline expires before any attempt succeeds. Failed
// attempts are logged and retried every 5 seconds until ctx is done.
func tryPeerBootstrapHandoff(ctx context.Context, cfg config.Config, logger *log.Logger) (bool, error) {
	coordinator := strings.TrimSpace(cfg.Coordination.ForwardShutdownHost)
	if coordinator == "" {
		return false, fmt.Errorf("coordination.forward_shutdown_host is empty for peer role")
	}
	// Pick the SSH user: explicit forward user first, then the per-node
	// override, then the global default.
	user := strings.TrimSpace(cfg.Coordination.ForwardShutdownUser)
	if user == "" {
		if override, ok := cfg.SSHNodeUsers[coordinator]; ok && strings.TrimSpace(override) != "" {
			user = strings.TrimSpace(override)
		} else {
			user = strings.TrimSpace(cfg.SSHUser)
		}
	}
	// The logical coordinator name may map to a concrete address.
	host := coordinator
	if mapped, ok := cfg.SSHNodeHosts[coordinator]; ok && strings.TrimSpace(mapped) != "" {
		host = strings.TrimSpace(mapped)
	}
	target := host
	if user != "" {
		target = user + "@" + host
	}
	// BatchMode prevents interactive prompts; accept-new trusts first-seen
	// host keys so an unattended handoff can still reach a fresh coordinator.
	args := []string{
		"-o", "BatchMode=yes",
		"-o", "ConnectTimeout=8",
		"-o", "StrictHostKeyChecking=accept-new",
	}
	if cfgPath := resolveSSHConfigFile(cfg); cfgPath != "" {
		args = append(args, "-F", cfgPath)
	}
	if idPath := resolveSSHIdentityFile(cfg); idPath != "" {
		args = append(args, "-i", idPath)
	}
	if cfg.SSHPort > 0 {
		args = append(args, "-p", strconv.Itoa(cfg.SSHPort))
	}
	if cfg.SSHJumpHost != "" {
		jump := cfg.SSHJumpHost
		if cfg.SSHJumpUser != "" {
			jump = cfg.SSHJumpUser + "@" + jump
		}
		// Reuse the configured port for the jump host unless a port is
		// already encoded in the jump spec.
		if cfg.SSHPort > 0 && !strings.Contains(jump, ":") {
			jump = fmt.Sprintf("%s:%d", jump, cfg.SSHPort)
		}
		args = append(args, "-J", jump)
	}
	// sudo -n fails fast instead of prompting when passwordless sudo is not
	// configured on the coordinator.
	remote := "sudo -n systemctl start hecate-bootstrap.service"
	// The full ssh argument vector is loop-invariant; build it once instead
	// of re-copying it on every retry iteration.
	cmdArgs := append(append([]string{}, args...), target, remote)
	attempt := 1
	for {
		cmd := exec.CommandContext(ctx, "ssh", cmdArgs...)
		out, err := cmd.CombinedOutput()
		if err == nil {
			logger.Printf("peer bootstrap handoff succeeded on %s (attempt=%d)", coordinator, attempt)
			return true, nil
		}
		trimmed := strings.TrimSpace(string(out))
		if trimmed == "" {
			logger.Printf("peer bootstrap handoff attempt %d failed for %s: %v", attempt, coordinator, err)
		} else {
			logger.Printf("peer bootstrap handoff attempt %d failed for %s: %v: %s", attempt, coordinator, err, trimmed)
		}
		select {
		case <-ctx.Done():
			return false, fmt.Errorf("coordinator handoff timeout for %s: %w", coordinator, ctx.Err())
		case <-time.After(5 * time.Second):
			attempt++
		}
	}
}
// resolveSSHConfigFile returns the ssh -F config path to use: the explicit
// value from cfg when set, otherwise the first known per-user config file
// that exists on disk as a regular entry, otherwise "".
func resolveSSHConfigFile(cfg config.Config) string {
	if explicit := strings.TrimSpace(cfg.SSHConfigFile); explicit != "" {
		return explicit
	}
	for _, candidate := range []string{
		"/home/atlas/.ssh/config",
		"/home/tethys/.ssh/config",
	} {
		if info, err := os.Stat(candidate); err == nil && !info.IsDir() {
			return candidate
		}
	}
	return ""
}
// resolveSSHIdentityFile returns the ssh -i identity path to use: the
// explicit value from cfg when set, otherwise the first known per-user key
// file that exists on disk as a regular entry, otherwise "".
func resolveSSHIdentityFile(cfg config.Config) string {
	if explicit := strings.TrimSpace(cfg.SSHIdentityFile); explicit != "" {
		return explicit
	}
	for _, candidate := range []string{
		"/home/atlas/.ssh/id_ed25519",
		"/home/tethys/.ssh/id_ed25519",
	} {
		if info, err := os.Stat(candidate); err == nil && !info.IsDir() {
			return candidate
		}
	}
	return ""
}
// maxInt returns the larger of a and b.
func maxInt(a, b int) int {
	if b > a {
		return b
	}
	return a
}

View File

@ -43,7 +43,7 @@ startup:
api_wait_seconds: 1200
api_poll_seconds: 2
shutdown:
default_budget_seconds: 300
default_budget_seconds: 1380
skip_etcd_snapshot: false
skip_drain: false
drain_parallelism: 6
@ -62,7 +62,7 @@ ups:
- name: Pyrphoros
target: pyrphoros@localhost
poll_seconds: 5
runtime_safety_factor: 1.10
runtime_safety_factor: 1.25
debounce_count: 3
telemetry_timeout_seconds: 90
coordination:

View File

@ -32,6 +32,15 @@ ssh_node_users:
titan-24: tethys
ssh_managed_nodes:
- titan-db
- titan-0a
- titan-0b
- titan-0c
- titan-12
- titan-14
- titan-15
- titan-17
- titan-18
- titan-22
- titan-24
ssh_jump_host: ""
ssh_jump_user: ""
@ -60,7 +69,7 @@ startup:
api_wait_seconds: 1200
api_poll_seconds: 2
shutdown:
default_budget_seconds: 300
default_budget_seconds: 1380
skip_etcd_snapshot: false
skip_drain: false
drain_parallelism: 6
@ -77,7 +86,7 @@ ups:
- name: Statera
target: statera@localhost
poll_seconds: 5
runtime_safety_factor: 1.10
runtime_safety_factor: 1.25
debounce_count: 3
telemetry_timeout_seconds: 90
coordination:

View File

@ -77,7 +77,7 @@ startup:
api_wait_seconds: 1200
api_poll_seconds: 2
shutdown:
default_budget_seconds: 300
default_budget_seconds: 1380
skip_etcd_snapshot: false
skip_drain: false
drain_parallelism: 6
@ -95,7 +95,7 @@ ups:
- name: Pyrphoros
target: pyrphoros@localhost
poll_seconds: 5
runtime_safety_factor: 1.10
runtime_safety_factor: 1.25
debounce_count: 3
telemetry_timeout_seconds: 90
coordination:

View File

@ -3,12 +3,15 @@ Description=Hecate Staged Cluster Bootstrap
Wants=network-online.target
After=network-online.target
ConditionPathExists=/etc/hecate/hecate.yaml
StartLimitIntervalSec=0
[Service]
Type=oneshot
User=root
Group=root
ExecStart=/usr/local/bin/hecate startup --config /etc/hecate/hecate.yaml --execute --force-flux-branch main
ExecStart=/usr/local/bin/hecate startup --config /etc/hecate/hecate.yaml --execute --force-flux-branch main --auto-peer-failover --peer-wait-seconds 180
Restart=on-failure
RestartSec=30
TimeoutStartSec=1800
[Install]

View File

@ -279,7 +279,16 @@ func (o *Orchestrator) effectiveWorkers(ctx context.Context) ([]string, error) {
if len(o.cfg.Workers) > 0 {
return append([]string{}, o.cfg.Workers...), nil
}
return o.discoverWorkers(ctx)
workers, err := o.discoverWorkers(ctx)
if err == nil {
return workers, nil
}
fallback := o.fallbackWorkersFromInventory()
if len(fallback) == 0 {
return nil, err
}
o.log.Printf("warning: worker discovery failed via kubernetes API (%v); falling back to inventory workers=%s", err, strings.Join(fallback, ","))
return fallback, nil
}
func (o *Orchestrator) discoverWorkers(ctx context.Context) ([]string, error) {
@ -307,6 +316,41 @@ func (o *Orchestrator) discoverWorkers(ctx context.Context) ([]string, error) {
return workers, nil
}
// fallbackWorkersFromInventory derives a worker list from static config when
// API-based discovery is unavailable. Control-plane nodes are excluded; the
// explicit ssh_managed_nodes inventory takes precedence, and ssh_node_hosts
// keys are consulted only when that inventory yields nothing. The result is
// de-duplicated and sorted alphabetically.
func (o *Orchestrator) fallbackWorkersFromInventory() []string {
	controlPlane := make(map[string]struct{}, len(o.cfg.ControlPlanes))
	for _, node := range o.cfg.ControlPlanes {
		controlPlane[strings.TrimSpace(node)] = struct{}{}
	}
	seen := make(map[string]struct{})
	collect := func(nodes []string) {
		for _, node := range nodes {
			name := strings.TrimSpace(node)
			if name == "" {
				continue
			}
			if _, isCP := controlPlane[name]; isCP {
				continue
			}
			seen[name] = struct{}{}
		}
	}
	collect(o.cfg.SSHManagedNodes)
	if len(seen) == 0 {
		// No managed-node inventory matched; fall back to the host map keys.
		hosts := make([]string, 0, len(o.cfg.SSHNodeHosts))
		for node := range o.cfg.SSHNodeHosts {
			hosts = append(hosts, node)
		}
		collect(hosts)
	}
	workers := make([]string, 0, len(seen))
	for name := range seen {
		workers = append(workers, name)
	}
	sort.Strings(workers)
	return workers
}
func (o *Orchestrator) patchFluxSuspendAll(ctx context.Context, suspend bool) error {
patch := fmt.Sprintf(`{"spec":{"suspend":%t}}`, suspend)

View File

@ -1,6 +1,13 @@
package cluster
import "testing"
import (
"log"
"os"
"reflect"
"testing"
"scm.bstein.dev/bstein/hecate/internal/config"
)
func TestParseVaultSealed(t *testing.T) {
sealed, err := parseVaultSealed(`{"initialized":true,"sealed":true}`)
@ -36,3 +43,42 @@ func TestParseVaultSealedWithKubectlPreamble(t *testing.T) {
t.Fatalf("expected sealed=true from payload with preamble")
}
}
// TestFallbackWorkersFromInventoryUsesManagedNodes verifies that explicit
// ssh_managed_nodes entries become the fallback worker set, with any node
// that is also a control plane filtered out, sorted alphabetically.
func TestFallbackWorkersFromInventoryUsesManagedNodes(t *testing.T) {
	o := &Orchestrator{
		cfg: config.Config{
			ControlPlanes: []string{"titan-0a", "titan-0b", "titan-0c"},
			SSHManagedNodes: []string{
				"titan-db",
				"titan-0a", // control-plane node: must be excluded
				"titan-15",
				"titan-17",
			},
		},
		log: log.New(os.Stdout, "", 0),
	}
	want := []string{"titan-15", "titan-17", "titan-db"}
	if got := o.fallbackWorkersFromInventory(); !reflect.DeepEqual(got, want) {
		t.Fatalf("fallback workers mismatch: got=%v want=%v", got, want)
	}
}
// TestFallbackWorkersFromInventoryFallsBackToHosts verifies that when no
// ssh_managed_nodes inventory is configured, the fallback worker set is
// built from the ssh_node_hosts map keys, still excluding control planes
// and still sorted alphabetically.
func TestFallbackWorkersFromInventoryFallsBackToHosts(t *testing.T) {
	o := &Orchestrator{
		cfg: config.Config{
			ControlPlanes: []string{"titan-0a", "titan-0b", "titan-0c"},
			SSHNodeHosts: map[string]string{
				"titan-0a": "192.168.22.11", // control-plane node: must be excluded
				"titan-22": "192.168.22.22",
				"titan-24": "192.168.22.26",
			},
		},
		log: log.New(os.Stdout, "", 0),
	}
	want := []string{"titan-22", "titan-24"}
	if got := o.fallbackWorkersFromInventory(); !reflect.DeepEqual(got, want) {
		t.Fatalf("fallback workers mismatch: got=%v want=%v", got, want)
	}
}

View File

@ -58,6 +58,19 @@ resolve_nut_ups_name() {
echo "[install] defaulting NUT UPS name to ${NUT_UPS_NAME}"
}
# read_hecate_role prints the coordination role configured in
# ${CONF_DIR}/hecate.yaml, defaulting to "coordinator" when the config file
# is missing or contains no "role:" key.
read_hecate_role() {
if [[ ! -f "${CONF_DIR}/hecate.yaml" ]]; then
echo "coordinator"
return 0
fi
local role
# Flat text scan, not a YAML parse: awk prints the value token of the first
# "role:" line and stops. NOTE(review): this assumes the first "role:" key
# in the file is coordination.role — confirm no earlier "role:" key can
# appear in the config schema.
role="$(awk '/^[[:space:]]*role:[[:space:]]*/ {print $2; exit}' "${CONF_DIR}/hecate.yaml" 2>/dev/null || true)"
if [[ -z "${role}" ]]; then
role="coordinator"
fi
echo "${role}"
}
ensure_apt_packages() {
local missing=()
for pkg in "$@"; do
@ -208,7 +221,9 @@ if [[ "${ENABLE_BOOTSTRAP}" == "1" ]]; then
elif [[ "${ENABLE_BOOTSTRAP}" == "0" ]]; then
systemctl disable hecate-bootstrap.service >/dev/null 2>&1 || true
else
echo "[install] leaving hecate-bootstrap.service state unchanged (HECATE_ENABLE_BOOTSTRAP=${ENABLE_BOOTSTRAP})"
role="$(read_hecate_role)"
systemctl enable hecate-bootstrap.service
echo "[install] auto-enabled hecate-bootstrap.service for role=${role}"
fi
if [[ "${START_NOW}" -eq 1 ]]; then