From 985da478c6e1df7251a2895c8a087a461bbf606d Mon Sep 17 00:00:00 2001 From: Brad Stein Date: Sat, 4 Apr 2026 18:34:50 -0300 Subject: [PATCH] hecate: harden peer bootstrap failover and worker fallback --- README.md | 6 +- cmd/hecate/main.go | 137 +++++++++++++++++++++++- configs/hecate.example.yaml | 4 +- configs/hecate.tethys.yaml | 13 ++- configs/hecate.titan-db.yaml | 4 +- deploy/systemd/hecate-bootstrap.service | 5 +- internal/cluster/orchestrator.go | 46 +++++++- internal/cluster/orchestrator_test.go | 48 ++++++++- scripts/install.sh | 17 ++- 9 files changed, 266 insertions(+), 14 deletions(-) diff --git a/README.md b/README.md index cf5ae61..627d81f 100644 --- a/README.md +++ b/README.md @@ -23,6 +23,7 @@ Hecate runs outside the cluster under systemd, so it can always orchestrate brin Key startup guards: - Startup is blocked on hosts configured as `coordination.role: peer` (unless `--allow-peer-startup` is used intentionally). +- `--auto-peer-failover` makes peer hosts hand off startup to the coordinator first, then run local startup only if the coordinator is unreachable. - Startup is blocked while UPS is on battery by default (unless `--allow-on-battery` or `coordination.allow_startup_on_battery: true` is set). - Startup is blocked when a shutdown intent is active (`/var/lib/hecate/intent.json`). @@ -45,7 +46,7 @@ The installer is idempotent: Installer knobs (optional): - `HECATE_ENABLE_BOOTSTRAP=1` enables `hecate-bootstrap.service` on this host. -- `HECATE_ENABLE_BOOTSTRAP=0` disables it; default `auto` preserves current bootstrap enablement state. +- `HECATE_ENABLE_BOOTSTRAP=0` disables it; default `auto` enables bootstrap by default. - `HECATE_MANAGE_NUT=0` skips writing NUT/udev files. 
- `HECATE_NUT_UPS_NAME` (default inferred from `/etc/hecate/hecate.yaml` target, fallback `pyrphoros`) - `HECATE_NUT_VENDOR_ID` / `HECATE_NUT_PRODUCT_ID` (defaults `0764` / `0601`) @@ -77,6 +78,7 @@ Optional SSH jump/bastion: Recommended: - `titan-db` runs Hecate as the shutdown coordinator with UPS `Pyrphoros` (`pyrphoros@localhost`). - `tethys` runs Hecate as a peer with UPS `Statera` (`statera@localhost`) and forwards shutdown triggers to `titan-db`. +- The bootstrap unit now runs on both roles; peer role uses auto-failover handoff to coordinator before local fallback startup. - If forwarding fails, fallback local shutdown can remain enabled. - Use `coordination.role: coordinator` on `titan-db` and `coordination.role: peer` on `tethys`. @@ -100,7 +102,7 @@ Power metrics: - Default behavior for `startup` and `shutdown` is dry-run unless `--execute` is set. - Hecate tracks intent in `/var/lib/hecate/intent.json` (`normal`, `startup_in_progress`, `shutting_down`, `shutdown_complete`) to avoid startup/shutdown fighting each other. - `hecate-bootstrap.service` is enabled to run at host boot and perform staged startup automatically. -- `HECATE_ENABLE_BOOTSTRAP=1` enables `hecate-bootstrap.service` (recommended on `titan-db`; keep disabled on non-coordinator hosts). +- `HECATE_ENABLE_BOOTSTRAP=1` forces bootstrap on, `HECATE_ENABLE_BOOTSTRAP=0` forces it off, and `auto` enables by default. - `hecate-update.timer` runs on boot and periodically to pull latest `main` and reinstall Hecate declaratively. 
## Disruptive startup drills diff --git a/cmd/hecate/main.go b/cmd/hecate/main.go index fac5f4f..b53d411 100644 --- a/cmd/hecate/main.go +++ b/cmd/hecate/main.go @@ -7,7 +7,9 @@ import ( "fmt" "log" "os" + "os/exec" "os/signal" + "strconv" "strings" "syscall" "time" @@ -65,6 +67,8 @@ func runStartup(logger *log.Logger, args []string) error { forceBranch := fs.String("force-flux-branch", "", "Patch Flux source branch before resume") skipLocalBootstrap := fs.Bool("skip-local-bootstrap", false, "Skip local fallback bootstrap applies") allowPeerStartup := fs.Bool("allow-peer-startup", false, "Allow startup to run on a peer instance") + autoPeerFailover := fs.Bool("auto-peer-failover", false, "On peer role, try coordinator bootstrap handoff first and only run local startup as fallback") + peerWaitSeconds := fs.Int("peer-wait-seconds", 180, "How long auto peer failover waits for coordinator handoff before local fallback startup") allowOnBattery := fs.Bool("allow-on-battery", false, "Allow startup when UPS reports on-battery") reason := fs.String("reason", "manual-startup", "Startup reason for run history") _ = fs.Parse(args) @@ -73,9 +77,25 @@ func runStartup(logger *log.Logger, args []string) error { if err != nil { return err } + allowPeer := *allowPeerStartup if *execute { - if cfg.Coordination.Role == "peer" && !*allowPeerStartup { - return fmt.Errorf("startup blocked: this instance is configured as role=peer (use --allow-peer-startup to override)") + if cfg.Coordination.Role == "peer" && !allowPeer { + if *autoPeerFailover { + handoffCtx, cancel := context.WithTimeout(context.Background(), time.Duration(maxInt(*peerWaitSeconds, 1))*time.Second) + defer cancel() + handoff, handoffErr := tryPeerBootstrapHandoff(handoffCtx, cfg, logger) + if handoffErr != nil { + logger.Printf("warning: peer bootstrap handoff failed: %v", handoffErr) + } + if handoff { + logger.Printf("peer startup handoff complete; skipping local startup") + return nil + } + logger.Printf("peer 
startup handoff unavailable; proceeding with local peer startup fallback") + allowPeer = true + } else { + return fmt.Errorf("startup blocked: this instance is configured as role=peer (use --allow-peer-startup to override)") + } } if cfg.UPS.Enabled && !cfg.Coordination.AllowStartupOnBattery && !*allowOnBattery { targets, targetErr := buildUPSTargets(cfg) @@ -270,3 +290,116 @@ Examples: hecate status --config /etc/hecate/hecate.yaml `) } + +func tryPeerBootstrapHandoff(ctx context.Context, cfg config.Config, logger *log.Logger) (bool, error) { + coordinator := strings.TrimSpace(cfg.Coordination.ForwardShutdownHost) + if coordinator == "" { + return false, fmt.Errorf("coordination.forward_shutdown_host is empty for peer role") + } + user := strings.TrimSpace(cfg.Coordination.ForwardShutdownUser) + if user == "" { + if override, ok := cfg.SSHNodeUsers[coordinator]; ok && strings.TrimSpace(override) != "" { + user = strings.TrimSpace(override) + } else { + user = strings.TrimSpace(cfg.SSHUser) + } + } + + host := coordinator + if mapped, ok := cfg.SSHNodeHosts[coordinator]; ok && strings.TrimSpace(mapped) != "" { + host = strings.TrimSpace(mapped) + } + target := host + if user != "" { + target = user + "@" + host + } + + args := []string{ + "-o", "BatchMode=yes", + "-o", "ConnectTimeout=8", + "-o", "StrictHostKeyChecking=accept-new", + } + if cfgPath := resolveSSHConfigFile(cfg); cfgPath != "" { + args = append(args, "-F", cfgPath) + } + if idPath := resolveSSHIdentityFile(cfg); idPath != "" { + args = append(args, "-i", idPath) + } + if cfg.SSHPort > 0 { + args = append(args, "-p", strconv.Itoa(cfg.SSHPort)) + } + if cfg.SSHJumpHost != "" { + jump := cfg.SSHJumpHost + if cfg.SSHJumpUser != "" { + jump = cfg.SSHJumpUser + "@" + jump + } + if cfg.SSHPort > 0 && !strings.Contains(jump, ":") { + jump = fmt.Sprintf("%s:%d", jump, cfg.SSHPort) + } + args = append(args, "-J", jump) + } + + remote := "sudo -n systemctl start hecate-bootstrap.service" + attempt := 1 + for { 
+ cmdArgs := append(append([]string{}, args...), target, remote) + cmd := exec.CommandContext(ctx, "ssh", cmdArgs...) + out, err := cmd.CombinedOutput() + if err == nil { + logger.Printf("peer bootstrap handoff succeeded on %s (attempt=%d)", coordinator, attempt) + return true, nil + } + trimmed := strings.TrimSpace(string(out)) + if trimmed == "" { + logger.Printf("peer bootstrap handoff attempt %d failed for %s: %v", attempt, coordinator, err) + } else { + logger.Printf("peer bootstrap handoff attempt %d failed for %s: %v: %s", attempt, coordinator, err, trimmed) + } + + select { + case <-ctx.Done(): + return false, fmt.Errorf("coordinator handoff timeout for %s: %w", coordinator, ctx.Err()) + case <-time.After(5 * time.Second): + attempt++ + } + } +} + +func resolveSSHConfigFile(cfg config.Config) string { + if strings.TrimSpace(cfg.SSHConfigFile) != "" { + return strings.TrimSpace(cfg.SSHConfigFile) + } + candidates := []string{ + "/home/atlas/.ssh/config", + "/home/tethys/.ssh/config", + } + for _, p := range candidates { + if stat, err := os.Stat(p); err == nil && !stat.IsDir() { + return p + } + } + return "" +} + +func resolveSSHIdentityFile(cfg config.Config) string { + if strings.TrimSpace(cfg.SSHIdentityFile) != "" { + return strings.TrimSpace(cfg.SSHIdentityFile) + } + candidates := []string{ + "/home/atlas/.ssh/id_ed25519", + "/home/tethys/.ssh/id_ed25519", + } + for _, p := range candidates { + if stat, err := os.Stat(p); err == nil && !stat.IsDir() { + return p + } + } + return "" +} + +func maxInt(a, b int) int { + if a > b { + return a + } + return b +} diff --git a/configs/hecate.example.yaml b/configs/hecate.example.yaml index e1eacea..947abf0 100644 --- a/configs/hecate.example.yaml +++ b/configs/hecate.example.yaml @@ -43,7 +43,7 @@ startup: api_wait_seconds: 1200 api_poll_seconds: 2 shutdown: - default_budget_seconds: 300 + default_budget_seconds: 1380 skip_etcd_snapshot: false skip_drain: false drain_parallelism: 6 @@ -62,7 +62,7 @@ ups: - 
name: Pyrphoros target: pyrphoros@localhost poll_seconds: 5 - runtime_safety_factor: 1.10 + runtime_safety_factor: 1.25 debounce_count: 3 telemetry_timeout_seconds: 90 coordination: diff --git a/configs/hecate.tethys.yaml b/configs/hecate.tethys.yaml index ea195d1..04991a4 100644 --- a/configs/hecate.tethys.yaml +++ b/configs/hecate.tethys.yaml @@ -32,6 +32,15 @@ ssh_node_users: titan-24: tethys ssh_managed_nodes: - titan-db + - titan-0a + - titan-0b + - titan-0c + - titan-12 + - titan-14 + - titan-15 + - titan-17 + - titan-18 + - titan-22 - titan-24 ssh_jump_host: "" ssh_jump_user: "" @@ -60,7 +69,7 @@ startup: api_wait_seconds: 1200 api_poll_seconds: 2 shutdown: - default_budget_seconds: 300 + default_budget_seconds: 1380 skip_etcd_snapshot: false skip_drain: false drain_parallelism: 6 @@ -77,7 +86,7 @@ ups: - name: Statera target: statera@localhost poll_seconds: 5 - runtime_safety_factor: 1.10 + runtime_safety_factor: 1.25 debounce_count: 3 telemetry_timeout_seconds: 90 coordination: diff --git a/configs/hecate.titan-db.yaml b/configs/hecate.titan-db.yaml index 8d7689e..697cac6 100644 --- a/configs/hecate.titan-db.yaml +++ b/configs/hecate.titan-db.yaml @@ -77,7 +77,7 @@ startup: api_wait_seconds: 1200 api_poll_seconds: 2 shutdown: - default_budget_seconds: 300 + default_budget_seconds: 1380 skip_etcd_snapshot: false skip_drain: false drain_parallelism: 6 @@ -95,7 +95,7 @@ ups: - name: Pyrphoros target: pyrphoros@localhost poll_seconds: 5 - runtime_safety_factor: 1.10 + runtime_safety_factor: 1.25 debounce_count: 3 telemetry_timeout_seconds: 90 coordination: diff --git a/deploy/systemd/hecate-bootstrap.service b/deploy/systemd/hecate-bootstrap.service index 526cf9b..4aeb962 100644 --- a/deploy/systemd/hecate-bootstrap.service +++ b/deploy/systemd/hecate-bootstrap.service @@ -3,12 +3,15 @@ Description=Hecate Staged Cluster Bootstrap Wants=network-online.target After=network-online.target ConditionPathExists=/etc/hecate/hecate.yaml +StartLimitIntervalSec=0 
[Service] Type=oneshot User=root Group=root -ExecStart=/usr/local/bin/hecate startup --config /etc/hecate/hecate.yaml --execute --force-flux-branch main +ExecStart=/usr/local/bin/hecate startup --config /etc/hecate/hecate.yaml --execute --force-flux-branch main --auto-peer-failover --peer-wait-seconds 180 +Restart=on-failure +RestartSec=30 TimeoutStartSec=1800 [Install] diff --git a/internal/cluster/orchestrator.go b/internal/cluster/orchestrator.go index a946f53..e452e55 100644 --- a/internal/cluster/orchestrator.go +++ b/internal/cluster/orchestrator.go @@ -279,7 +279,16 @@ func (o *Orchestrator) effectiveWorkers(ctx context.Context) ([]string, error) { if len(o.cfg.Workers) > 0 { return append([]string{}, o.cfg.Workers...), nil } - return o.discoverWorkers(ctx) + workers, err := o.discoverWorkers(ctx) + if err == nil { + return workers, nil + } + fallback := o.fallbackWorkersFromInventory() + if len(fallback) == 0 { + return nil, err + } + o.log.Printf("warning: worker discovery failed via kubernetes API (%v); falling back to inventory workers=%s", err, strings.Join(fallback, ",")) + return fallback, nil } func (o *Orchestrator) discoverWorkers(ctx context.Context) ([]string, error) { @@ -307,6 +316,41 @@ func (o *Orchestrator) discoverWorkers(ctx context.Context) ([]string, error) { return workers, nil } +func (o *Orchestrator) fallbackWorkersFromInventory() []string { + cp := make(map[string]struct{}, len(o.cfg.ControlPlanes)) + for _, node := range o.cfg.ControlPlanes { + cp[strings.TrimSpace(node)] = struct{}{} + } + + candidates := make(map[string]struct{}) + add := func(node string) { + name := strings.TrimSpace(node) + if name == "" { + return + } + if _, isCP := cp[name]; isCP { + return + } + candidates[name] = struct{}{} + } + + for _, node := range o.cfg.SSHManagedNodes { + add(node) + } + if len(candidates) == 0 { + for node := range o.cfg.SSHNodeHosts { + add(node) + } + } + + workers := make([]string, 0, len(candidates)) + for node := range 
candidates { + workers = append(workers, node) + } + sort.Strings(workers) + return workers +} + func (o *Orchestrator) patchFluxSuspendAll(ctx context.Context, suspend bool) error { patch := fmt.Sprintf(`{"spec":{"suspend":%t}}`, suspend) diff --git a/internal/cluster/orchestrator_test.go b/internal/cluster/orchestrator_test.go index 09f6808..6cdc6ef 100644 --- a/internal/cluster/orchestrator_test.go +++ b/internal/cluster/orchestrator_test.go @@ -1,6 +1,13 @@ package cluster -import "testing" +import ( + "log" + "os" + "reflect" + "testing" + + "scm.bstein.dev/bstein/hecate/internal/config" +) func TestParseVaultSealed(t *testing.T) { sealed, err := parseVaultSealed(`{"initialized":true,"sealed":true}`) @@ -36,3 +43,42 @@ func TestParseVaultSealedWithKubectlPreamble(t *testing.T) { t.Fatalf("expected sealed=true from payload with preamble") } } + +func TestFallbackWorkersFromInventoryUsesManagedNodes(t *testing.T) { + orch := &Orchestrator{ + cfg: config.Config{ + ControlPlanes: []string{"titan-0a", "titan-0b", "titan-0c"}, + SSHManagedNodes: []string{ + "titan-db", + "titan-0a", + "titan-15", + "titan-17", + }, + }, + log: log.New(os.Stdout, "", 0), + } + got := orch.fallbackWorkersFromInventory() + want := []string{"titan-15", "titan-17", "titan-db"} + if !reflect.DeepEqual(got, want) { + t.Fatalf("fallback workers mismatch: got=%v want=%v", got, want) + } +} + +func TestFallbackWorkersFromInventoryFallsBackToHosts(t *testing.T) { + orch := &Orchestrator{ + cfg: config.Config{ + ControlPlanes: []string{"titan-0a", "titan-0b", "titan-0c"}, + SSHNodeHosts: map[string]string{ + "titan-0a": "192.168.22.11", + "titan-22": "192.168.22.22", + "titan-24": "192.168.22.26", + }, + }, + log: log.New(os.Stdout, "", 0), + } + got := orch.fallbackWorkersFromInventory() + want := []string{"titan-22", "titan-24"} + if !reflect.DeepEqual(got, want) { + t.Fatalf("fallback workers mismatch: got=%v want=%v", got, want) + } +} diff --git a/scripts/install.sh b/scripts/install.sh 
index f973251..4d955f9 100755 --- a/scripts/install.sh +++ b/scripts/install.sh @@ -58,6 +58,19 @@ resolve_nut_ups_name() { echo "[install] defaulting NUT UPS name to ${NUT_UPS_NAME}" } +read_hecate_role() { + if [[ ! -f "${CONF_DIR}/hecate.yaml" ]]; then + echo "coordinator" + return 0 + fi + local role + role="$(awk '/^[[:space:]]*role:[[:space:]]*/ {print $2; exit}' "${CONF_DIR}/hecate.yaml" 2>/dev/null || true)" + if [[ -z "${role}" ]]; then + role="coordinator" + fi + echo "${role}" +} + ensure_apt_packages() { local missing=() for pkg in "$@"; do @@ -208,7 +221,9 @@ if [[ "${ENABLE_BOOTSTRAP}" == "1" ]]; then elif [[ "${ENABLE_BOOTSTRAP}" == "0" ]]; then systemctl disable hecate-bootstrap.service >/dev/null 2>&1 || true else - echo "[install] leaving hecate-bootstrap.service state unchanged (HECATE_ENABLE_BOOTSTRAP=${ENABLE_BOOTSTRAP})" + role="$(read_hecate_role)" + systemctl enable hecate-bootstrap.service >/dev/null 2>&1 || true + echo "[install] auto-enabled hecate-bootstrap.service for role=${role}" fi if [[ "${START_NOW}" -eq 1 ]]; then