hecate: harden peer bootstrap failover and worker fallback
This commit is contained in:
parent
4b0fffd5e2
commit
985da478c6
@ -23,6 +23,7 @@ Hecate runs outside the cluster under systemd, so it can always orchestrate brin
|
||||
|
||||
Key startup guards:
|
||||
- Startup is blocked on hosts configured as `coordination.role: peer` (unless `--allow-peer-startup` is used intentionally).
|
||||
- `--auto-peer-failover` makes peer hosts hand off startup to the coordinator first, then run local startup only if the coordinator is unreachable.
|
||||
- Startup is blocked while UPS is on battery by default (unless `--allow-on-battery` or `coordination.allow_startup_on_battery: true` is set).
|
||||
- Startup is blocked when a shutdown intent is active (`/var/lib/hecate/intent.json`).
|
||||
|
||||
@ -45,7 +46,7 @@ The installer is idempotent:
|
||||
|
||||
Installer knobs (optional):
|
||||
- `HECATE_ENABLE_BOOTSTRAP=1` enables `hecate-bootstrap.service` on this host.
|
||||
- `HECATE_ENABLE_BOOTSTRAP=0` disables it; default `auto` preserves current bootstrap enablement state.
|
||||
- `HECATE_ENABLE_BOOTSTRAP=0` disables it; default `auto` enables bootstrap by default.
|
||||
- `HECATE_MANAGE_NUT=0` skips writing NUT/udev files.
|
||||
- `HECATE_NUT_UPS_NAME` (default inferred from `/etc/hecate/hecate.yaml` target, fallback `pyrphoros`)
|
||||
- `HECATE_NUT_VENDOR_ID` / `HECATE_NUT_PRODUCT_ID` (defaults `0764` / `0601`)
|
||||
@ -77,6 +78,7 @@ Optional SSH jump/bastion:
|
||||
Recommended:
|
||||
- `titan-db` runs Hecate as the shutdown coordinator with UPS `Pyrphoros` (`pyrphoros@localhost`).
|
||||
- `tethys` runs Hecate as a peer with UPS `Statera` (`statera@localhost`) and forwards shutdown triggers to `titan-db`.
|
||||
- The bootstrap unit now runs on both roles; peer role uses auto-failover handoff to coordinator before local fallback startup.
|
||||
- If forwarding fails, fallback local shutdown can remain enabled.
|
||||
- Use `coordination.role: coordinator` on `titan-db` and `coordination.role: peer` on `tethys`.
|
||||
|
||||
@ -100,7 +102,7 @@ Power metrics:
|
||||
- Default behavior for `startup` and `shutdown` is dry-run unless `--execute` is set.
|
||||
- Hecate tracks intent in `/var/lib/hecate/intent.json` (`normal`, `startup_in_progress`, `shutting_down`, `shutdown_complete`) to avoid startup/shutdown fighting each other.
|
||||
- `hecate-bootstrap.service` is enabled to run at host boot and perform staged startup automatically.
|
||||
- `HECATE_ENABLE_BOOTSTRAP=1` enables `hecate-bootstrap.service` (recommended on `titan-db`; keep disabled on non-coordinator hosts).
|
||||
- `HECATE_ENABLE_BOOTSTRAP=1` forces bootstrap on, `HECATE_ENABLE_BOOTSTRAP=0` forces it off, and `auto` enables by default.
|
||||
- `hecate-update.timer` runs on boot and periodically to pull latest `main` and reinstall Hecate declaratively.
|
||||
|
||||
## Disruptive startup drills
|
||||
|
||||
@ -7,7 +7,9 @@ import (
|
||||
"fmt"
|
||||
"log"
|
||||
"os"
|
||||
"os/exec"
|
||||
"os/signal"
|
||||
"strconv"
|
||||
"strings"
|
||||
"syscall"
|
||||
"time"
|
||||
@ -65,6 +67,8 @@ func runStartup(logger *log.Logger, args []string) error {
|
||||
forceBranch := fs.String("force-flux-branch", "", "Patch Flux source branch before resume")
|
||||
skipLocalBootstrap := fs.Bool("skip-local-bootstrap", false, "Skip local fallback bootstrap applies")
|
||||
allowPeerStartup := fs.Bool("allow-peer-startup", false, "Allow startup to run on a peer instance")
|
||||
autoPeerFailover := fs.Bool("auto-peer-failover", false, "On peer role, try coordinator bootstrap handoff first and only run local startup as fallback")
|
||||
peerWaitSeconds := fs.Int("peer-wait-seconds", 180, "How long auto peer failover waits for coordinator handoff before local fallback startup")
|
||||
allowOnBattery := fs.Bool("allow-on-battery", false, "Allow startup when UPS reports on-battery")
|
||||
reason := fs.String("reason", "manual-startup", "Startup reason for run history")
|
||||
_ = fs.Parse(args)
|
||||
@ -73,9 +77,25 @@ func runStartup(logger *log.Logger, args []string) error {
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
allowPeer := *allowPeerStartup
|
||||
if *execute {
|
||||
if cfg.Coordination.Role == "peer" && !*allowPeerStartup {
|
||||
return fmt.Errorf("startup blocked: this instance is configured as role=peer (use --allow-peer-startup to override)")
|
||||
if cfg.Coordination.Role == "peer" && !allowPeer {
|
||||
if *autoPeerFailover {
|
||||
handoffCtx, cancel := context.WithTimeout(context.Background(), time.Duration(maxInt(*peerWaitSeconds, 1))*time.Second)
|
||||
defer cancel()
|
||||
handoff, handoffErr := tryPeerBootstrapHandoff(handoffCtx, cfg, logger)
|
||||
if handoffErr != nil {
|
||||
logger.Printf("warning: peer bootstrap handoff failed: %v", handoffErr)
|
||||
}
|
||||
if handoff {
|
||||
logger.Printf("peer startup handoff complete; skipping local startup")
|
||||
return nil
|
||||
}
|
||||
logger.Printf("peer startup handoff unavailable; proceeding with local peer startup fallback")
|
||||
allowPeer = true
|
||||
} else {
|
||||
return fmt.Errorf("startup blocked: this instance is configured as role=peer (use --allow-peer-startup to override)")
|
||||
}
|
||||
}
|
||||
if cfg.UPS.Enabled && !cfg.Coordination.AllowStartupOnBattery && !*allowOnBattery {
|
||||
targets, targetErr := buildUPSTargets(cfg)
|
||||
@ -270,3 +290,116 @@ Examples:
|
||||
hecate status --config /etc/hecate/hecate.yaml
|
||||
`)
|
||||
}
|
||||
|
||||
func tryPeerBootstrapHandoff(ctx context.Context, cfg config.Config, logger *log.Logger) (bool, error) {
|
||||
coordinator := strings.TrimSpace(cfg.Coordination.ForwardShutdownHost)
|
||||
if coordinator == "" {
|
||||
return false, fmt.Errorf("coordination.forward_shutdown_host is empty for peer role")
|
||||
}
|
||||
user := strings.TrimSpace(cfg.Coordination.ForwardShutdownUser)
|
||||
if user == "" {
|
||||
if override, ok := cfg.SSHNodeUsers[coordinator]; ok && strings.TrimSpace(override) != "" {
|
||||
user = strings.TrimSpace(override)
|
||||
} else {
|
||||
user = strings.TrimSpace(cfg.SSHUser)
|
||||
}
|
||||
}
|
||||
|
||||
host := coordinator
|
||||
if mapped, ok := cfg.SSHNodeHosts[coordinator]; ok && strings.TrimSpace(mapped) != "" {
|
||||
host = strings.TrimSpace(mapped)
|
||||
}
|
||||
target := host
|
||||
if user != "" {
|
||||
target = user + "@" + host
|
||||
}
|
||||
|
||||
args := []string{
|
||||
"-o", "BatchMode=yes",
|
||||
"-o", "ConnectTimeout=8",
|
||||
"-o", "StrictHostKeyChecking=accept-new",
|
||||
}
|
||||
if cfgPath := resolveSSHConfigFile(cfg); cfgPath != "" {
|
||||
args = append(args, "-F", cfgPath)
|
||||
}
|
||||
if idPath := resolveSSHIdentityFile(cfg); idPath != "" {
|
||||
args = append(args, "-i", idPath)
|
||||
}
|
||||
if cfg.SSHPort > 0 {
|
||||
args = append(args, "-p", strconv.Itoa(cfg.SSHPort))
|
||||
}
|
||||
if cfg.SSHJumpHost != "" {
|
||||
jump := cfg.SSHJumpHost
|
||||
if cfg.SSHJumpUser != "" {
|
||||
jump = cfg.SSHJumpUser + "@" + jump
|
||||
}
|
||||
if cfg.SSHPort > 0 && !strings.Contains(jump, ":") {
|
||||
jump = fmt.Sprintf("%s:%d", jump, cfg.SSHPort)
|
||||
}
|
||||
args = append(args, "-J", jump)
|
||||
}
|
||||
|
||||
remote := "sudo -n systemctl start hecate-bootstrap.service"
|
||||
attempt := 1
|
||||
for {
|
||||
cmdArgs := append(append([]string{}, args...), target, remote)
|
||||
cmd := exec.CommandContext(ctx, "ssh", cmdArgs...)
|
||||
out, err := cmd.CombinedOutput()
|
||||
if err == nil {
|
||||
logger.Printf("peer bootstrap handoff succeeded on %s (attempt=%d)", coordinator, attempt)
|
||||
return true, nil
|
||||
}
|
||||
trimmed := strings.TrimSpace(string(out))
|
||||
if trimmed == "" {
|
||||
logger.Printf("peer bootstrap handoff attempt %d failed for %s: %v", attempt, coordinator, err)
|
||||
} else {
|
||||
logger.Printf("peer bootstrap handoff attempt %d failed for %s: %v: %s", attempt, coordinator, err, trimmed)
|
||||
}
|
||||
|
||||
select {
|
||||
case <-ctx.Done():
|
||||
return false, fmt.Errorf("coordinator handoff timeout for %s: %w", coordinator, ctx.Err())
|
||||
case <-time.After(5 * time.Second):
|
||||
attempt++
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
func resolveSSHConfigFile(cfg config.Config) string {
|
||||
if strings.TrimSpace(cfg.SSHConfigFile) != "" {
|
||||
return strings.TrimSpace(cfg.SSHConfigFile)
|
||||
}
|
||||
candidates := []string{
|
||||
"/home/atlas/.ssh/config",
|
||||
"/home/tethys/.ssh/config",
|
||||
}
|
||||
for _, p := range candidates {
|
||||
if stat, err := os.Stat(p); err == nil && !stat.IsDir() {
|
||||
return p
|
||||
}
|
||||
}
|
||||
return ""
|
||||
}
|
||||
|
||||
func resolveSSHIdentityFile(cfg config.Config) string {
|
||||
if strings.TrimSpace(cfg.SSHIdentityFile) != "" {
|
||||
return strings.TrimSpace(cfg.SSHIdentityFile)
|
||||
}
|
||||
candidates := []string{
|
||||
"/home/atlas/.ssh/id_ed25519",
|
||||
"/home/tethys/.ssh/id_ed25519",
|
||||
}
|
||||
for _, p := range candidates {
|
||||
if stat, err := os.Stat(p); err == nil && !stat.IsDir() {
|
||||
return p
|
||||
}
|
||||
}
|
||||
return ""
|
||||
}
|
||||
|
||||
// maxInt returns the larger of a and b.
func maxInt(a, b int) int {
	if b > a {
		return b
	}
	return a
}
|
||||
|
||||
@ -43,7 +43,7 @@ startup:
|
||||
api_wait_seconds: 1200
|
||||
api_poll_seconds: 2
|
||||
shutdown:
|
||||
default_budget_seconds: 300
|
||||
default_budget_seconds: 1380
|
||||
skip_etcd_snapshot: false
|
||||
skip_drain: false
|
||||
drain_parallelism: 6
|
||||
@ -62,7 +62,7 @@ ups:
|
||||
- name: Pyrphoros
|
||||
target: pyrphoros@localhost
|
||||
poll_seconds: 5
|
||||
runtime_safety_factor: 1.10
|
||||
runtime_safety_factor: 1.25
|
||||
debounce_count: 3
|
||||
telemetry_timeout_seconds: 90
|
||||
coordination:
|
||||
|
||||
@ -32,6 +32,15 @@ ssh_node_users:
|
||||
titan-24: tethys
|
||||
ssh_managed_nodes:
|
||||
- titan-db
|
||||
- titan-0a
|
||||
- titan-0b
|
||||
- titan-0c
|
||||
- titan-12
|
||||
- titan-14
|
||||
- titan-15
|
||||
- titan-17
|
||||
- titan-18
|
||||
- titan-22
|
||||
- titan-24
|
||||
ssh_jump_host: ""
|
||||
ssh_jump_user: ""
|
||||
@ -60,7 +69,7 @@ startup:
|
||||
api_wait_seconds: 1200
|
||||
api_poll_seconds: 2
|
||||
shutdown:
|
||||
default_budget_seconds: 300
|
||||
default_budget_seconds: 1380
|
||||
skip_etcd_snapshot: false
|
||||
skip_drain: false
|
||||
drain_parallelism: 6
|
||||
@ -77,7 +86,7 @@ ups:
|
||||
- name: Statera
|
||||
target: statera@localhost
|
||||
poll_seconds: 5
|
||||
runtime_safety_factor: 1.10
|
||||
runtime_safety_factor: 1.25
|
||||
debounce_count: 3
|
||||
telemetry_timeout_seconds: 90
|
||||
coordination:
|
||||
|
||||
@ -77,7 +77,7 @@ startup:
|
||||
api_wait_seconds: 1200
|
||||
api_poll_seconds: 2
|
||||
shutdown:
|
||||
default_budget_seconds: 300
|
||||
default_budget_seconds: 1380
|
||||
skip_etcd_snapshot: false
|
||||
skip_drain: false
|
||||
drain_parallelism: 6
|
||||
@ -95,7 +95,7 @@ ups:
|
||||
- name: Pyrphoros
|
||||
target: pyrphoros@localhost
|
||||
poll_seconds: 5
|
||||
runtime_safety_factor: 1.10
|
||||
runtime_safety_factor: 1.25
|
||||
debounce_count: 3
|
||||
telemetry_timeout_seconds: 90
|
||||
coordination:
|
||||
|
||||
@ -3,12 +3,15 @@ Description=Hecate Staged Cluster Bootstrap
|
||||
Wants=network-online.target
|
||||
After=network-online.target
|
||||
ConditionPathExists=/etc/hecate/hecate.yaml
|
||||
StartLimitIntervalSec=0
|
||||
|
||||
[Service]
|
||||
Type=oneshot
|
||||
User=root
|
||||
Group=root
|
||||
ExecStart=/usr/local/bin/hecate startup --config /etc/hecate/hecate.yaml --execute --force-flux-branch main
|
||||
ExecStart=/usr/local/bin/hecate startup --config /etc/hecate/hecate.yaml --execute --force-flux-branch main --auto-peer-failover --peer-wait-seconds 180
|
||||
Restart=on-failure
|
||||
RestartSec=30
|
||||
TimeoutStartSec=1800
|
||||
|
||||
[Install]
|
||||
|
||||
@ -279,7 +279,16 @@ func (o *Orchestrator) effectiveWorkers(ctx context.Context) ([]string, error) {
|
||||
if len(o.cfg.Workers) > 0 {
|
||||
return append([]string{}, o.cfg.Workers...), nil
|
||||
}
|
||||
return o.discoverWorkers(ctx)
|
||||
workers, err := o.discoverWorkers(ctx)
|
||||
if err == nil {
|
||||
return workers, nil
|
||||
}
|
||||
fallback := o.fallbackWorkersFromInventory()
|
||||
if len(fallback) == 0 {
|
||||
return nil, err
|
||||
}
|
||||
o.log.Printf("warning: worker discovery failed via kubernetes API (%v); falling back to inventory workers=%s", err, strings.Join(fallback, ","))
|
||||
return fallback, nil
|
||||
}
|
||||
|
||||
func (o *Orchestrator) discoverWorkers(ctx context.Context) ([]string, error) {
|
||||
@ -307,6 +316,41 @@ func (o *Orchestrator) discoverWorkers(ctx context.Context) ([]string, error) {
|
||||
return workers, nil
|
||||
}
|
||||
|
||||
func (o *Orchestrator) fallbackWorkersFromInventory() []string {
|
||||
cp := make(map[string]struct{}, len(o.cfg.ControlPlanes))
|
||||
for _, node := range o.cfg.ControlPlanes {
|
||||
cp[strings.TrimSpace(node)] = struct{}{}
|
||||
}
|
||||
|
||||
candidates := make(map[string]struct{})
|
||||
add := func(node string) {
|
||||
name := strings.TrimSpace(node)
|
||||
if name == "" {
|
||||
return
|
||||
}
|
||||
if _, isCP := cp[name]; isCP {
|
||||
return
|
||||
}
|
||||
candidates[name] = struct{}{}
|
||||
}
|
||||
|
||||
for _, node := range o.cfg.SSHManagedNodes {
|
||||
add(node)
|
||||
}
|
||||
if len(candidates) == 0 {
|
||||
for node := range o.cfg.SSHNodeHosts {
|
||||
add(node)
|
||||
}
|
||||
}
|
||||
|
||||
workers := make([]string, 0, len(candidates))
|
||||
for node := range candidates {
|
||||
workers = append(workers, node)
|
||||
}
|
||||
sort.Strings(workers)
|
||||
return workers
|
||||
}
|
||||
|
||||
func (o *Orchestrator) patchFluxSuspendAll(ctx context.Context, suspend bool) error {
|
||||
patch := fmt.Sprintf(`{"spec":{"suspend":%t}}`, suspend)
|
||||
|
||||
|
||||
@ -1,6 +1,13 @@
|
||||
package cluster
|
||||
|
||||
import "testing"
|
||||
import (
|
||||
"log"
|
||||
"os"
|
||||
"reflect"
|
||||
"testing"
|
||||
|
||||
"scm.bstein.dev/bstein/hecate/internal/config"
|
||||
)
|
||||
|
||||
func TestParseVaultSealed(t *testing.T) {
|
||||
sealed, err := parseVaultSealed(`{"initialized":true,"sealed":true}`)
|
||||
@ -36,3 +43,42 @@ func TestParseVaultSealedWithKubectlPreamble(t *testing.T) {
|
||||
t.Fatalf("expected sealed=true from payload with preamble")
|
||||
}
|
||||
}
|
||||
|
||||
func TestFallbackWorkersFromInventoryUsesManagedNodes(t *testing.T) {
|
||||
orch := &Orchestrator{
|
||||
cfg: config.Config{
|
||||
ControlPlanes: []string{"titan-0a", "titan-0b", "titan-0c"},
|
||||
SSHManagedNodes: []string{
|
||||
"titan-db",
|
||||
"titan-0a",
|
||||
"titan-15",
|
||||
"titan-17",
|
||||
},
|
||||
},
|
||||
log: log.New(os.Stdout, "", 0),
|
||||
}
|
||||
got := orch.fallbackWorkersFromInventory()
|
||||
want := []string{"titan-15", "titan-17", "titan-db"}
|
||||
if !reflect.DeepEqual(got, want) {
|
||||
t.Fatalf("fallback workers mismatch: got=%v want=%v", got, want)
|
||||
}
|
||||
}
|
||||
|
||||
func TestFallbackWorkersFromInventoryFallsBackToHosts(t *testing.T) {
|
||||
orch := &Orchestrator{
|
||||
cfg: config.Config{
|
||||
ControlPlanes: []string{"titan-0a", "titan-0b", "titan-0c"},
|
||||
SSHNodeHosts: map[string]string{
|
||||
"titan-0a": "192.168.22.11",
|
||||
"titan-22": "192.168.22.22",
|
||||
"titan-24": "192.168.22.26",
|
||||
},
|
||||
},
|
||||
log: log.New(os.Stdout, "", 0),
|
||||
}
|
||||
got := orch.fallbackWorkersFromInventory()
|
||||
want := []string{"titan-22", "titan-24"}
|
||||
if !reflect.DeepEqual(got, want) {
|
||||
t.Fatalf("fallback workers mismatch: got=%v want=%v", got, want)
|
||||
}
|
||||
}
|
||||
|
||||
@ -58,6 +58,19 @@ resolve_nut_ups_name() {
|
||||
echo "[install] defaulting NUT UPS name to ${NUT_UPS_NAME}"
|
||||
}
|
||||
|
||||
# read_hecate_role prints the coordination role declared in hecate.yaml,
# defaulting to "coordinator" when the config file is missing or carries
# no role key.
read_hecate_role() {
  local role=""
  if [[ -f "${CONF_DIR}/hecate.yaml" ]]; then
    # First "role:" key wins; awk failures fall through to the default.
    role="$(awk '/^[[:space:]]*role:[[:space:]]*/ {print $2; exit}' "${CONF_DIR}/hecate.yaml" 2>/dev/null || true)"
  fi
  echo "${role:-coordinator}"
}
|
||||
|
||||
ensure_apt_packages() {
|
||||
local missing=()
|
||||
for pkg in "$@"; do
|
||||
@ -208,7 +221,9 @@ if [[ "${ENABLE_BOOTSTRAP}" == "1" ]]; then
|
||||
elif [[ "${ENABLE_BOOTSTRAP}" == "0" ]]; then
|
||||
systemctl disable hecate-bootstrap.service >/dev/null 2>&1 || true
|
||||
else
|
||||
echo "[install] leaving hecate-bootstrap.service state unchanged (HECATE_ENABLE_BOOTSTRAP=${ENABLE_BOOTSTRAP})"
|
||||
role="$(read_hecate_role)"
|
||||
systemctl enable hecate-bootstrap.service
|
||||
echo "[install] auto-enabled hecate-bootstrap.service for role=${role}"
|
||||
fi
|
||||
|
||||
if [[ "${START_NOW}" -eq 1 ]]; then
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user