hecate: harden peer bootstrap failover and worker fallback
This commit is contained in:
parent
4b0fffd5e2
commit
985da478c6
@ -23,6 +23,7 @@ Hecate runs outside the cluster under systemd, so it can always orchestrate brin
|
|||||||
|
|
||||||
Key startup guards:
|
Key startup guards:
|
||||||
- Startup is blocked on hosts configured as `coordination.role: peer` (unless `--allow-peer-startup` is used intentionally).
|
- Startup is blocked on hosts configured as `coordination.role: peer` (unless `--allow-peer-startup` is used intentionally).
|
||||||
|
- `--auto-peer-failover` makes peer hosts hand off startup to the coordinator first, then run local startup only if that handoff does not succeed within `--peer-wait-seconds` (default 180).
|
||||||
- Startup is blocked while UPS is on battery by default (unless `--allow-on-battery` or `coordination.allow_startup_on_battery: true` is set).
|
- Startup is blocked while UPS is on battery by default (unless `--allow-on-battery` or `coordination.allow_startup_on_battery: true` is set).
|
||||||
- Startup is blocked when a shutdown intent is active (`/var/lib/hecate/intent.json`).
|
- Startup is blocked when a shutdown intent is active (`/var/lib/hecate/intent.json`).
|
||||||
|
|
||||||
@ -45,7 +46,7 @@ The installer is idempotent:
|
|||||||
|
|
||||||
Installer knobs (optional):
|
Installer knobs (optional):
|
||||||
- `HECATE_ENABLE_BOOTSTRAP=1` enables `hecate-bootstrap.service` on this host.
|
- `HECATE_ENABLE_BOOTSTRAP=1` enables `hecate-bootstrap.service` on this host.
|
||||||
- `HECATE_ENABLE_BOOTSTRAP=0` disables it; default `auto` preserves current bootstrap enablement state.
|
- `HECATE_ENABLE_BOOTSTRAP=0` disables it; the default value, `auto`, enables bootstrap.
|
||||||
- `HECATE_MANAGE_NUT=0` skips writing NUT/udev files.
|
- `HECATE_MANAGE_NUT=0` skips writing NUT/udev files.
|
||||||
- `HECATE_NUT_UPS_NAME` (default inferred from `/etc/hecate/hecate.yaml` target, fallback `pyrphoros`)
|
- `HECATE_NUT_UPS_NAME` (default inferred from `/etc/hecate/hecate.yaml` target, fallback `pyrphoros`)
|
||||||
- `HECATE_NUT_VENDOR_ID` / `HECATE_NUT_PRODUCT_ID` (defaults `0764` / `0601`)
|
- `HECATE_NUT_VENDOR_ID` / `HECATE_NUT_PRODUCT_ID` (defaults `0764` / `0601`)
|
||||||
@ -77,6 +78,7 @@ Optional SSH jump/bastion:
|
|||||||
Recommended:
|
Recommended:
|
||||||
- `titan-db` runs Hecate as the shutdown coordinator with UPS `Pyrphoros` (`pyrphoros@localhost`).
|
- `titan-db` runs Hecate as the shutdown coordinator with UPS `Pyrphoros` (`pyrphoros@localhost`).
|
||||||
- `tethys` runs Hecate as a peer with UPS `Statera` (`statera@localhost`) and forwards shutdown triggers to `titan-db`.
|
- `tethys` runs Hecate as a peer with UPS `Statera` (`statera@localhost`) and forwards shutdown triggers to `titan-db`.
|
||||||
|
- The bootstrap unit now runs on both roles; peer role uses auto-failover handoff to coordinator before local fallback startup.
|
||||||
- If forwarding fails, fallback local shutdown can remain enabled.
|
- If forwarding fails, fallback local shutdown can remain enabled.
|
||||||
- Use `coordination.role: coordinator` on `titan-db` and `coordination.role: peer` on `tethys`.
|
- Use `coordination.role: coordinator` on `titan-db` and `coordination.role: peer` on `tethys`.
|
||||||
|
|
||||||
@ -100,7 +102,7 @@ Power metrics:
|
|||||||
- Default behavior for `startup` and `shutdown` is dry-run unless `--execute` is set.
|
- Default behavior for `startup` and `shutdown` is dry-run unless `--execute` is set.
|
||||||
- Hecate tracks intent in `/var/lib/hecate/intent.json` (`normal`, `startup_in_progress`, `shutting_down`, `shutdown_complete`) to avoid startup/shutdown fighting each other.
|
- Hecate tracks intent in `/var/lib/hecate/intent.json` (`normal`, `startup_in_progress`, `shutting_down`, `shutdown_complete`) to avoid startup/shutdown fighting each other.
|
||||||
- `hecate-bootstrap.service` is enabled to run at host boot and perform staged startup automatically.
|
- `hecate-bootstrap.service` is enabled to run at host boot and perform staged startup automatically.
|
||||||
- `HECATE_ENABLE_BOOTSTRAP=1` enables `hecate-bootstrap.service` (recommended on `titan-db`; keep disabled on non-coordinator hosts).
|
- `HECATE_ENABLE_BOOTSTRAP=1` forces bootstrap on, `HECATE_ENABLE_BOOTSTRAP=0` forces it off, and `auto` (the default) enables it.
|
||||||
- `hecate-update.timer` runs on boot and periodically to pull latest `main` and reinstall Hecate declaratively.
|
- `hecate-update.timer` runs on boot and periodically to pull latest `main` and reinstall Hecate declaratively.
|
||||||
|
|
||||||
## Disruptive startup drills
|
## Disruptive startup drills
|
||||||
|
|||||||
@ -7,7 +7,9 @@ import (
|
|||||||
"fmt"
|
"fmt"
|
||||||
"log"
|
"log"
|
||||||
"os"
|
"os"
|
||||||
|
"os/exec"
|
||||||
"os/signal"
|
"os/signal"
|
||||||
|
"strconv"
|
||||||
"strings"
|
"strings"
|
||||||
"syscall"
|
"syscall"
|
||||||
"time"
|
"time"
|
||||||
@ -65,6 +67,8 @@ func runStartup(logger *log.Logger, args []string) error {
|
|||||||
forceBranch := fs.String("force-flux-branch", "", "Patch Flux source branch before resume")
|
forceBranch := fs.String("force-flux-branch", "", "Patch Flux source branch before resume")
|
||||||
skipLocalBootstrap := fs.Bool("skip-local-bootstrap", false, "Skip local fallback bootstrap applies")
|
skipLocalBootstrap := fs.Bool("skip-local-bootstrap", false, "Skip local fallback bootstrap applies")
|
||||||
allowPeerStartup := fs.Bool("allow-peer-startup", false, "Allow startup to run on a peer instance")
|
allowPeerStartup := fs.Bool("allow-peer-startup", false, "Allow startup to run on a peer instance")
|
||||||
|
autoPeerFailover := fs.Bool("auto-peer-failover", false, "On peer role, try coordinator bootstrap handoff first and only run local startup as fallback")
|
||||||
|
peerWaitSeconds := fs.Int("peer-wait-seconds", 180, "How long auto peer failover waits for coordinator handoff before local fallback startup")
|
||||||
allowOnBattery := fs.Bool("allow-on-battery", false, "Allow startup when UPS reports on-battery")
|
allowOnBattery := fs.Bool("allow-on-battery", false, "Allow startup when UPS reports on-battery")
|
||||||
reason := fs.String("reason", "manual-startup", "Startup reason for run history")
|
reason := fs.String("reason", "manual-startup", "Startup reason for run history")
|
||||||
_ = fs.Parse(args)
|
_ = fs.Parse(args)
|
||||||
@ -73,9 +77,25 @@ func runStartup(logger *log.Logger, args []string) error {
|
|||||||
if err != nil {
|
if err != nil {
|
||||||
return err
|
return err
|
||||||
}
|
}
|
||||||
|
allowPeer := *allowPeerStartup
|
||||||
if *execute {
|
if *execute {
|
||||||
if cfg.Coordination.Role == "peer" && !*allowPeerStartup {
|
if cfg.Coordination.Role == "peer" && !allowPeer {
|
||||||
return fmt.Errorf("startup blocked: this instance is configured as role=peer (use --allow-peer-startup to override)")
|
if *autoPeerFailover {
|
||||||
|
handoffCtx, cancel := context.WithTimeout(context.Background(), time.Duration(maxInt(*peerWaitSeconds, 1))*time.Second)
|
||||||
|
defer cancel()
|
||||||
|
handoff, handoffErr := tryPeerBootstrapHandoff(handoffCtx, cfg, logger)
|
||||||
|
if handoffErr != nil {
|
||||||
|
logger.Printf("warning: peer bootstrap handoff failed: %v", handoffErr)
|
||||||
|
}
|
||||||
|
if handoff {
|
||||||
|
logger.Printf("peer startup handoff complete; skipping local startup")
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
logger.Printf("peer startup handoff unavailable; proceeding with local peer startup fallback")
|
||||||
|
allowPeer = true
|
||||||
|
} else {
|
||||||
|
return fmt.Errorf("startup blocked: this instance is configured as role=peer (use --allow-peer-startup to override)")
|
||||||
|
}
|
||||||
}
|
}
|
||||||
if cfg.UPS.Enabled && !cfg.Coordination.AllowStartupOnBattery && !*allowOnBattery {
|
if cfg.UPS.Enabled && !cfg.Coordination.AllowStartupOnBattery && !*allowOnBattery {
|
||||||
targets, targetErr := buildUPSTargets(cfg)
|
targets, targetErr := buildUPSTargets(cfg)
|
||||||
@ -270,3 +290,116 @@ Examples:
|
|||||||
hecate status --config /etc/hecate/hecate.yaml
|
hecate status --config /etc/hecate/hecate.yaml
|
||||||
`)
|
`)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func tryPeerBootstrapHandoff(ctx context.Context, cfg config.Config, logger *log.Logger) (bool, error) {
|
||||||
|
coordinator := strings.TrimSpace(cfg.Coordination.ForwardShutdownHost)
|
||||||
|
if coordinator == "" {
|
||||||
|
return false, fmt.Errorf("coordination.forward_shutdown_host is empty for peer role")
|
||||||
|
}
|
||||||
|
user := strings.TrimSpace(cfg.Coordination.ForwardShutdownUser)
|
||||||
|
if user == "" {
|
||||||
|
if override, ok := cfg.SSHNodeUsers[coordinator]; ok && strings.TrimSpace(override) != "" {
|
||||||
|
user = strings.TrimSpace(override)
|
||||||
|
} else {
|
||||||
|
user = strings.TrimSpace(cfg.SSHUser)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
host := coordinator
|
||||||
|
if mapped, ok := cfg.SSHNodeHosts[coordinator]; ok && strings.TrimSpace(mapped) != "" {
|
||||||
|
host = strings.TrimSpace(mapped)
|
||||||
|
}
|
||||||
|
target := host
|
||||||
|
if user != "" {
|
||||||
|
target = user + "@" + host
|
||||||
|
}
|
||||||
|
|
||||||
|
args := []string{
|
||||||
|
"-o", "BatchMode=yes",
|
||||||
|
"-o", "ConnectTimeout=8",
|
||||||
|
"-o", "StrictHostKeyChecking=accept-new",
|
||||||
|
}
|
||||||
|
if cfgPath := resolveSSHConfigFile(cfg); cfgPath != "" {
|
||||||
|
args = append(args, "-F", cfgPath)
|
||||||
|
}
|
||||||
|
if idPath := resolveSSHIdentityFile(cfg); idPath != "" {
|
||||||
|
args = append(args, "-i", idPath)
|
||||||
|
}
|
||||||
|
if cfg.SSHPort > 0 {
|
||||||
|
args = append(args, "-p", strconv.Itoa(cfg.SSHPort))
|
||||||
|
}
|
||||||
|
if cfg.SSHJumpHost != "" {
|
||||||
|
jump := cfg.SSHJumpHost
|
||||||
|
if cfg.SSHJumpUser != "" {
|
||||||
|
jump = cfg.SSHJumpUser + "@" + jump
|
||||||
|
}
|
||||||
|
if cfg.SSHPort > 0 && !strings.Contains(jump, ":") {
|
||||||
|
jump = fmt.Sprintf("%s:%d", jump, cfg.SSHPort)
|
||||||
|
}
|
||||||
|
args = append(args, "-J", jump)
|
||||||
|
}
|
||||||
|
|
||||||
|
remote := "sudo -n systemctl start hecate-bootstrap.service"
|
||||||
|
attempt := 1
|
||||||
|
for {
|
||||||
|
cmdArgs := append(append([]string{}, args...), target, remote)
|
||||||
|
cmd := exec.CommandContext(ctx, "ssh", cmdArgs...)
|
||||||
|
out, err := cmd.CombinedOutput()
|
||||||
|
if err == nil {
|
||||||
|
logger.Printf("peer bootstrap handoff succeeded on %s (attempt=%d)", coordinator, attempt)
|
||||||
|
return true, nil
|
||||||
|
}
|
||||||
|
trimmed := strings.TrimSpace(string(out))
|
||||||
|
if trimmed == "" {
|
||||||
|
logger.Printf("peer bootstrap handoff attempt %d failed for %s: %v", attempt, coordinator, err)
|
||||||
|
} else {
|
||||||
|
logger.Printf("peer bootstrap handoff attempt %d failed for %s: %v: %s", attempt, coordinator, err, trimmed)
|
||||||
|
}
|
||||||
|
|
||||||
|
select {
|
||||||
|
case <-ctx.Done():
|
||||||
|
return false, fmt.Errorf("coordinator handoff timeout for %s: %w", coordinator, ctx.Err())
|
||||||
|
case <-time.After(5 * time.Second):
|
||||||
|
attempt++
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func resolveSSHConfigFile(cfg config.Config) string {
|
||||||
|
if strings.TrimSpace(cfg.SSHConfigFile) != "" {
|
||||||
|
return strings.TrimSpace(cfg.SSHConfigFile)
|
||||||
|
}
|
||||||
|
candidates := []string{
|
||||||
|
"/home/atlas/.ssh/config",
|
||||||
|
"/home/tethys/.ssh/config",
|
||||||
|
}
|
||||||
|
for _, p := range candidates {
|
||||||
|
if stat, err := os.Stat(p); err == nil && !stat.IsDir() {
|
||||||
|
return p
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return ""
|
||||||
|
}
|
||||||
|
|
||||||
|
func resolveSSHIdentityFile(cfg config.Config) string {
|
||||||
|
if strings.TrimSpace(cfg.SSHIdentityFile) != "" {
|
||||||
|
return strings.TrimSpace(cfg.SSHIdentityFile)
|
||||||
|
}
|
||||||
|
candidates := []string{
|
||||||
|
"/home/atlas/.ssh/id_ed25519",
|
||||||
|
"/home/tethys/.ssh/id_ed25519",
|
||||||
|
}
|
||||||
|
for _, p := range candidates {
|
||||||
|
if stat, err := os.Stat(p); err == nil && !stat.IsDir() {
|
||||||
|
return p
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return ""
|
||||||
|
}
|
||||||
|
|
||||||
|
// maxInt returns the larger of a and b.
func maxInt(a, b int) int {
	if a <= b {
		return b
	}
	return a
}
|
||||||
|
|||||||
@ -43,7 +43,7 @@ startup:
|
|||||||
api_wait_seconds: 1200
|
api_wait_seconds: 1200
|
||||||
api_poll_seconds: 2
|
api_poll_seconds: 2
|
||||||
shutdown:
|
shutdown:
|
||||||
default_budget_seconds: 300
|
default_budget_seconds: 1380
|
||||||
skip_etcd_snapshot: false
|
skip_etcd_snapshot: false
|
||||||
skip_drain: false
|
skip_drain: false
|
||||||
drain_parallelism: 6
|
drain_parallelism: 6
|
||||||
@ -62,7 +62,7 @@ ups:
|
|||||||
- name: Pyrphoros
|
- name: Pyrphoros
|
||||||
target: pyrphoros@localhost
|
target: pyrphoros@localhost
|
||||||
poll_seconds: 5
|
poll_seconds: 5
|
||||||
runtime_safety_factor: 1.10
|
runtime_safety_factor: 1.25
|
||||||
debounce_count: 3
|
debounce_count: 3
|
||||||
telemetry_timeout_seconds: 90
|
telemetry_timeout_seconds: 90
|
||||||
coordination:
|
coordination:
|
||||||
|
|||||||
@ -32,6 +32,15 @@ ssh_node_users:
|
|||||||
titan-24: tethys
|
titan-24: tethys
|
||||||
ssh_managed_nodes:
|
ssh_managed_nodes:
|
||||||
- titan-db
|
- titan-db
|
||||||
|
- titan-0a
|
||||||
|
- titan-0b
|
||||||
|
- titan-0c
|
||||||
|
- titan-12
|
||||||
|
- titan-14
|
||||||
|
- titan-15
|
||||||
|
- titan-17
|
||||||
|
- titan-18
|
||||||
|
- titan-22
|
||||||
- titan-24
|
- titan-24
|
||||||
ssh_jump_host: ""
|
ssh_jump_host: ""
|
||||||
ssh_jump_user: ""
|
ssh_jump_user: ""
|
||||||
@ -60,7 +69,7 @@ startup:
|
|||||||
api_wait_seconds: 1200
|
api_wait_seconds: 1200
|
||||||
api_poll_seconds: 2
|
api_poll_seconds: 2
|
||||||
shutdown:
|
shutdown:
|
||||||
default_budget_seconds: 300
|
default_budget_seconds: 1380
|
||||||
skip_etcd_snapshot: false
|
skip_etcd_snapshot: false
|
||||||
skip_drain: false
|
skip_drain: false
|
||||||
drain_parallelism: 6
|
drain_parallelism: 6
|
||||||
@ -77,7 +86,7 @@ ups:
|
|||||||
- name: Statera
|
- name: Statera
|
||||||
target: statera@localhost
|
target: statera@localhost
|
||||||
poll_seconds: 5
|
poll_seconds: 5
|
||||||
runtime_safety_factor: 1.10
|
runtime_safety_factor: 1.25
|
||||||
debounce_count: 3
|
debounce_count: 3
|
||||||
telemetry_timeout_seconds: 90
|
telemetry_timeout_seconds: 90
|
||||||
coordination:
|
coordination:
|
||||||
|
|||||||
@ -77,7 +77,7 @@ startup:
|
|||||||
api_wait_seconds: 1200
|
api_wait_seconds: 1200
|
||||||
api_poll_seconds: 2
|
api_poll_seconds: 2
|
||||||
shutdown:
|
shutdown:
|
||||||
default_budget_seconds: 300
|
default_budget_seconds: 1380
|
||||||
skip_etcd_snapshot: false
|
skip_etcd_snapshot: false
|
||||||
skip_drain: false
|
skip_drain: false
|
||||||
drain_parallelism: 6
|
drain_parallelism: 6
|
||||||
@ -95,7 +95,7 @@ ups:
|
|||||||
- name: Pyrphoros
|
- name: Pyrphoros
|
||||||
target: pyrphoros@localhost
|
target: pyrphoros@localhost
|
||||||
poll_seconds: 5
|
poll_seconds: 5
|
||||||
runtime_safety_factor: 1.10
|
runtime_safety_factor: 1.25
|
||||||
debounce_count: 3
|
debounce_count: 3
|
||||||
telemetry_timeout_seconds: 90
|
telemetry_timeout_seconds: 90
|
||||||
coordination:
|
coordination:
|
||||||
|
|||||||
@ -3,12 +3,15 @@ Description=Hecate Staged Cluster Bootstrap
|
|||||||
Wants=network-online.target
|
Wants=network-online.target
|
||||||
After=network-online.target
|
After=network-online.target
|
||||||
ConditionPathExists=/etc/hecate/hecate.yaml
|
ConditionPathExists=/etc/hecate/hecate.yaml
|
||||||
|
StartLimitIntervalSec=0
|
||||||
|
|
||||||
[Service]
|
[Service]
|
||||||
Type=oneshot
|
Type=oneshot
|
||||||
User=root
|
User=root
|
||||||
Group=root
|
Group=root
|
||||||
ExecStart=/usr/local/bin/hecate startup --config /etc/hecate/hecate.yaml --execute --force-flux-branch main
|
ExecStart=/usr/local/bin/hecate startup --config /etc/hecate/hecate.yaml --execute --force-flux-branch main --auto-peer-failover --peer-wait-seconds 180
|
||||||
|
Restart=on-failure
|
||||||
|
RestartSec=30
|
||||||
TimeoutStartSec=1800
|
TimeoutStartSec=1800
|
||||||
|
|
||||||
[Install]
|
[Install]
|
||||||
|
|||||||
@ -279,7 +279,16 @@ func (o *Orchestrator) effectiveWorkers(ctx context.Context) ([]string, error) {
|
|||||||
if len(o.cfg.Workers) > 0 {
|
if len(o.cfg.Workers) > 0 {
|
||||||
return append([]string{}, o.cfg.Workers...), nil
|
return append([]string{}, o.cfg.Workers...), nil
|
||||||
}
|
}
|
||||||
return o.discoverWorkers(ctx)
|
workers, err := o.discoverWorkers(ctx)
|
||||||
|
if err == nil {
|
||||||
|
return workers, nil
|
||||||
|
}
|
||||||
|
fallback := o.fallbackWorkersFromInventory()
|
||||||
|
if len(fallback) == 0 {
|
||||||
|
return nil, err
|
||||||
|
}
|
||||||
|
o.log.Printf("warning: worker discovery failed via kubernetes API (%v); falling back to inventory workers=%s", err, strings.Join(fallback, ","))
|
||||||
|
return fallback, nil
|
||||||
}
|
}
|
||||||
|
|
||||||
func (o *Orchestrator) discoverWorkers(ctx context.Context) ([]string, error) {
|
func (o *Orchestrator) discoverWorkers(ctx context.Context) ([]string, error) {
|
||||||
@ -307,6 +316,41 @@ func (o *Orchestrator) discoverWorkers(ctx context.Context) ([]string, error) {
|
|||||||
return workers, nil
|
return workers, nil
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func (o *Orchestrator) fallbackWorkersFromInventory() []string {
|
||||||
|
cp := make(map[string]struct{}, len(o.cfg.ControlPlanes))
|
||||||
|
for _, node := range o.cfg.ControlPlanes {
|
||||||
|
cp[strings.TrimSpace(node)] = struct{}{}
|
||||||
|
}
|
||||||
|
|
||||||
|
candidates := make(map[string]struct{})
|
||||||
|
add := func(node string) {
|
||||||
|
name := strings.TrimSpace(node)
|
||||||
|
if name == "" {
|
||||||
|
return
|
||||||
|
}
|
||||||
|
if _, isCP := cp[name]; isCP {
|
||||||
|
return
|
||||||
|
}
|
||||||
|
candidates[name] = struct{}{}
|
||||||
|
}
|
||||||
|
|
||||||
|
for _, node := range o.cfg.SSHManagedNodes {
|
||||||
|
add(node)
|
||||||
|
}
|
||||||
|
if len(candidates) == 0 {
|
||||||
|
for node := range o.cfg.SSHNodeHosts {
|
||||||
|
add(node)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
workers := make([]string, 0, len(candidates))
|
||||||
|
for node := range candidates {
|
||||||
|
workers = append(workers, node)
|
||||||
|
}
|
||||||
|
sort.Strings(workers)
|
||||||
|
return workers
|
||||||
|
}
|
||||||
|
|
||||||
func (o *Orchestrator) patchFluxSuspendAll(ctx context.Context, suspend bool) error {
|
func (o *Orchestrator) patchFluxSuspendAll(ctx context.Context, suspend bool) error {
|
||||||
patch := fmt.Sprintf(`{"spec":{"suspend":%t}}`, suspend)
|
patch := fmt.Sprintf(`{"spec":{"suspend":%t}}`, suspend)
|
||||||
|
|
||||||
|
|||||||
@ -1,6 +1,13 @@
|
|||||||
package cluster
|
package cluster
|
||||||
|
|
||||||
import "testing"
|
import (
|
||||||
|
"log"
|
||||||
|
"os"
|
||||||
|
"reflect"
|
||||||
|
"testing"
|
||||||
|
|
||||||
|
"scm.bstein.dev/bstein/hecate/internal/config"
|
||||||
|
)
|
||||||
|
|
||||||
func TestParseVaultSealed(t *testing.T) {
|
func TestParseVaultSealed(t *testing.T) {
|
||||||
sealed, err := parseVaultSealed(`{"initialized":true,"sealed":true}`)
|
sealed, err := parseVaultSealed(`{"initialized":true,"sealed":true}`)
|
||||||
@ -36,3 +43,42 @@ func TestParseVaultSealedWithKubectlPreamble(t *testing.T) {
|
|||||||
t.Fatalf("expected sealed=true from payload with preamble")
|
t.Fatalf("expected sealed=true from payload with preamble")
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func TestFallbackWorkersFromInventoryUsesManagedNodes(t *testing.T) {
|
||||||
|
orch := &Orchestrator{
|
||||||
|
cfg: config.Config{
|
||||||
|
ControlPlanes: []string{"titan-0a", "titan-0b", "titan-0c"},
|
||||||
|
SSHManagedNodes: []string{
|
||||||
|
"titan-db",
|
||||||
|
"titan-0a",
|
||||||
|
"titan-15",
|
||||||
|
"titan-17",
|
||||||
|
},
|
||||||
|
},
|
||||||
|
log: log.New(os.Stdout, "", 0),
|
||||||
|
}
|
||||||
|
got := orch.fallbackWorkersFromInventory()
|
||||||
|
want := []string{"titan-15", "titan-17", "titan-db"}
|
||||||
|
if !reflect.DeepEqual(got, want) {
|
||||||
|
t.Fatalf("fallback workers mismatch: got=%v want=%v", got, want)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestFallbackWorkersFromInventoryFallsBackToHosts(t *testing.T) {
|
||||||
|
orch := &Orchestrator{
|
||||||
|
cfg: config.Config{
|
||||||
|
ControlPlanes: []string{"titan-0a", "titan-0b", "titan-0c"},
|
||||||
|
SSHNodeHosts: map[string]string{
|
||||||
|
"titan-0a": "192.168.22.11",
|
||||||
|
"titan-22": "192.168.22.22",
|
||||||
|
"titan-24": "192.168.22.26",
|
||||||
|
},
|
||||||
|
},
|
||||||
|
log: log.New(os.Stdout, "", 0),
|
||||||
|
}
|
||||||
|
got := orch.fallbackWorkersFromInventory()
|
||||||
|
want := []string{"titan-22", "titan-24"}
|
||||||
|
if !reflect.DeepEqual(got, want) {
|
||||||
|
t.Fatalf("fallback workers mismatch: got=%v want=%v", got, want)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|||||||
@ -58,6 +58,19 @@ resolve_nut_ups_name() {
|
|||||||
echo "[install] defaulting NUT UPS name to ${NUT_UPS_NAME}"
|
echo "[install] defaulting NUT UPS name to ${NUT_UPS_NAME}"
|
||||||
}
|
}
|
||||||
|
|
||||||
|
# read_hecate_role prints the value of coordination.role from
# ${CONF_DIR}/hecate.yaml, or "coordinator" when the file is missing or the
# key is absent or empty.
#
# Only a `role:` key nested under the top-level `coordination:` section is
# considered, so an unrelated `role:` key elsewhere in the file cannot be
# picked up by mistake. Surrounding quotes are stripped from the value.
read_hecate_role() {
  local conf="${CONF_DIR}/hecate.yaml"
  local role=""
  if [[ -f "${conf}" ]]; then
    # A non-indented, non-comment line starts a new top-level section; track
    # whether that section is `coordination:`. \047 is a single quote.
    role="$(awk '
      /^[^[:space:]#]/ { in_coord = ($0 ~ /^coordination:/); next }
      in_coord && /^[[:space:]]+role:/ {
        val = $2
        gsub(/["\047\r]/, "", val)
        print val
        exit
      }
    ' "${conf}" 2>/dev/null || true)"
  fi
  echo "${role:-coordinator}"
}
|
||||||
|
|
||||||
ensure_apt_packages() {
|
ensure_apt_packages() {
|
||||||
local missing=()
|
local missing=()
|
||||||
for pkg in "$@"; do
|
for pkg in "$@"; do
|
||||||
@ -208,7 +221,9 @@ if [[ "${ENABLE_BOOTSTRAP}" == "1" ]]; then
|
|||||||
elif [[ "${ENABLE_BOOTSTRAP}" == "0" ]]; then
|
elif [[ "${ENABLE_BOOTSTRAP}" == "0" ]]; then
|
||||||
systemctl disable hecate-bootstrap.service >/dev/null 2>&1 || true
|
systemctl disable hecate-bootstrap.service >/dev/null 2>&1 || true
|
||||||
else
|
else
|
||||||
echo "[install] leaving hecate-bootstrap.service state unchanged (HECATE_ENABLE_BOOTSTRAP=${ENABLE_BOOTSTRAP})"
|
role="$(read_hecate_role)"
|
||||||
|
systemctl enable hecate-bootstrap.service
|
||||||
|
echo "[install] auto-enabled hecate-bootstrap.service for role=${role}"
|
||||||
fi
|
fi
|
||||||
|
|
||||||
if [[ "${START_NOW}" -eq 1 ]]; then
|
if [[ "${START_NOW}" -eq 1 ]]; then
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user