hecate(startup): add coordinated intent guards and resilient recovery ssh
This commit is contained in:
parent
4c985000a8
commit
3af6fe9f6f
13
README.md
13
README.md
@ -21,6 +21,11 @@ Hecate runs outside the cluster under systemd, so it can always orchestrate brin
|
|||||||
- `hecate daemon --config /etc/hecate/hecate.yaml`
|
- `hecate daemon --config /etc/hecate/hecate.yaml`
|
||||||
- `hecate status --config /etc/hecate/hecate.yaml`
|
- `hecate status --config /etc/hecate/hecate.yaml`
|
||||||
|
|
||||||
|
Key startup guards:
|
||||||
|
- Startup is blocked on hosts configured as `coordination.role: peer` (unless `--allow-peer-startup` is used intentionally).
|
||||||
|
- Startup is blocked while UPS is on battery by default (unless `--allow-on-battery` or `coordination.allow_startup_on_battery: true` is set).
|
||||||
|
- Startup is blocked when a shutdown intent is active (`/var/lib/hecate/intent.json`).
|
||||||
|
|
||||||
## Manual install on titan-db
|
## Manual install on titan-db
|
||||||
|
|
||||||
```bash
|
```bash
|
||||||
@ -61,12 +66,19 @@ sudo systemctl start hecate-bootstrap.service
|
|||||||
- `systemctl start/stop k3s-agent`
|
- `systemctl start/stop k3s-agent`
|
||||||
- UPS telemetry available via NUT (`upsc`)
|
- UPS telemetry available via NUT (`upsc`)
|
||||||
|
|
||||||
|
Optional SSH jump/bastion:
|
||||||
|
- Set `ssh_jump_host` (and optional `ssh_jump_user`) to route node SSH through a jump host like `titan-jh`; Hecate now falls back to direct SSH automatically if jump routing is unavailable.
|
||||||
|
- Set `ssh_port`, `ssh_identity_file`, and `ssh_node_hosts` so root-run systemd actions can actually reach node SSH daemons during cold-start recovery.
|
||||||
|
- Use `ssh_node_users` for per-node username overrides (for example `titan-24: tethys`).
|
||||||
|
- Use `ssh_managed_nodes` to limit host-level SSH start/stop actions to nodes Hecate can actually authenticate to.
|
||||||
|
|
||||||
## Multi-UPS topology
|
## Multi-UPS topology
|
||||||
|
|
||||||
Recommended:
|
Recommended:
|
||||||
- `titan-db` runs Hecate as the shutdown coordinator with UPS `Pyrphoros` (`pyrphoros@localhost`).
|
- `titan-db` runs Hecate as the shutdown coordinator with UPS `Pyrphoros` (`pyrphoros@localhost`).
|
||||||
- `tethys` runs Hecate as a peer with UPS `Statera` (`statera@localhost`) and forwards shutdown triggers to `titan-db`.
|
- `tethys` runs Hecate as a peer with UPS `Statera` (`statera@localhost`) and forwards shutdown triggers to `titan-db`.
|
||||||
- If forwarding fails, fallback local shutdown can remain enabled.
|
- If forwarding fails, fallback local shutdown can remain enabled.
|
||||||
|
- Use `coordination.role: coordinator` on `titan-db` and `coordination.role: peer` on `tethys`.
|
||||||
|
|
||||||
## Config
|
## Config
|
||||||
|
|
||||||
@ -86,6 +98,7 @@ Power metrics:
|
|||||||
## Notes
|
## Notes
|
||||||
|
|
||||||
- Default behavior for `startup` and `shutdown` is dry-run unless `--execute` is set.
|
- Default behavior for `startup` and `shutdown` is dry-run unless `--execute` is set.
|
||||||
|
- Hecate tracks intent in `/var/lib/hecate/intent.json` (`normal`, `startup_in_progress`, `shutting_down`, `shutdown_complete`) to avoid startup/shutdown fighting each other.
|
||||||
- `hecate-bootstrap.service` is enabled to run at host boot and perform staged startup automatically.
|
- `hecate-bootstrap.service` is enabled to run at host boot and perform staged startup automatically.
|
||||||
- `HECATE_ENABLE_BOOTSTRAP=1` enables `hecate-bootstrap.service` (recommended on `titan-db`; keep disabled on non-coordinator hosts).
|
- `HECATE_ENABLE_BOOTSTRAP=1` enables `hecate-bootstrap.service` (recommended on `titan-db`; keep disabled on non-coordinator hosts).
|
||||||
- `hecate-update.timer` runs on boot and periodically to pull latest `main` and reinstall Hecate declaratively.
|
- `hecate-update.timer` runs on boot and periodically to pull latest `main` and reinstall Hecate declaratively.
|
||||||
|
|||||||
@ -8,6 +8,7 @@ import (
|
|||||||
"log"
|
"log"
|
||||||
"os"
|
"os"
|
||||||
"os/signal"
|
"os/signal"
|
||||||
|
"strings"
|
||||||
"syscall"
|
"syscall"
|
||||||
"time"
|
"time"
|
||||||
|
|
||||||
@ -63,19 +64,38 @@ func runStartup(logger *log.Logger, args []string) error {
|
|||||||
execute := fs.Bool("execute", false, "Actually execute changes (default dry-run)")
|
execute := fs.Bool("execute", false, "Actually execute changes (default dry-run)")
|
||||||
forceBranch := fs.String("force-flux-branch", "", "Patch Flux source branch before resume")
|
forceBranch := fs.String("force-flux-branch", "", "Patch Flux source branch before resume")
|
||||||
skipLocalBootstrap := fs.Bool("skip-local-bootstrap", false, "Skip local fallback bootstrap applies")
|
skipLocalBootstrap := fs.Bool("skip-local-bootstrap", false, "Skip local fallback bootstrap applies")
|
||||||
|
allowPeerStartup := fs.Bool("allow-peer-startup", false, "Allow startup to run on a peer instance")
|
||||||
|
allowOnBattery := fs.Bool("allow-on-battery", false, "Allow startup when UPS reports on-battery")
|
||||||
|
reason := fs.String("reason", "manual-startup", "Startup reason for run history")
|
||||||
_ = fs.Parse(args)
|
_ = fs.Parse(args)
|
||||||
|
|
||||||
_, orch, err := buildOrchestrator(logger, *configPath, !*execute)
|
cfg, orch, err := buildOrchestrator(logger, *configPath, !*execute)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
return err
|
return err
|
||||||
}
|
}
|
||||||
|
if *execute {
|
||||||
|
if cfg.Coordination.Role == "peer" && !*allowPeerStartup {
|
||||||
|
return fmt.Errorf("startup blocked: this instance is configured as role=peer (use --allow-peer-startup to override)")
|
||||||
|
}
|
||||||
|
if cfg.UPS.Enabled && !cfg.Coordination.AllowStartupOnBattery && !*allowOnBattery {
|
||||||
|
targets, targetErr := buildUPSTargets(cfg)
|
||||||
|
if targetErr != nil {
|
||||||
|
return targetErr
|
||||||
|
}
|
||||||
|
checkCtx, cancel := context.WithTimeout(context.Background(), 30*time.Second)
|
||||||
|
defer cancel()
|
||||||
|
if err := ensureStartupPowerSafe(checkCtx, targets); err != nil {
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
ctx, cancel := context.WithCancel(context.Background())
|
ctx, cancel := context.WithCancel(context.Background())
|
||||||
defer cancel()
|
defer cancel()
|
||||||
|
|
||||||
return orch.Startup(ctx, cluster.StartupOptions{
|
return orch.Startup(ctx, cluster.StartupOptions{
|
||||||
ForceFluxBranch: *forceBranch,
|
ForceFluxBranch: *forceBranch,
|
||||||
SkipLocalBootstrap: *skipLocalBootstrap,
|
SkipLocalBootstrap: *skipLocalBootstrap,
|
||||||
Reason: "manual-startup",
|
Reason: *reason,
|
||||||
})
|
})
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -115,34 +135,9 @@ func runDaemon(logger *log.Logger, args []string) error {
|
|||||||
if !cfg.UPS.Enabled {
|
if !cfg.UPS.Enabled {
|
||||||
return fmt.Errorf("UPS monitoring is disabled in config")
|
return fmt.Errorf("UPS monitoring is disabled in config")
|
||||||
}
|
}
|
||||||
targets := make([]service.Target, 0, len(cfg.UPS.Targets)+1)
|
targets, err := buildUPSTargets(cfg)
|
||||||
switch cfg.UPS.Provider {
|
if err != nil {
|
||||||
case "nut":
|
return err
|
||||||
if len(cfg.UPS.Targets) == 0 {
|
|
||||||
target := cfg.UPS.Target
|
|
||||||
if target == "" {
|
|
||||||
return fmt.Errorf("ups target must be set")
|
|
||||||
}
|
|
||||||
targets = append(targets, service.Target{
|
|
||||||
Name: "primary",
|
|
||||||
Target: target,
|
|
||||||
Provider: ups.NewNUTProvider(target),
|
|
||||||
})
|
|
||||||
} else {
|
|
||||||
for idx, t := range cfg.UPS.Targets {
|
|
||||||
name := t.Name
|
|
||||||
if name == "" {
|
|
||||||
name = fmt.Sprintf("target-%d", idx+1)
|
|
||||||
}
|
|
||||||
targets = append(targets, service.Target{
|
|
||||||
Name: name,
|
|
||||||
Target: t.Target,
|
|
||||||
Provider: ups.NewNUTProvider(t.Target),
|
|
||||||
})
|
|
||||||
}
|
|
||||||
}
|
|
||||||
default:
|
|
||||||
return fmt.Errorf("unsupported UPS provider: %s", cfg.UPS.Provider)
|
|
||||||
}
|
}
|
||||||
|
|
||||||
d := service.NewDaemon(cfg, orch, targets, logger)
|
d := service.NewDaemon(cfg, orch, targets, logger)
|
||||||
@ -175,10 +170,69 @@ func runStatus(logger *log.Logger, args []string) error {
|
|||||||
logger.Printf("expected_flux_branch=%s", cfg.ExpectedFluxBranch)
|
logger.Printf("expected_flux_branch=%s", cfg.ExpectedFluxBranch)
|
||||||
logger.Printf("control_planes=%v", cfg.ControlPlanes)
|
logger.Printf("control_planes=%v", cfg.ControlPlanes)
|
||||||
logger.Printf("estimated_shutdown_budget_seconds=%d", orch.EstimatedShutdownSeconds())
|
logger.Printf("estimated_shutdown_budget_seconds=%d", orch.EstimatedShutdownSeconds())
|
||||||
|
intent, intentErr := state.ReadIntent(cfg.State.IntentPath)
|
||||||
|
if intentErr != nil {
|
||||||
|
logger.Printf("intent_read_error=%v", intentErr)
|
||||||
|
} else if intent.State == "" {
|
||||||
|
logger.Printf("intent=none")
|
||||||
|
} else {
|
||||||
|
logger.Printf("intent=%s reason=%q source=%s updated_at=%s",
|
||||||
|
intent.State, intent.Reason, intent.Source, intent.UpdatedAt.Format(time.RFC3339))
|
||||||
|
}
|
||||||
logger.Printf("last_run=%s", last)
|
logger.Printf("last_run=%s", last)
|
||||||
return nil
|
return nil
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func buildUPSTargets(cfg config.Config) ([]service.Target, error) {
|
||||||
|
targets := make([]service.Target, 0, len(cfg.UPS.Targets)+1)
|
||||||
|
switch cfg.UPS.Provider {
|
||||||
|
case "nut":
|
||||||
|
if len(cfg.UPS.Targets) == 0 {
|
||||||
|
target := cfg.UPS.Target
|
||||||
|
if target == "" {
|
||||||
|
return nil, fmt.Errorf("ups target must be set")
|
||||||
|
}
|
||||||
|
targets = append(targets, service.Target{
|
||||||
|
Name: "primary",
|
||||||
|
Target: target,
|
||||||
|
Provider: ups.NewNUTProvider(target),
|
||||||
|
})
|
||||||
|
} else {
|
||||||
|
for idx, t := range cfg.UPS.Targets {
|
||||||
|
name := t.Name
|
||||||
|
if name == "" {
|
||||||
|
name = fmt.Sprintf("target-%d", idx+1)
|
||||||
|
}
|
||||||
|
targets = append(targets, service.Target{
|
||||||
|
Name: name,
|
||||||
|
Target: t.Target,
|
||||||
|
Provider: ups.NewNUTProvider(t.Target),
|
||||||
|
})
|
||||||
|
}
|
||||||
|
}
|
||||||
|
default:
|
||||||
|
return nil, fmt.Errorf("unsupported UPS provider: %s", cfg.UPS.Provider)
|
||||||
|
}
|
||||||
|
return targets, nil
|
||||||
|
}
|
||||||
|
|
||||||
|
func ensureStartupPowerSafe(ctx context.Context, targets []service.Target) error {
|
||||||
|
onBatteryTargets := []string{}
|
||||||
|
for _, t := range targets {
|
||||||
|
sample, err := t.Provider.Read(ctx)
|
||||||
|
if err != nil {
|
||||||
|
return fmt.Errorf("startup blocked: unable to verify UPS target %s (%s): %w", t.Name, t.Target, err)
|
||||||
|
}
|
||||||
|
if sample.OnBattery {
|
||||||
|
onBatteryTargets = append(onBatteryTargets, fmt.Sprintf("%s(status=%s runtime_s=%d)", t.Name, sample.RawStatus, sample.RuntimeSeconds))
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if len(onBatteryTargets) > 0 {
|
||||||
|
return fmt.Errorf("startup blocked: UPS is on battery for %s", strings.Join(onBatteryTargets, ", "))
|
||||||
|
}
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
|
||||||
func buildOrchestrator(logger *log.Logger, cfgPath string, dryRun bool) (config.Config, *cluster.Orchestrator, error) {
|
func buildOrchestrator(logger *log.Logger, cfgPath string, dryRun bool) (config.Config, *cluster.Orchestrator, error) {
|
||||||
cfg, err := config.Load(cfgPath)
|
cfg, err := config.Load(cfgPath)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
|
|||||||
@ -1,6 +1,13 @@
|
|||||||
# /etc/hecate/hecate.yaml
|
# /etc/hecate/hecate.yaml
|
||||||
kubeconfig: /home/atlas/.kube/config
|
kubeconfig: /etc/hecate/kubeconfig
|
||||||
ssh_user: atlas
|
ssh_user: atlas
|
||||||
|
ssh_port: 2277
|
||||||
|
ssh_identity_file: /home/atlas/.ssh/id_ed25519
|
||||||
|
ssh_node_hosts: {}
|
||||||
|
ssh_node_users: {}
|
||||||
|
ssh_managed_nodes: []
|
||||||
|
ssh_jump_host: ""
|
||||||
|
ssh_jump_user: ""
|
||||||
iac_repo_path: /opt/titan-iac
|
iac_repo_path: /opt/titan-iac
|
||||||
expected_flux_branch: main
|
expected_flux_branch: main
|
||||||
control_planes:
|
control_planes:
|
||||||
@ -10,7 +17,7 @@ control_planes:
|
|||||||
workers: []
|
workers: []
|
||||||
local_bootstrap_paths:
|
local_bootstrap_paths:
|
||||||
- infrastructure/core
|
- infrastructure/core
|
||||||
- infrastructure/flux-system
|
- clusters/atlas/flux-system
|
||||||
- infrastructure/sources/helm
|
- infrastructure/sources/helm
|
||||||
- infrastructure/metallb
|
- infrastructure/metallb
|
||||||
- infrastructure/traefik
|
- infrastructure/traefik
|
||||||
@ -31,6 +38,9 @@ excluded_namespaces:
|
|||||||
- vault
|
- vault
|
||||||
- postgres
|
- postgres
|
||||||
- maintenance
|
- maintenance
|
||||||
|
startup:
|
||||||
|
api_wait_seconds: 1200
|
||||||
|
api_poll_seconds: 2
|
||||||
shutdown:
|
shutdown:
|
||||||
default_budget_seconds: 300
|
default_budget_seconds: 300
|
||||||
skip_etcd_snapshot: false
|
skip_etcd_snapshot: false
|
||||||
@ -57,6 +67,8 @@ coordination:
|
|||||||
forward_shutdown_config: /etc/hecate/hecate.yaml
|
forward_shutdown_config: /etc/hecate/hecate.yaml
|
||||||
fallback_local_shutdown: true
|
fallback_local_shutdown: true
|
||||||
command_timeout_seconds: 25
|
command_timeout_seconds: 25
|
||||||
|
role: coordinator
|
||||||
|
allow_startup_on_battery: false
|
||||||
metrics:
|
metrics:
|
||||||
enabled: true
|
enabled: true
|
||||||
bind_addr: 0.0.0.0:9560
|
bind_addr: 0.0.0.0:9560
|
||||||
@ -65,3 +77,4 @@ state:
|
|||||||
dir: /var/lib/hecate
|
dir: /var/lib/hecate
|
||||||
run_history_path: /var/lib/hecate/runs.json
|
run_history_path: /var/lib/hecate/runs.json
|
||||||
lock_path: /var/lib/hecate/hecate.lock
|
lock_path: /var/lib/hecate/hecate.lock
|
||||||
|
intent_path: /var/lib/hecate/intent.json
|
||||||
|
|||||||
@ -1,6 +1,39 @@
|
|||||||
# /etc/hecate/hecate.yaml for titan-24 (tethys forwarder)
|
# /etc/hecate/hecate.yaml for titan-24 (tethys forwarder)
|
||||||
kubeconfig: /home/tethys/.kube/config
|
kubeconfig: /etc/hecate/kubeconfig
|
||||||
ssh_user: atlas
|
ssh_user: atlas
|
||||||
|
ssh_port: 2277
|
||||||
|
ssh_identity_file: /home/tethys/.ssh/id_ed25519
|
||||||
|
ssh_node_hosts:
|
||||||
|
titan-db: 192.168.22.7
|
||||||
|
titan-0a: 192.168.22.11
|
||||||
|
titan-0b: 192.168.22.12
|
||||||
|
titan-0c: 192.168.22.13
|
||||||
|
titan-04: 192.168.22.30
|
||||||
|
titan-05: 192.168.22.31
|
||||||
|
titan-06: 192.168.22.32
|
||||||
|
titan-07: 192.168.22.33
|
||||||
|
titan-08: 192.168.22.34
|
||||||
|
titan-09: 192.168.22.35
|
||||||
|
titan-10: 192.168.22.36
|
||||||
|
titan-11: 192.168.22.37
|
||||||
|
titan-12: 192.168.22.40
|
||||||
|
titan-13: 192.168.22.41
|
||||||
|
titan-14: 192.168.22.42
|
||||||
|
titan-15: 192.168.22.43
|
||||||
|
titan-17: 192.168.22.45
|
||||||
|
titan-18: 192.168.22.46
|
||||||
|
titan-19: 192.168.22.47
|
||||||
|
titan-20: 192.168.22.20
|
||||||
|
titan-21: 192.168.22.21
|
||||||
|
titan-22: 192.168.22.22
|
||||||
|
titan-24: 192.168.22.26
|
||||||
|
ssh_node_users:
|
||||||
|
titan-24: tethys
|
||||||
|
ssh_managed_nodes:
|
||||||
|
- titan-db
|
||||||
|
- titan-24
|
||||||
|
ssh_jump_host: ""
|
||||||
|
ssh_jump_user: ""
|
||||||
iac_repo_path: /opt/titan-iac
|
iac_repo_path: /opt/titan-iac
|
||||||
expected_flux_branch: main
|
expected_flux_branch: main
|
||||||
control_planes:
|
control_planes:
|
||||||
@ -22,6 +55,9 @@ excluded_namespaces:
|
|||||||
- vault
|
- vault
|
||||||
- postgres
|
- postgres
|
||||||
- maintenance
|
- maintenance
|
||||||
|
startup:
|
||||||
|
api_wait_seconds: 1200
|
||||||
|
api_poll_seconds: 2
|
||||||
shutdown:
|
shutdown:
|
||||||
default_budget_seconds: 300
|
default_budget_seconds: 300
|
||||||
skip_etcd_snapshot: false
|
skip_etcd_snapshot: false
|
||||||
@ -46,6 +82,8 @@ coordination:
|
|||||||
forward_shutdown_config: /etc/hecate/hecate.yaml
|
forward_shutdown_config: /etc/hecate/hecate.yaml
|
||||||
fallback_local_shutdown: false
|
fallback_local_shutdown: false
|
||||||
command_timeout_seconds: 25
|
command_timeout_seconds: 25
|
||||||
|
role: peer
|
||||||
|
allow_startup_on_battery: false
|
||||||
metrics:
|
metrics:
|
||||||
enabled: true
|
enabled: true
|
||||||
bind_addr: 0.0.0.0:9560
|
bind_addr: 0.0.0.0:9560
|
||||||
@ -54,3 +92,4 @@ state:
|
|||||||
dir: /var/lib/hecate
|
dir: /var/lib/hecate
|
||||||
run_history_path: /var/lib/hecate/runs.json
|
run_history_path: /var/lib/hecate/runs.json
|
||||||
lock_path: /var/lib/hecate/hecate.lock
|
lock_path: /var/lib/hecate/hecate.lock
|
||||||
|
intent_path: /var/lib/hecate/intent.json
|
||||||
|
|||||||
@ -1,6 +1,47 @@
|
|||||||
# /etc/hecate/hecate.yaml for titan-db (coordinator)
|
# /etc/hecate/hecate.yaml for titan-db (coordinator)
|
||||||
kubeconfig: /home/atlas/.kube/config
|
kubeconfig: /etc/hecate/kubeconfig
|
||||||
ssh_user: atlas
|
ssh_user: atlas
|
||||||
|
ssh_port: 2277
|
||||||
|
ssh_identity_file: /home/atlas/.ssh/id_ed25519
|
||||||
|
ssh_node_hosts:
|
||||||
|
titan-db: 192.168.22.7
|
||||||
|
titan-0a: 192.168.22.11
|
||||||
|
titan-0b: 192.168.22.12
|
||||||
|
titan-0c: 192.168.22.13
|
||||||
|
titan-04: 192.168.22.30
|
||||||
|
titan-05: 192.168.22.31
|
||||||
|
titan-06: 192.168.22.32
|
||||||
|
titan-07: 192.168.22.33
|
||||||
|
titan-08: 192.168.22.34
|
||||||
|
titan-09: 192.168.22.35
|
||||||
|
titan-10: 192.168.22.36
|
||||||
|
titan-11: 192.168.22.37
|
||||||
|
titan-12: 192.168.22.40
|
||||||
|
titan-13: 192.168.22.41
|
||||||
|
titan-14: 192.168.22.42
|
||||||
|
titan-15: 192.168.22.43
|
||||||
|
titan-17: 192.168.22.45
|
||||||
|
titan-18: 192.168.22.46
|
||||||
|
titan-19: 192.168.22.47
|
||||||
|
titan-20: 192.168.22.20
|
||||||
|
titan-21: 192.168.22.21
|
||||||
|
titan-22: 192.168.22.22
|
||||||
|
titan-24: 192.168.22.26
|
||||||
|
ssh_node_users:
|
||||||
|
titan-24: tethys
|
||||||
|
ssh_managed_nodes:
|
||||||
|
- titan-db
|
||||||
|
- titan-0a
|
||||||
|
- titan-0b
|
||||||
|
- titan-0c
|
||||||
|
- titan-12
|
||||||
|
- titan-14
|
||||||
|
- titan-15
|
||||||
|
- titan-17
|
||||||
|
- titan-18
|
||||||
|
- titan-22
|
||||||
|
ssh_jump_host: ""
|
||||||
|
ssh_jump_user: ""
|
||||||
iac_repo_path: /opt/titan-iac
|
iac_repo_path: /opt/titan-iac
|
||||||
expected_flux_branch: main
|
expected_flux_branch: main
|
||||||
control_planes:
|
control_planes:
|
||||||
@ -10,7 +51,7 @@ control_planes:
|
|||||||
workers: []
|
workers: []
|
||||||
local_bootstrap_paths:
|
local_bootstrap_paths:
|
||||||
- infrastructure/core
|
- infrastructure/core
|
||||||
- infrastructure/flux-system
|
- clusters/atlas/flux-system
|
||||||
- infrastructure/sources/helm
|
- infrastructure/sources/helm
|
||||||
- infrastructure/metallb
|
- infrastructure/metallb
|
||||||
- infrastructure/traefik
|
- infrastructure/traefik
|
||||||
@ -31,6 +72,9 @@ excluded_namespaces:
|
|||||||
- vault
|
- vault
|
||||||
- postgres
|
- postgres
|
||||||
- maintenance
|
- maintenance
|
||||||
|
startup:
|
||||||
|
api_wait_seconds: 1200
|
||||||
|
api_poll_seconds: 2
|
||||||
shutdown:
|
shutdown:
|
||||||
default_budget_seconds: 300
|
default_budget_seconds: 300
|
||||||
skip_etcd_snapshot: false
|
skip_etcd_snapshot: false
|
||||||
@ -56,6 +100,8 @@ coordination:
|
|||||||
forward_shutdown_config: /etc/hecate/hecate.yaml
|
forward_shutdown_config: /etc/hecate/hecate.yaml
|
||||||
fallback_local_shutdown: true
|
fallback_local_shutdown: true
|
||||||
command_timeout_seconds: 25
|
command_timeout_seconds: 25
|
||||||
|
role: coordinator
|
||||||
|
allow_startup_on_battery: false
|
||||||
metrics:
|
metrics:
|
||||||
enabled: true
|
enabled: true
|
||||||
bind_addr: 0.0.0.0:9560
|
bind_addr: 0.0.0.0:9560
|
||||||
@ -64,3 +110,4 @@ state:
|
|||||||
dir: /var/lib/hecate
|
dir: /var/lib/hecate
|
||||||
run_history_path: /var/lib/hecate/runs.json
|
run_history_path: /var/lib/hecate/runs.json
|
||||||
lock_path: /var/lib/hecate/hecate.lock
|
lock_path: /var/lib/hecate/hecate.lock
|
||||||
|
intent_path: /var/lib/hecate/intent.json
|
||||||
|
|||||||
@ -73,20 +73,47 @@ func (o *Orchestrator) Startup(ctx context.Context, opts StartupOptions) (err er
|
|||||||
}
|
}
|
||||||
defer o.finalizeRecord(&record, &err)
|
defer o.finalizeRecord(&record, &err)
|
||||||
|
|
||||||
|
if !o.runner.DryRun {
|
||||||
|
currentIntent, readErr := state.ReadIntent(o.cfg.State.IntentPath)
|
||||||
|
if readErr != nil {
|
||||||
|
return fmt.Errorf("read startup intent: %w", readErr)
|
||||||
|
}
|
||||||
|
if currentIntent.State == state.IntentShuttingDown {
|
||||||
|
return fmt.Errorf("startup blocked: shutdown intent is active (%s)", currentIntent.Reason)
|
||||||
|
}
|
||||||
|
if writeErr := state.MustWriteIntent(o.cfg.State.IntentPath, state.IntentStartupInProgress, opts.Reason, "startup"); writeErr != nil {
|
||||||
|
return fmt.Errorf("set startup intent: %w", writeErr)
|
||||||
|
}
|
||||||
|
defer func() {
|
||||||
|
if err == nil {
|
||||||
|
if writeErr := state.MustWriteIntent(o.cfg.State.IntentPath, state.IntentNormal, opts.Reason, "startup"); writeErr != nil {
|
||||||
|
o.log.Printf("warning: write startup completion intent failed: %v", writeErr)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}()
|
||||||
|
}
|
||||||
|
|
||||||
|
o.log.Printf("startup control-planes=%s", strings.Join(o.cfg.ControlPlanes, ","))
|
||||||
|
|
||||||
|
o.reportFluxSource(ctx, opts.ForceFluxBranch)
|
||||||
|
o.startControlPlanes(ctx, o.cfg.ControlPlanes)
|
||||||
|
|
||||||
|
apiPoll := time.Duration(o.cfg.Startup.APIPollSeconds) * time.Second
|
||||||
|
apiAttempts := o.cfg.Startup.APIWaitSeconds / o.cfg.Startup.APIPollSeconds
|
||||||
|
if apiAttempts < 1 {
|
||||||
|
apiAttempts = 1
|
||||||
|
}
|
||||||
|
if err := o.waitForAPI(ctx, apiAttempts, apiPoll); err != nil {
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
|
||||||
workers, err := o.effectiveWorkers(ctx)
|
workers, err := o.effectiveWorkers(ctx)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
return err
|
return err
|
||||||
}
|
}
|
||||||
o.log.Printf("startup control-planes=%s workers=%s", strings.Join(o.cfg.ControlPlanes, ","), strings.Join(workers, ","))
|
o.log.Printf("startup workers=%s", strings.Join(workers, ","))
|
||||||
|
|
||||||
o.reportFluxSource(ctx, opts.ForceFluxBranch)
|
|
||||||
o.startControlPlanes(ctx, o.cfg.ControlPlanes)
|
|
||||||
o.startWorkers(ctx, workers)
|
o.startWorkers(ctx, workers)
|
||||||
|
|
||||||
if err := o.waitForAPI(ctx, 120, 2*time.Second); err != nil {
|
|
||||||
return err
|
|
||||||
}
|
|
||||||
|
|
||||||
if opts.ForceFluxBranch != "" {
|
if opts.ForceFluxBranch != "" {
|
||||||
patch := fmt.Sprintf(`{"spec":{"ref":{"branch":"%s"}}}`, opts.ForceFluxBranch)
|
patch := fmt.Sprintf(`{"spec":{"ref":{"branch":"%s"}}}`, opts.ForceFluxBranch)
|
||||||
if _, err := o.kubectl(ctx, 20*time.Second, "-n", "flux-system", "patch", "gitrepository", "flux-system", "--type=merge", "-p", patch); err != nil {
|
if _, err := o.kubectl(ctx, 20*time.Second, "-n", "flux-system", "patch", "gitrepository", "flux-system", "--type=merge", "-p", patch); err != nil {
|
||||||
@ -167,6 +194,20 @@ func (o *Orchestrator) Shutdown(ctx context.Context, opts ShutdownOptions) (err
|
|||||||
StartedAt: time.Now().UTC(),
|
StartedAt: time.Now().UTC(),
|
||||||
}
|
}
|
||||||
defer o.finalizeRecord(&record, &err)
|
defer o.finalizeRecord(&record, &err)
|
||||||
|
if !o.runner.DryRun {
|
||||||
|
if writeErr := state.MustWriteIntent(o.cfg.State.IntentPath, state.IntentShuttingDown, opts.Reason, "shutdown"); writeErr != nil {
|
||||||
|
return fmt.Errorf("set shutdown intent: %w", writeErr)
|
||||||
|
}
|
||||||
|
defer func() {
|
||||||
|
final := state.IntentShuttingDown
|
||||||
|
if err == nil {
|
||||||
|
final = state.IntentShutdownComplete
|
||||||
|
}
|
||||||
|
if writeErr := state.MustWriteIntent(o.cfg.State.IntentPath, final, opts.Reason, "shutdown"); writeErr != nil {
|
||||||
|
o.log.Printf("warning: write shutdown completion intent failed: %v", writeErr)
|
||||||
|
}
|
||||||
|
}()
|
||||||
|
}
|
||||||
|
|
||||||
workers, err := o.effectiveWorkers(ctx)
|
workers, err := o.effectiveWorkers(ctx)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
@ -322,6 +363,10 @@ func (o *Orchestrator) drainWorkers(ctx context.Context, workers []string) error
|
|||||||
|
|
||||||
func (o *Orchestrator) stopWorkers(ctx context.Context, workers []string) {
|
func (o *Orchestrator) stopWorkers(ctx context.Context, workers []string) {
|
||||||
for _, n := range workers {
|
for _, n := range workers {
|
||||||
|
if !o.sshManaged(n) {
|
||||||
|
o.log.Printf("skip stop k3s-agent on %s: node not in ssh_managed_nodes", n)
|
||||||
|
continue
|
||||||
|
}
|
||||||
o.bestEffort("stop k3s-agent on "+n, func() error {
|
o.bestEffort("stop k3s-agent on "+n, func() error {
|
||||||
_, err := o.ssh(ctx, n, "sudo systemctl stop k3s-agent || true")
|
_, err := o.ssh(ctx, n, "sudo systemctl stop k3s-agent || true")
|
||||||
return err
|
return err
|
||||||
@ -331,6 +376,10 @@ func (o *Orchestrator) stopWorkers(ctx context.Context, workers []string) {
|
|||||||
|
|
||||||
func (o *Orchestrator) startWorkers(ctx context.Context, workers []string) {
|
func (o *Orchestrator) startWorkers(ctx context.Context, workers []string) {
|
||||||
for _, n := range workers {
|
for _, n := range workers {
|
||||||
|
if !o.sshManaged(n) {
|
||||||
|
o.log.Printf("skip start k3s-agent on %s: node not in ssh_managed_nodes", n)
|
||||||
|
continue
|
||||||
|
}
|
||||||
o.bestEffort("start k3s-agent on "+n, func() error {
|
o.bestEffort("start k3s-agent on "+n, func() error {
|
||||||
_, err := o.ssh(ctx, n, "sudo systemctl start k3s-agent || true")
|
_, err := o.ssh(ctx, n, "sudo systemctl start k3s-agent || true")
|
||||||
return err
|
return err
|
||||||
@ -340,6 +389,10 @@ func (o *Orchestrator) startWorkers(ctx context.Context, workers []string) {
|
|||||||
|
|
||||||
func (o *Orchestrator) stopControlPlanes(ctx context.Context, cps []string) {
|
func (o *Orchestrator) stopControlPlanes(ctx context.Context, cps []string) {
|
||||||
for _, n := range cps {
|
for _, n := range cps {
|
||||||
|
if !o.sshManaged(n) {
|
||||||
|
o.log.Printf("skip stop k3s on %s: node not in ssh_managed_nodes", n)
|
||||||
|
continue
|
||||||
|
}
|
||||||
o.bestEffort("stop k3s on "+n, func() error {
|
o.bestEffort("stop k3s on "+n, func() error {
|
||||||
_, err := o.ssh(ctx, n, "sudo systemctl stop k3s || true")
|
_, err := o.ssh(ctx, n, "sudo systemctl stop k3s || true")
|
||||||
return err
|
return err
|
||||||
@ -349,6 +402,10 @@ func (o *Orchestrator) stopControlPlanes(ctx context.Context, cps []string) {
|
|||||||
|
|
||||||
func (o *Orchestrator) startControlPlanes(ctx context.Context, cps []string) {
|
func (o *Orchestrator) startControlPlanes(ctx context.Context, cps []string) {
|
||||||
for _, n := range cps {
|
for _, n := range cps {
|
||||||
|
if !o.sshManaged(n) {
|
||||||
|
o.log.Printf("skip start k3s on %s: node not in ssh_managed_nodes", n)
|
||||||
|
continue
|
||||||
|
}
|
||||||
o.bestEffort("start k3s on "+n, func() error {
|
o.bestEffort("start k3s on "+n, func() error {
|
||||||
_, err := o.ssh(ctx, n, "sudo systemctl start k3s || true")
|
_, err := o.ssh(ctx, n, "sudo systemctl start k3s || true")
|
||||||
return err
|
return err
|
||||||
@ -357,6 +414,9 @@ func (o *Orchestrator) startControlPlanes(ctx context.Context, cps []string) {
|
|||||||
}
|
}
|
||||||
|
|
||||||
func (o *Orchestrator) takeEtcdSnapshot(ctx context.Context, node string) error {
|
func (o *Orchestrator) takeEtcdSnapshot(ctx context.Context, node string) error {
|
||||||
|
if !o.sshManaged(node) {
|
||||||
|
return fmt.Errorf("cannot run etcd snapshot on %s: node not in ssh_managed_nodes", node)
|
||||||
|
}
|
||||||
name := "pre-shutdown-" + time.Now().UTC().Format("20060102-150405")
|
name := "pre-shutdown-" + time.Now().UTC().Format("20060102-150405")
|
||||||
_, err := o.ssh(ctx, node, "sudo k3s etcd-snapshot save --name "+name)
|
_, err := o.ssh(ctx, node, "sudo k3s etcd-snapshot save --name "+name)
|
||||||
return err
|
return err
|
||||||
@ -404,11 +464,18 @@ func (o *Orchestrator) bootstrapLocal(ctx context.Context) error {
|
|||||||
for _, rel := range o.cfg.LocalBootstrapPaths {
|
for _, rel := range o.cfg.LocalBootstrapPaths {
|
||||||
full := filepath.Join(o.cfg.IACRepoPath, rel)
|
full := filepath.Join(o.cfg.IACRepoPath, rel)
|
||||||
o.log.Printf("local bootstrap apply -k %s", full)
|
o.log.Printf("local bootstrap apply -k %s", full)
|
||||||
if _, err := o.kubectl(ctx, 2*time.Minute, "apply", "-k", full); err != nil {
|
if o.runner.DryRun {
|
||||||
failures++
|
|
||||||
o.log.Printf("warning: local bootstrap apply failed at %s: %v", full, err)
|
|
||||||
continue
|
continue
|
||||||
}
|
}
|
||||||
|
if _, err := o.kubectl(ctx, 2*time.Minute, "apply", "-k", full); err != nil {
|
||||||
|
o.log.Printf("warning: local bootstrap apply failed at %s: %v", full, err)
|
||||||
|
o.log.Printf("local bootstrap fallback render/apply with LoadRestrictionsNone for %s", full)
|
||||||
|
if fallbackErr := o.applyKustomizeFallback(ctx, full); fallbackErr != nil {
|
||||||
|
failures++
|
||||||
|
o.log.Printf("warning: local bootstrap fallback failed at %s: %v", full, fallbackErr)
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
}
|
||||||
}
|
}
|
||||||
if failures == len(o.cfg.LocalBootstrapPaths) {
|
if failures == len(o.cfg.LocalBootstrapPaths) {
|
||||||
return fmt.Errorf("local bootstrap apply failed for every configured path (%d total)", failures)
|
return fmt.Errorf("local bootstrap apply failed for every configured path (%d total)", failures)
|
||||||
@ -416,6 +483,14 @@ func (o *Orchestrator) bootstrapLocal(ctx context.Context) error {
|
|||||||
return nil
|
return nil
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func (o *Orchestrator) applyKustomizeFallback(ctx context.Context, full string) error {
|
||||||
|
cmd := fmt.Sprintf("kubectl kustomize --load-restrictor=LoadRestrictionsNone %q | kubectl apply -f -", full)
|
||||||
|
if _, err := o.runSensitive(ctx, 3*time.Minute, "sh", "-lc", cmd); err != nil {
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
|
||||||
func (o *Orchestrator) waitForFluxSourceReady(ctx context.Context, window time.Duration) (bool, error) {
|
func (o *Orchestrator) waitForFluxSourceReady(ctx context.Context, window time.Duration) (bool, error) {
|
||||||
if o.runner.DryRun {
|
if o.runner.DryRun {
|
||||||
return true, nil
|
return true, nil
|
||||||
@ -485,11 +560,66 @@ func (o *Orchestrator) kubectl(ctx context.Context, timeout time.Duration, args
|
|||||||
}
|
}
|
||||||
|
|
||||||
func (o *Orchestrator) ssh(ctx context.Context, node string, command string) (string, error) {
|
func (o *Orchestrator) ssh(ctx context.Context, node string, command string) (string, error) {
|
||||||
target := node
|
host := node
|
||||||
if o.cfg.SSHUser != "" {
|
if mapped, ok := o.cfg.SSHNodeHosts[node]; ok && strings.TrimSpace(mapped) != "" {
|
||||||
target = o.cfg.SSHUser + "@" + node
|
host = strings.TrimSpace(mapped)
|
||||||
}
|
}
|
||||||
return o.run(ctx, 45*time.Second, "ssh", "-o", "BatchMode=yes", "-o", "ConnectTimeout=8", target, command)
|
sshUser := o.cfg.SSHUser
|
||||||
|
if override, ok := o.cfg.SSHNodeUsers[node]; ok && strings.TrimSpace(override) != "" {
|
||||||
|
sshUser = strings.TrimSpace(override)
|
||||||
|
}
|
||||||
|
target := host
|
||||||
|
if sshUser != "" {
|
||||||
|
target = sshUser + "@" + host
|
||||||
|
}
|
||||||
|
baseArgs := []string{
|
||||||
|
"-o", "BatchMode=yes",
|
||||||
|
"-o", "ConnectTimeout=8",
|
||||||
|
"-o", "StrictHostKeyChecking=accept-new",
|
||||||
|
}
|
||||||
|
if o.cfg.SSHIdentityFile != "" {
|
||||||
|
baseArgs = append(baseArgs, "-i", o.cfg.SSHIdentityFile)
|
||||||
|
}
|
||||||
|
if o.cfg.SSHPort > 0 {
|
||||||
|
baseArgs = append(baseArgs, "-p", strconv.Itoa(o.cfg.SSHPort))
|
||||||
|
}
|
||||||
|
attempts := make([][]string, 0, 2)
|
||||||
|
attemptNames := make([]string, 0, 2)
|
||||||
|
if o.cfg.SSHJumpHost != "" {
|
||||||
|
jump := o.cfg.SSHJumpHost
|
||||||
|
if o.cfg.SSHJumpUser != "" {
|
||||||
|
jump = o.cfg.SSHJumpUser + "@" + jump
|
||||||
|
}
|
||||||
|
if o.cfg.SSHPort > 0 && !strings.Contains(jump, ":") {
|
||||||
|
jump = fmt.Sprintf("%s:%d", jump, o.cfg.SSHPort)
|
||||||
|
}
|
||||||
|
withJump := append([]string{}, baseArgs...)
|
||||||
|
withJump = append(withJump, "-J", jump, target, command)
|
||||||
|
attempts = append(attempts, withJump)
|
||||||
|
attemptNames = append(attemptNames, "jump")
|
||||||
|
}
|
||||||
|
direct := append([]string{}, baseArgs...)
|
||||||
|
direct = append(direct, target, command)
|
||||||
|
attempts = append(attempts, direct)
|
||||||
|
attemptNames = append(attemptNames, "direct")
|
||||||
|
|
||||||
|
var lastOut string
|
||||||
|
var lastErr error
|
||||||
|
for i, args := range attempts {
|
||||||
|
out, err := o.run(ctx, 45*time.Second, "ssh", args...)
|
||||||
|
if err == nil {
|
||||||
|
if i > 0 {
|
||||||
|
o.log.Printf("warning: ssh %s path failed for %s, using %s path", attemptNames[i-1], node, attemptNames[i])
|
||||||
|
}
|
||||||
|
return out, nil
|
||||||
|
}
|
||||||
|
lastOut = out
|
||||||
|
lastErr = err
|
||||||
|
if i < len(attempts)-1 {
|
||||||
|
o.log.Printf("warning: ssh %s path failed for %s: %v; trying %s path", attemptNames[i], node, err, attemptNames[i+1])
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return lastOut, lastErr
|
||||||
}
|
}
|
||||||
|
|
||||||
func (o *Orchestrator) run(ctx context.Context, timeout time.Duration, name string, args ...string) (string, error) {
|
func (o *Orchestrator) run(ctx context.Context, timeout time.Duration, name string, args ...string) (string, error) {
|
||||||
@ -534,6 +664,18 @@ func lines(in string) []string {
|
|||||||
return out
|
return out
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// sshManaged reports whether Hecate may run host-level SSH actions against
// node. An empty ssh_managed_nodes allowlist means every node is managed;
// otherwise node must match one of the listed entries (each entry is
// trimmed of surrounding whitespace before comparison).
func (o *Orchestrator) sshManaged(node string) bool {
	if len(o.cfg.SSHManagedNodes) == 0 {
		return true
	}
	for _, allowed := range o.cfg.SSHManagedNodes {
		if strings.TrimSpace(allowed) == node {
			return true
		}
	}
	return false
}
|
||||||
|
|
||||||
func (o *Orchestrator) bestEffort(name string, fn func() error) {
|
func (o *Orchestrator) bestEffort(name string, fn func() error) {
|
||||||
if err := fn(); err != nil {
|
if err := fn(); err != nil {
|
||||||
o.log.Printf("warning: %s: %v", name, err)
|
o.log.Printf("warning: %s: %v", name, err)
|
||||||
|
|||||||
@ -8,19 +8,32 @@ import (
|
|||||||
)
|
)
|
||||||
|
|
||||||
// Config is the root hecate configuration, loaded from YAML and then
// normalized by applyDefaults.
type Config struct {
	// Kubeconfig is the kubeconfig path used for cluster API access.
	Kubeconfig string `yaml:"kubeconfig"`
	// SSHUser is the default username for node SSH; per-node overrides
	// live in SSHNodeUsers.
	SSHUser string `yaml:"ssh_user"`
	// SSHPort is the sshd port used for node and jump connections.
	SSHPort int `yaml:"ssh_port"`
	// SSHIdentityFile, when set, is passed to ssh via -i.
	SSHIdentityFile string `yaml:"ssh_identity_file"`
	// SSHNodeHosts maps a node name to the address actually dialed.
	SSHNodeHosts map[string]string `yaml:"ssh_node_hosts"`
	// SSHNodeUsers maps a node name to a username override.
	SSHNodeUsers map[string]string `yaml:"ssh_node_users"`
	// SSHManagedNodes, when non-empty, limits host-level SSH actions to
	// the listed nodes (see Orchestrator.sshManaged).
	SSHManagedNodes []string `yaml:"ssh_managed_nodes"`
	// SSHJumpHost, when set, routes node SSH through this bastion (-J),
	// with a direct fallback if the jump path fails.
	SSHJumpHost string `yaml:"ssh_jump_host"`
	// SSHJumpUser is the username used on the jump host.
	SSHJumpUser string `yaml:"ssh_jump_user"`
	// IACRepoPath is the local checkout of the infrastructure-as-code repo.
	IACRepoPath string `yaml:"iac_repo_path"`
	// ExpectedFluxBranch is the git branch Flux is expected to track.
	ExpectedFluxBranch string `yaml:"expected_flux_branch"`
	// ControlPlanes and Workers name the cluster nodes by role.
	ControlPlanes []string `yaml:"control_planes"`
	Workers       []string `yaml:"workers"`
	// LocalBootstrapPaths are repo-relative paths; presumably applied
	// locally during bootstrap — confirm against the startup flow.
	LocalBootstrapPaths []string `yaml:"local_bootstrap_paths"`
	// ExcludedNamespaces lists namespaces exempt from orchestration
	// actions (assumption from the name — verify against callers).
	ExcludedNamespaces []string `yaml:"excluded_namespaces"`

	Startup      Startup      `yaml:"startup"`
	Shutdown     Shutdown     `yaml:"shutdown"`
	UPS          UPS          `yaml:"ups"`
	Coordination Coordination `yaml:"coordination"`
	Metrics      Metrics      `yaml:"metrics"`
	State        State        `yaml:"state"`
}
||||||
|
type Startup struct {
|
||||||
|
APIWaitSeconds int `yaml:"api_wait_seconds"`
|
||||||
|
APIPollSeconds int `yaml:"api_poll_seconds"`
|
||||||
}
|
}
|
||||||
|
|
||||||
type Shutdown struct {
|
type Shutdown struct {
|
||||||
@ -55,6 +68,8 @@ type Coordination struct {
|
|||||||
ForwardShutdownConfig string `yaml:"forward_shutdown_config"`
|
ForwardShutdownConfig string `yaml:"forward_shutdown_config"`
|
||||||
FallbackLocalShutdown bool `yaml:"fallback_local_shutdown"`
|
FallbackLocalShutdown bool `yaml:"fallback_local_shutdown"`
|
||||||
CommandTimeoutSeconds int `yaml:"command_timeout_seconds"`
|
CommandTimeoutSeconds int `yaml:"command_timeout_seconds"`
|
||||||
|
Role string `yaml:"role"`
|
||||||
|
AllowStartupOnBattery bool `yaml:"allow_startup_on_battery"`
|
||||||
}
|
}
|
||||||
|
|
||||||
type Metrics struct {
|
type Metrics struct {
|
||||||
@ -67,6 +82,7 @@ type State struct {
|
|||||||
Dir string `yaml:"dir"`
|
Dir string `yaml:"dir"`
|
||||||
RunHistoryPath string `yaml:"run_history_path"`
|
RunHistoryPath string `yaml:"run_history_path"`
|
||||||
LockPath string `yaml:"lock_path"`
|
LockPath string `yaml:"lock_path"`
|
||||||
|
IntentPath string `yaml:"intent_path"`
|
||||||
}
|
}
|
||||||
|
|
||||||
func Load(path string) (Config, error) {
|
func Load(path string) (Config, error) {
|
||||||
@ -100,6 +116,15 @@ func (c Config) Validate() error {
|
|||||||
if c.Shutdown.DefaultBudgetSeconds <= 0 {
|
if c.Shutdown.DefaultBudgetSeconds <= 0 {
|
||||||
return fmt.Errorf("config.shutdown.default_budget_seconds must be > 0")
|
return fmt.Errorf("config.shutdown.default_budget_seconds must be > 0")
|
||||||
}
|
}
|
||||||
|
if c.Startup.APIWaitSeconds <= 0 {
|
||||||
|
return fmt.Errorf("config.startup.api_wait_seconds must be > 0")
|
||||||
|
}
|
||||||
|
if c.Startup.APIPollSeconds <= 0 {
|
||||||
|
return fmt.Errorf("config.startup.api_poll_seconds must be > 0")
|
||||||
|
}
|
||||||
|
if c.SSHPort <= 0 || c.SSHPort > 65535 {
|
||||||
|
return fmt.Errorf("config.ssh_port must be in range 1-65535")
|
||||||
|
}
|
||||||
if c.UPS.Enabled {
|
if c.UPS.Enabled {
|
||||||
if c.UPS.Provider == "" {
|
if c.UPS.Provider == "" {
|
||||||
return fmt.Errorf("config.ups.provider must not be empty when ups is enabled")
|
return fmt.Errorf("config.ups.provider must not be empty when ups is enabled")
|
||||||
@ -118,9 +143,15 @@ func (c Config) Validate() error {
|
|||||||
return fmt.Errorf("config.coordination.forward_shutdown_config must not be empty when forward_shutdown_host is set")
|
return fmt.Errorf("config.coordination.forward_shutdown_config must not be empty when forward_shutdown_host is set")
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
if c.Coordination.Role != "coordinator" && c.Coordination.Role != "peer" {
|
||||||
|
return fmt.Errorf("config.coordination.role must be coordinator or peer")
|
||||||
|
}
|
||||||
if c.State.RunHistoryPath == "" || c.State.LockPath == "" {
|
if c.State.RunHistoryPath == "" || c.State.LockPath == "" {
|
||||||
return fmt.Errorf("config.state.run_history_path and config.state.lock_path must not be empty")
|
return fmt.Errorf("config.state.run_history_path and config.state.lock_path must not be empty")
|
||||||
}
|
}
|
||||||
|
if c.State.IntentPath == "" {
|
||||||
|
return fmt.Errorf("config.state.intent_path must not be empty")
|
||||||
|
}
|
||||||
return nil
|
return nil
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -128,10 +159,11 @@ func defaults() Config {
|
|||||||
c := Config{
|
c := Config{
|
||||||
IACRepoPath: "/opt/titan-iac",
|
IACRepoPath: "/opt/titan-iac",
|
||||||
ExpectedFluxBranch: "main",
|
ExpectedFluxBranch: "main",
|
||||||
|
SSHPort: 2277,
|
||||||
ControlPlanes: []string{"titan-0a", "titan-0b", "titan-0c"},
|
ControlPlanes: []string{"titan-0a", "titan-0b", "titan-0c"},
|
||||||
LocalBootstrapPaths: []string{
|
LocalBootstrapPaths: []string{
|
||||||
"infrastructure/core",
|
"infrastructure/core",
|
||||||
"infrastructure/flux-system",
|
"clusters/atlas/flux-system",
|
||||||
"infrastructure/sources/helm",
|
"infrastructure/sources/helm",
|
||||||
"infrastructure/metallb",
|
"infrastructure/metallb",
|
||||||
"infrastructure/traefik",
|
"infrastructure/traefik",
|
||||||
@ -154,6 +186,10 @@ func defaults() Config {
|
|||||||
"postgres",
|
"postgres",
|
||||||
"maintenance",
|
"maintenance",
|
||||||
},
|
},
|
||||||
|
Startup: Startup{
|
||||||
|
APIWaitSeconds: 1200,
|
||||||
|
APIPollSeconds: 2,
|
||||||
|
},
|
||||||
Shutdown: Shutdown{
|
Shutdown: Shutdown{
|
||||||
DefaultBudgetSeconds: 300,
|
DefaultBudgetSeconds: 300,
|
||||||
PoweroffEnabled: true,
|
PoweroffEnabled: true,
|
||||||
@ -172,6 +208,8 @@ func defaults() Config {
|
|||||||
ForwardShutdownConfig: "/etc/hecate/hecate.yaml",
|
ForwardShutdownConfig: "/etc/hecate/hecate.yaml",
|
||||||
FallbackLocalShutdown: true,
|
FallbackLocalShutdown: true,
|
||||||
CommandTimeoutSeconds: 25,
|
CommandTimeoutSeconds: 25,
|
||||||
|
Role: "coordinator",
|
||||||
|
AllowStartupOnBattery: false,
|
||||||
},
|
},
|
||||||
Metrics: Metrics{
|
Metrics: Metrics{
|
||||||
Enabled: true,
|
Enabled: true,
|
||||||
@ -182,6 +220,7 @@ func defaults() Config {
|
|||||||
Dir: "/var/lib/hecate",
|
Dir: "/var/lib/hecate",
|
||||||
RunHistoryPath: "/var/lib/hecate/runs.json",
|
RunHistoryPath: "/var/lib/hecate/runs.json",
|
||||||
LockPath: "/var/lib/hecate/hecate.lock",
|
LockPath: "/var/lib/hecate/hecate.lock",
|
||||||
|
IntentPath: "/var/lib/hecate/intent.json",
|
||||||
},
|
},
|
||||||
}
|
}
|
||||||
c.applyDefaults()
|
c.applyDefaults()
|
||||||
@ -195,6 +234,15 @@ func (c *Config) applyDefaults() {
|
|||||||
if c.IACRepoPath == "" {
|
if c.IACRepoPath == "" {
|
||||||
c.IACRepoPath = "/opt/titan-iac"
|
c.IACRepoPath = "/opt/titan-iac"
|
||||||
}
|
}
|
||||||
|
if c.Startup.APIWaitSeconds <= 0 {
|
||||||
|
c.Startup.APIWaitSeconds = 1200
|
||||||
|
}
|
||||||
|
if c.Startup.APIPollSeconds <= 0 {
|
||||||
|
c.Startup.APIPollSeconds = 2
|
||||||
|
}
|
||||||
|
if c.SSHPort <= 0 {
|
||||||
|
c.SSHPort = 2277
|
||||||
|
}
|
||||||
if c.Shutdown.DefaultBudgetSeconds <= 0 {
|
if c.Shutdown.DefaultBudgetSeconds <= 0 {
|
||||||
c.Shutdown.DefaultBudgetSeconds = 300
|
c.Shutdown.DefaultBudgetSeconds = 300
|
||||||
}
|
}
|
||||||
@ -219,6 +267,9 @@ func (c *Config) applyDefaults() {
|
|||||||
if c.Coordination.CommandTimeoutSeconds <= 0 {
|
if c.Coordination.CommandTimeoutSeconds <= 0 {
|
||||||
c.Coordination.CommandTimeoutSeconds = 25
|
c.Coordination.CommandTimeoutSeconds = 25
|
||||||
}
|
}
|
||||||
|
if c.Coordination.Role == "" {
|
||||||
|
c.Coordination.Role = "coordinator"
|
||||||
|
}
|
||||||
if c.Metrics.BindAddr == "" {
|
if c.Metrics.BindAddr == "" {
|
||||||
c.Metrics.BindAddr = "0.0.0.0:9560"
|
c.Metrics.BindAddr = "0.0.0.0:9560"
|
||||||
}
|
}
|
||||||
@ -234,4 +285,7 @@ func (c *Config) applyDefaults() {
|
|||||||
if c.State.LockPath == "" {
|
if c.State.LockPath == "" {
|
||||||
c.State.LockPath = "/var/lib/hecate/hecate.lock"
|
c.State.LockPath = "/var/lib/hecate/hecate.lock"
|
||||||
}
|
}
|
||||||
|
if c.State.IntentPath == "" {
|
||||||
|
c.State.IntentPath = "/var/lib/hecate/intent.json"
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|||||||
@ -47,3 +47,11 @@ func TestValidateForwardShutdownRequiresConfigPath(t *testing.T) {
|
|||||||
t.Fatalf("expected validation error for missing forward_shutdown_config")
|
t.Fatalf("expected validation error for missing forward_shutdown_config")
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// TestValidateRejectsUnknownRole verifies that Validate fails when
// coordination.role is neither "coordinator" nor "peer".
func TestValidateRejectsUnknownRole(t *testing.T) {
	cfg := defaults()
	cfg.Coordination.Role = "unknown"
	if err := cfg.Validate(); err == nil {
		t.Fatalf("expected validation error for unknown coordination role")
	}
}
|
||||||
|
|||||||
@ -7,12 +7,14 @@ import (
|
|||||||
"math"
|
"math"
|
||||||
"net/http"
|
"net/http"
|
||||||
"os/exec"
|
"os/exec"
|
||||||
|
"strconv"
|
||||||
"strings"
|
"strings"
|
||||||
"time"
|
"time"
|
||||||
|
|
||||||
"scm.bstein.dev/bstein/hecate/internal/cluster"
|
"scm.bstein.dev/bstein/hecate/internal/cluster"
|
||||||
"scm.bstein.dev/bstein/hecate/internal/config"
|
"scm.bstein.dev/bstein/hecate/internal/config"
|
||||||
"scm.bstein.dev/bstein/hecate/internal/metrics"
|
"scm.bstein.dev/bstein/hecate/internal/metrics"
|
||||||
|
"scm.bstein.dev/bstein/hecate/internal/state"
|
||||||
"scm.bstein.dev/bstein/hecate/internal/ups"
|
"scm.bstein.dev/bstein/hecate/internal/ups"
|
||||||
)
|
)
|
||||||
|
|
||||||
@ -149,10 +151,22 @@ func (d *Daemon) Run(ctx context.Context) error {
|
|||||||
}
|
}
|
||||||
|
|
||||||
func (d *Daemon) triggerShutdown(ctx context.Context, reason string) error {
|
func (d *Daemon) triggerShutdown(ctx context.Context, reason string) error {
|
||||||
|
intent, err := state.ReadIntent(d.cfg.State.IntentPath)
|
||||||
|
if err == nil && intent.State == state.IntentShuttingDown {
|
||||||
|
d.log.Printf("shutdown already in progress; skipping duplicate trigger: %s", reason)
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
if err := state.MustWriteIntent(d.cfg.State.IntentPath, state.IntentShuttingDown, reason, "daemon"); err != nil {
|
||||||
|
d.log.Printf("warning: unable to persist shutdown intent before trigger: %v", err)
|
||||||
|
}
|
||||||
|
|
||||||
d.log.Printf("triggering shutdown: %s", reason)
|
d.log.Printf("triggering shutdown: %s", reason)
|
||||||
d.exporter.MarkShutdown(reason)
|
d.exporter.MarkShutdown(reason)
|
||||||
if d.cfg.Coordination.ForwardShutdownHost != "" {
|
if d.cfg.Coordination.ForwardShutdownHost != "" {
|
||||||
if err := d.forwardShutdown(ctx, reason); err == nil {
|
if err := d.forwardShutdown(ctx, reason); err == nil {
|
||||||
|
if setErr := state.MustWriteIntent(d.cfg.State.IntentPath, state.IntentShutdownComplete, reason, "daemon-forwarded"); setErr != nil {
|
||||||
|
d.log.Printf("warning: unable to persist forwarded shutdown completion intent: %v", setErr)
|
||||||
|
}
|
||||||
d.log.Printf("shutdown trigger forwarded to %s", d.cfg.Coordination.ForwardShutdownHost)
|
d.log.Printf("shutdown trigger forwarded to %s", d.cfg.Coordination.ForwardShutdownHost)
|
||||||
return nil
|
return nil
|
||||||
} else if !d.cfg.Coordination.FallbackLocalShutdown {
|
} else if !d.cfg.Coordination.FallbackLocalShutdown {
|
||||||
@ -161,14 +175,16 @@ func (d *Daemon) triggerShutdown(ctx context.Context, reason string) error {
|
|||||||
d.log.Printf("warning: forward shutdown failed; falling back to local shutdown: %v", err)
|
d.log.Printf("warning: forward shutdown failed; falling back to local shutdown: %v", err)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
return d.orch.Shutdown(ctx, cluster.ShutdownOptions{Reason: reason})
|
if err := d.orch.Shutdown(ctx, cluster.ShutdownOptions{Reason: reason}); err != nil {
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
if setErr := state.MustWriteIntent(d.cfg.State.IntentPath, state.IntentShutdownComplete, reason, "daemon-local"); setErr != nil {
|
||||||
|
d.log.Printf("warning: unable to persist local shutdown completion intent: %v", setErr)
|
||||||
|
}
|
||||||
|
return nil
|
||||||
}
|
}
|
||||||
|
|
||||||
func (d *Daemon) forwardShutdown(ctx context.Context, reason string) error {
|
func (d *Daemon) forwardShutdown(ctx context.Context, reason string) error {
|
||||||
userHost := d.cfg.Coordination.ForwardShutdownHost
|
|
||||||
if d.cfg.Coordination.ForwardShutdownUser != "" {
|
|
||||||
userHost = d.cfg.Coordination.ForwardShutdownUser + "@" + userHost
|
|
||||||
}
|
|
||||||
timeout := time.Duration(d.cfg.Coordination.CommandTimeoutSeconds) * time.Second
|
timeout := time.Duration(d.cfg.Coordination.CommandTimeoutSeconds) * time.Second
|
||||||
if timeout <= 0 {
|
if timeout <= 0 {
|
||||||
timeout = 25 * time.Second
|
timeout = 25 * time.Second
|
||||||
@ -181,7 +197,46 @@ func (d *Daemon) forwardShutdown(ctx context.Context, reason string) error {
|
|||||||
d.cfg.Coordination.ForwardShutdownConfig,
|
d.cfg.Coordination.ForwardShutdownConfig,
|
||||||
reason,
|
reason,
|
||||||
)
|
)
|
||||||
cmd := exec.CommandContext(runCtx, "ssh", "-o", "BatchMode=yes", "-o", "ConnectTimeout=8", userHost, remoteCmd)
|
host := d.cfg.Coordination.ForwardShutdownHost
|
||||||
|
if mapped, ok := d.cfg.SSHNodeHosts[host]; ok && strings.TrimSpace(mapped) != "" {
|
||||||
|
host = strings.TrimSpace(mapped)
|
||||||
|
}
|
||||||
|
user := d.cfg.Coordination.ForwardShutdownUser
|
||||||
|
if user == "" {
|
||||||
|
if override, ok := d.cfg.SSHNodeUsers[d.cfg.Coordination.ForwardShutdownHost]; ok && strings.TrimSpace(override) != "" {
|
||||||
|
user = strings.TrimSpace(override)
|
||||||
|
} else {
|
||||||
|
user = d.cfg.SSHUser
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
target := host
|
||||||
|
if user != "" {
|
||||||
|
target = user + "@" + host
|
||||||
|
}
|
||||||
|
args := []string{
|
||||||
|
"-o", "BatchMode=yes",
|
||||||
|
"-o", "ConnectTimeout=8",
|
||||||
|
"-o", "StrictHostKeyChecking=accept-new",
|
||||||
|
}
|
||||||
|
if d.cfg.SSHIdentityFile != "" {
|
||||||
|
args = append(args, "-i", d.cfg.SSHIdentityFile)
|
||||||
|
}
|
||||||
|
if d.cfg.SSHPort > 0 {
|
||||||
|
args = append(args, "-p", strconv.Itoa(d.cfg.SSHPort))
|
||||||
|
}
|
||||||
|
if d.cfg.SSHJumpHost != "" {
|
||||||
|
jump := d.cfg.SSHJumpHost
|
||||||
|
if d.cfg.SSHJumpUser != "" {
|
||||||
|
jump = d.cfg.SSHJumpUser + "@" + jump
|
||||||
|
}
|
||||||
|
if d.cfg.SSHPort > 0 && !strings.Contains(jump, ":") {
|
||||||
|
jump = fmt.Sprintf("%s:%d", jump, d.cfg.SSHPort)
|
||||||
|
}
|
||||||
|
args = append(args, "-J", jump)
|
||||||
|
}
|
||||||
|
args = append(args, target, remoteCmd)
|
||||||
|
cmd := exec.CommandContext(runCtx, "ssh", args...)
|
||||||
out, err := cmd.CombinedOutput()
|
out, err := cmd.CombinedOutput()
|
||||||
if err != nil {
|
if err != nil {
|
||||||
trimmed := strings.TrimSpace(string(out))
|
trimmed := strings.TrimSpace(string(out))
|
||||||
|
|||||||
69
internal/state/intent.go
Normal file
69
internal/state/intent.go
Normal file
@ -0,0 +1,69 @@
|
|||||||
|
package state
|
||||||
|
|
||||||
|
import (
|
||||||
|
"encoding/json"
|
||||||
|
"fmt"
|
||||||
|
"os"
|
||||||
|
"path/filepath"
|
||||||
|
"time"
|
||||||
|
)
|
||||||
|
|
||||||
|
// Intent lifecycle states persisted to the intent file. Startup guards read
// these to decide whether cluster bring-up is allowed.
const (
	IntentNormal            = "normal"
	IntentStartupInProgress = "startup_in_progress"
	IntentShuttingDown      = "shutting_down"
	IntentShutdownComplete  = "shutdown_complete"
)

// Intent records the most recent coordinated lifecycle decision, who made
// it, and why. It is serialized as JSON at the configured intent path.
type Intent struct {
	State     string    `json:"state"`
	Reason    string    `json:"reason,omitempty"`
	Source    string    `json:"source,omitempty"`
	UpdatedAt time.Time `json:"updated_at"`
}

// ReadIntent loads the intent file at path. A missing or empty file is not
// an error: both yield the zero Intent, which callers treat as "no intent".
func ReadIntent(path string) (Intent, error) {
	b, err := os.ReadFile(path)
	if err != nil {
		if os.IsNotExist(err) {
			return Intent{}, nil
		}
		return Intent{}, err
	}
	if len(b) == 0 {
		return Intent{}, nil
	}
	var in Intent
	if err := json.Unmarshal(b, &in); err != nil {
		return Intent{}, err
	}
	return in, nil
}

// WriteIntent persists in to path, stamping UpdatedAt with the current UTC
// time when unset. The JSON is written to a temporary sibling file and then
// renamed into place, so a crash or power loss mid-write can never leave a
// truncated intent file behind for ReadIntent to fail on — important for a
// file that gates startup decisions.
func WriteIntent(path string, in Intent) error {
	if in.UpdatedAt.IsZero() {
		in.UpdatedAt = time.Now().UTC()
	}
	dir := filepath.Dir(path)
	if err := os.MkdirAll(dir, 0o750); err != nil {
		return err
	}
	b, err := json.MarshalIndent(in, "", "  ")
	if err != nil {
		return err
	}
	tmp, err := os.CreateTemp(dir, ".intent-*.tmp")
	if err != nil {
		return err
	}
	tmpName := tmp.Name()
	// Best-effort cleanup of the scratch file; a no-op once the rename
	// below has succeeded.
	defer os.Remove(tmpName)
	if _, err := tmp.Write(b); err != nil {
		tmp.Close()
		return err
	}
	if err := tmp.Chmod(0o640); err != nil {
		tmp.Close()
		return err
	}
	if err := tmp.Close(); err != nil {
		return err
	}
	return os.Rename(tmpName, path)
}

// MustWriteIntent validates state against the known intent constants and
// writes it with the supplied reason and source. Despite the Must prefix it
// returns an error rather than panicking, so callers can degrade gracefully
// when the state directory is unavailable.
func MustWriteIntent(path string, state string, reason string, source string) error {
	switch state {
	case IntentNormal, IntentStartupInProgress, IntentShuttingDown, IntentShutdownComplete:
	default:
		return fmt.Errorf("invalid intent state: %s", state)
	}
	return WriteIntent(path, Intent{
		State:     state,
		Reason:    reason,
		Source:    source,
		UpdatedAt: time.Now().UTC(),
	})
}
|
||||||
30
internal/state/intent_test.go
Normal file
30
internal/state/intent_test.go
Normal file
@ -0,0 +1,30 @@
|
|||||||
|
package state
|
||||||
|
|
||||||
|
import (
|
||||||
|
"path/filepath"
|
||||||
|
"testing"
|
||||||
|
)
|
||||||
|
|
||||||
|
// TestWriteReadIntentRoundTrip checks that MustWriteIntent persists an
// intent which ReadIntent reproduces (state and source fields).
func TestWriteReadIntentRoundTrip(t *testing.T) {
	p := filepath.Join(t.TempDir(), "intent.json")
	if err := MustWriteIntent(p, IntentShuttingDown, "ups-threshold", "daemon"); err != nil {
		t.Fatalf("write intent: %v", err)
	}
	in, err := ReadIntent(p)
	if err != nil {
		t.Fatalf("read intent: %v", err)
	}
	if in.State != IntentShuttingDown {
		t.Fatalf("expected state %q, got %q", IntentShuttingDown, in.State)
	}
	if in.Source != "daemon" {
		t.Fatalf("expected source daemon, got %q", in.Source)
	}
}
|
|
||||||
|
// TestMustWriteIntentRejectsUnknownState checks that MustWriteIntent
// refuses a state outside the known intent constants.
func TestMustWriteIntentRejectsUnknownState(t *testing.T) {
	p := filepath.Join(t.TempDir(), "intent.json")
	if err := MustWriteIntent(p, "weird", "x", "y"); err == nil {
		t.Fatalf("expected invalid state error")
	}
}
|
||||||
@ -19,6 +19,7 @@ Drills:
|
|||||||
flux-gitea-deadlock Simulate flux-controller + gitea outage and require startup recovery.
|
flux-gitea-deadlock Simulate flux-controller + gitea outage and require startup recovery.
|
||||||
foundation-recovery Simulate vault/postgres/gitea outage and require layered restore.
|
foundation-recovery Simulate vault/postgres/gitea outage and require layered restore.
|
||||||
reconciliation-resume Simulate global Flux suspend + source-controller down and require resume.
|
reconciliation-resume Simulate global Flux suspend + source-controller down and require resume.
|
||||||
|
startup-intent-guard Assert startup is blocked when shutdown intent is active.
|
||||||
|
|
||||||
Notes:
|
Notes:
|
||||||
- Drills are intentionally disruptive and are not part of regular `make test`.
|
- Drills are intentionally disruptive and are not part of regular `make test`.
|
||||||
@ -73,7 +74,7 @@ wait_ready() {
|
|||||||
|
|
||||||
run_hecate_startup() {
|
run_hecate_startup() {
|
||||||
local reason="$1"
|
local reason="$1"
|
||||||
local cmd=(sudo "${HECATE_BIN}" startup --config "${HECATE_CONFIG}" --execute --force-flux-branch main)
|
local cmd=(sudo "${HECATE_BIN}" startup --config "${HECATE_CONFIG}" --execute --force-flux-branch main --reason "${reason}")
|
||||||
if [[ "${EXECUTE}" -eq 0 ]]; then
|
if [[ "${EXECUTE}" -eq 0 ]]; then
|
||||||
log "plan: ssh ${HECATE_COORDINATOR_HOST} '${cmd[*]}'"
|
log "plan: ssh ${HECATE_COORDINATOR_HOST} '${cmd[*]}'"
|
||||||
return 0
|
return 0
|
||||||
@ -272,6 +273,41 @@ run_drill_reconciliation_resume() {
|
|||||||
ROLLBACK_FLUX_SUSPEND=0
|
ROLLBACK_FLUX_SUSPEND=0
|
||||||
}
|
}
|
||||||
|
|
||||||
|
# Drill: inject a shutting_down intent on the coordinator, assert that
# `hecate startup` refuses to run, then restore whatever intent file was
# there before. Startup succeeding while the intent is active is a failure.
run_drill_startup_intent_guard() {
  local intent_path="/var/lib/hecate/intent.json"
  local backup_path="/tmp/hecate-intent-pre-drill.json"
  # Save any pre-existing intent file, then write a shutting_down intent.
  local inject_cmd="
if [ -f '${intent_path}' ]; then sudo cp '${intent_path}' '${backup_path}'; else sudo rm -f '${backup_path}'; fi
cat <<'JSON' | sudo tee '${intent_path}' >/dev/null
{\"state\":\"shutting_down\",\"reason\":\"drill-intent-guard\",\"source\":\"drill\",\"updated_at\":\"$(date -u +%Y-%m-%dT%H:%M:%SZ)\"}
JSON
"
  # Put the original intent back, or remove ours if none existed before.
  local restore_cmd="
if [ -f '${backup_path}' ]; then
  sudo mv '${backup_path}' '${intent_path}'
else
  sudo rm -f '${intent_path}'
fi
"
  local startup_cmd="sudo ${HECATE_BIN} startup --config ${HECATE_CONFIG} --execute --force-flux-branch main --reason drill-startup-intent-guard"

  # Plan mode: describe the steps without touching the coordinator.
  if [[ "${EXECUTE}" -eq 0 ]]; then
    log "plan: ssh ${HECATE_COORDINATOR_HOST} '<inject shutdown intent>'"
    log "plan: ssh ${HECATE_COORDINATOR_HOST} '${startup_cmd}' (expect failure)"
    log "plan: ssh ${HECATE_COORDINATOR_HOST} '<restore prior intent>'"
    log "pass: startup-intent-guard (plan mode)"
    return 0
  fi

  ssh "${HECATE_COORDINATOR_HOST}" "bash -lc ${inject_cmd@Q}"
  # Startup must FAIL while the shutdown intent is active; restore the
  # prior intent before reporting the guard breach.
  if ssh "${HECATE_COORDINATOR_HOST}" "bash -lc ${startup_cmd@Q}"; then
    ssh "${HECATE_COORDINATOR_HOST}" "bash -lc ${restore_cmd@Q}" || true
    die "startup-intent-guard failed: startup unexpectedly succeeded while shutdown intent was active"
  fi
  ssh "${HECATE_COORDINATOR_HOST}" "bash -lc ${restore_cmd@Q}"
  log "pass: startup-intent-guard"
}
|
||||||
|
|
||||||
main() {
|
main() {
|
||||||
need_cmd "${KUBECTL}"
|
need_cmd "${KUBECTL}"
|
||||||
need_cmd ssh
|
need_cmd ssh
|
||||||
@ -315,6 +351,9 @@ main() {
|
|||||||
reconciliation-resume)
|
reconciliation-resume)
|
||||||
run_drill_reconciliation_resume
|
run_drill_reconciliation_resume
|
||||||
;;
|
;;
|
||||||
|
startup-intent-guard)
|
||||||
|
run_drill_startup_intent_guard
|
||||||
|
;;
|
||||||
*)
|
*)
|
||||||
die "unknown drill: ${drill}"
|
die "unknown drill: ${drill}"
|
||||||
;;
|
;;
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user