hecate: add multi-ups coordination, poweroff, metrics, and declarative self-update install
This commit is contained in:
parent
fbdb2c269b
commit
27c7d119c0
23
README.md
23
README.md
@ -6,6 +6,8 @@ It runs on `titan-db` and handles:
|
||||
- Staged **startup** (including Flux/Gitea bootstrap deadlock fallback)
|
||||
- Graceful **shutdown**
|
||||
- UPS-driven automatic shutdown decisions based on discharge/runtime
|
||||
- Multi-UPS operation via multiple Hecate instances (for example `titan-db` + `tethys`)
|
||||
- Full hardware poweroff sequencing after graceful Kubernetes shutdown
|
||||
|
||||
## Why host-level
|
||||
|
||||
@ -24,11 +26,17 @@ Hecate runs outside the cluster under systemd, so it can always orchestrate brin
|
||||
```bash
|
||||
git clone git@gitea-admin:bstein/hecate.git
|
||||
cd hecate
|
||||
sudo ./scripts/install.sh
|
||||
sudo HECATE_ENABLE_BOOTSTRAP=1 ./scripts/install.sh
|
||||
sudoedit /etc/hecate/hecate.yaml
|
||||
sudo systemctl restart hecate.service
|
||||
```
|
||||
|
||||
The installer is idempotent:
|
||||
- Re-runs safely on every update
|
||||
- Preserves existing `/etc/hecate/hecate.yaml`
|
||||
- Ensures required dependencies are installed (`kubectl`, `nut-*`, `ssh`, `go`, etc.)
|
||||
- Installs/refreshes systemd units and enables boot-time self-update
|
||||
|
||||
Bootstrap now (without reboot):
|
||||
|
||||
```bash
|
||||
@ -44,6 +52,13 @@ sudo systemctl start hecate-bootstrap.service
|
||||
- `systemctl start/stop k3s-agent`
|
||||
- UPS telemetry available via NUT (`upsc`)
|
||||
|
||||
## Multi-UPS topology
|
||||
|
||||
Recommended:
|
||||
- `titan-db` runs Hecate as the shutdown coordinator (local UPS target + local shutdown execution).
|
||||
- `tethys` runs Hecate with local UPS target and forwards shutdown triggers to `titan-db`.
|
||||
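- Local fallback shutdown (`coordination.fallback_local_shutdown`) can remain enabled so that a forwarder still runs the shutdown itself if forwarding to the coordinator fails.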
|
||||
|
||||
## Config
|
||||
|
||||
See `configs/hecate.example.yaml`.
|
||||
@ -55,7 +70,13 @@ UPS auto-shutdown trigger uses:
|
||||
|
||||
The estimated shutdown budget is derived from historical successful shutdown runs (`/var/lib/hecate/runs.json`), falling back to the default from config (`shutdown.default_budget_seconds`).
|
||||
|
||||
Power metrics:
|
||||
- Hecate exposes Prometheus metrics on `:9560/metrics` by default.
|
||||
- This is intended for a dedicated Grafana power dashboard and a high-level overview row.
|
||||
|
||||
## Notes
|
||||
|
||||
- Default behavior for `startup` and `shutdown` is dry-run unless `--execute` is set.
|
||||
- `hecate-bootstrap.service` is enabled to run at host boot and perform staged startup automatically.
|
||||
- `HECATE_ENABLE_BOOTSTRAP=1` enables `hecate-bootstrap.service` (recommended on `titan-db`; keep disabled on non-coordinator hosts).
|
||||
- `hecate-update.timer` runs on boot and periodically to pull latest `main` and reinstall Hecate declaratively.
|
||||
|
||||
@ -84,6 +84,7 @@ func runShutdown(logger *log.Logger, args []string) error {
|
||||
execute := fs.Bool("execute", false, "Actually execute changes (default dry-run)")
|
||||
skipEtcd := fs.Bool("skip-etcd-snapshot", false, "Skip etcd snapshot")
|
||||
skipDrain := fs.Bool("skip-drain", false, "Skip worker drain")
|
||||
reason := fs.String("reason", "manual-shutdown", "Shutdown reason for run history")
|
||||
_ = fs.Parse(args)
|
||||
|
||||
_, orch, err := buildOrchestrator(logger, *configPath, !*execute)
|
||||
@ -96,7 +97,7 @@ func runShutdown(logger *log.Logger, args []string) error {
|
||||
return orch.Shutdown(ctx, cluster.ShutdownOptions{
|
||||
SkipEtcdSnapshot: *skipEtcd,
|
||||
SkipDrain: *skipDrain,
|
||||
Reason: "manual-shutdown",
|
||||
Reason: *reason,
|
||||
})
|
||||
}
|
||||
|
||||
@ -113,15 +114,37 @@ func runDaemon(logger *log.Logger, args []string) error {
|
||||
if !cfg.UPS.Enabled {
|
||||
return fmt.Errorf("UPS monitoring is disabled in config")
|
||||
}
|
||||
var provider ups.Provider
|
||||
targets := make([]service.Target, 0, len(cfg.UPS.Targets)+1)
|
||||
switch cfg.UPS.Provider {
|
||||
case "nut":
|
||||
provider = ups.NewNUTProvider(cfg.UPS.Target)
|
||||
if len(cfg.UPS.Targets) == 0 {
|
||||
target := cfg.UPS.Target
|
||||
if target == "" {
|
||||
return fmt.Errorf("ups target must be set")
|
||||
}
|
||||
targets = append(targets, service.Target{
|
||||
Name: "primary",
|
||||
Target: target,
|
||||
Provider: ups.NewNUTProvider(target),
|
||||
})
|
||||
} else {
|
||||
for idx, t := range cfg.UPS.Targets {
|
||||
name := t.Name
|
||||
if name == "" {
|
||||
name = fmt.Sprintf("target-%d", idx+1)
|
||||
}
|
||||
targets = append(targets, service.Target{
|
||||
Name: name,
|
||||
Target: t.Target,
|
||||
Provider: ups.NewNUTProvider(t.Target),
|
||||
})
|
||||
}
|
||||
}
|
||||
default:
|
||||
return fmt.Errorf("unsupported UPS provider: %s", cfg.UPS.Provider)
|
||||
}
|
||||
|
||||
d := service.NewDaemon(cfg, orch, provider, logger)
|
||||
d := service.NewDaemon(cfg, orch, targets, logger)
|
||||
ctx, stop := signal.NotifyContext(context.Background(), os.Interrupt, syscall.SIGTERM)
|
||||
defer stop()
|
||||
return d.Run(ctx)
|
||||
@ -184,7 +207,7 @@ Commands:
|
||||
|
||||
Examples:
|
||||
hecate startup --config /etc/hecate/hecate.yaml --execute --force-flux-branch main
|
||||
hecate shutdown --config /etc/hecate/hecate.yaml --execute
|
||||
hecate shutdown --config /etc/hecate/hecate.yaml --execute --reason "manual-maintenance"
|
||||
hecate daemon --config /etc/hecate/hecate.yaml
|
||||
hecate status --config /etc/hecate/hecate.yaml
|
||||
`)
|
||||
|
||||
@ -34,14 +34,32 @@ shutdown:
|
||||
default_budget_seconds: 300
|
||||
skip_etcd_snapshot: false
|
||||
skip_drain: false
|
||||
poweroff_enabled: true
|
||||
poweroff_delay_seconds: 25
|
||||
poweroff_local_host: true
|
||||
extra_poweroff_hosts:
|
||||
- titan-db
|
||||
ups:
|
||||
enabled: true
|
||||
provider: nut
|
||||
target: atlasups@localhost
|
||||
targets:
|
||||
- name: db-ups
|
||||
target: atlasups@localhost
|
||||
poll_seconds: 5
|
||||
runtime_safety_factor: 1.10
|
||||
debounce_count: 3
|
||||
telemetry_timeout_seconds: 90
|
||||
coordination:
|
||||
forward_shutdown_host: ""
|
||||
forward_shutdown_user: atlas
|
||||
forward_shutdown_config: /etc/hecate/hecate.yaml
|
||||
fallback_local_shutdown: true
|
||||
command_timeout_seconds: 25
|
||||
metrics:
|
||||
enabled: true
|
||||
bind_addr: 0.0.0.0:9560
|
||||
path: /metrics
|
||||
state:
|
||||
dir: /var/lib/hecate
|
||||
run_history_path: /var/lib/hecate/runs.json
|
||||
|
||||
57
configs/hecate.tethys.yaml
Normal file
57
configs/hecate.tethys.yaml
Normal file
@ -0,0 +1,57 @@
|
||||
# /etc/hecate/hecate.yaml for titan-24 (tethys forwarder)
|
||||
kubeconfig: /home/tethys/.kube/config
|
||||
ssh_user: atlas
|
||||
iac_repo_path: /opt/titan-iac
|
||||
expected_flux_branch: main
|
||||
control_planes:
|
||||
- titan-0a
|
||||
- titan-0b
|
||||
- titan-0c
|
||||
workers: []
|
||||
local_bootstrap_paths:
|
||||
- infrastructure/core
|
||||
excluded_namespaces:
|
||||
- kube-system
|
||||
- kube-public
|
||||
- kube-node-lease
|
||||
- flux-system
|
||||
- traefik
|
||||
- metallb-system
|
||||
- cert-manager
|
||||
- longhorn-system
|
||||
- vault
|
||||
- postgres
|
||||
- maintenance
|
||||
shutdown:
|
||||
default_budget_seconds: 300
|
||||
skip_etcd_snapshot: false
|
||||
skip_drain: false
|
||||
poweroff_enabled: true
|
||||
poweroff_delay_seconds: 25
|
||||
poweroff_local_host: true
|
||||
extra_poweroff_hosts: []
|
||||
ups:
|
||||
enabled: true
|
||||
provider: nut
|
||||
targets:
|
||||
- name: tethys-ups
|
||||
target: atlasups@localhost
|
||||
poll_seconds: 5
|
||||
runtime_safety_factor: 1.10
|
||||
debounce_count: 3
|
||||
telemetry_timeout_seconds: 90
|
||||
coordination:
|
||||
forward_shutdown_host: titan-db
|
||||
forward_shutdown_user: atlas
|
||||
forward_shutdown_config: /etc/hecate/hecate.yaml
|
||||
fallback_local_shutdown: false
|
||||
command_timeout_seconds: 25
|
||||
metrics:
|
||||
enabled: true
|
||||
bind_addr: 0.0.0.0:9560
|
||||
path: /metrics
|
||||
state:
|
||||
dir: /var/lib/hecate
|
||||
run_history_path: /var/lib/hecate/runs.json
|
||||
lock_path: /var/lib/hecate/hecate.lock
|
||||
|
||||
66
configs/hecate.titan-db.yaml
Normal file
66
configs/hecate.titan-db.yaml
Normal file
@ -0,0 +1,66 @@
|
||||
# /etc/hecate/hecate.yaml for titan-db (coordinator)
|
||||
kubeconfig: /home/atlas/.kube/config
|
||||
ssh_user: atlas
|
||||
iac_repo_path: /opt/titan-iac
|
||||
expected_flux_branch: main
|
||||
control_planes:
|
||||
- titan-0a
|
||||
- titan-0b
|
||||
- titan-0c
|
||||
workers: []
|
||||
local_bootstrap_paths:
|
||||
- infrastructure/core
|
||||
- infrastructure/sources/helm
|
||||
- infrastructure/metallb
|
||||
- infrastructure/traefik
|
||||
- infrastructure/vault-csi
|
||||
- infrastructure/vault-injector
|
||||
- services/vault
|
||||
- infrastructure/postgres
|
||||
- services/gitea
|
||||
excluded_namespaces:
|
||||
- kube-system
|
||||
- kube-public
|
||||
- kube-node-lease
|
||||
- flux-system
|
||||
- traefik
|
||||
- metallb-system
|
||||
- cert-manager
|
||||
- longhorn-system
|
||||
- vault
|
||||
- postgres
|
||||
- maintenance
|
||||
shutdown:
|
||||
default_budget_seconds: 300
|
||||
skip_etcd_snapshot: false
|
||||
skip_drain: false
|
||||
poweroff_enabled: true
|
||||
poweroff_delay_seconds: 25
|
||||
poweroff_local_host: true
|
||||
extra_poweroff_hosts:
|
||||
- titan-db
|
||||
ups:
|
||||
enabled: true
|
||||
provider: nut
|
||||
targets:
|
||||
- name: db-ups
|
||||
target: atlasups@localhost
|
||||
poll_seconds: 5
|
||||
runtime_safety_factor: 1.10
|
||||
debounce_count: 3
|
||||
telemetry_timeout_seconds: 90
|
||||
coordination:
|
||||
forward_shutdown_host: ""
|
||||
forward_shutdown_user: atlas
|
||||
forward_shutdown_config: /etc/hecate/hecate.yaml
|
||||
fallback_local_shutdown: true
|
||||
command_timeout_seconds: 25
|
||||
metrics:
|
||||
enabled: true
|
||||
bind_addr: 0.0.0.0:9560
|
||||
path: /metrics
|
||||
state:
|
||||
dir: /var/lib/hecate
|
||||
run_history_path: /var/lib/hecate/runs.json
|
||||
lock_path: /var/lib/hecate/hecate.lock
|
||||
|
||||
12
deploy/systemd/hecate-update.service
Normal file
12
deploy/systemd/hecate-update.service
Normal file
@ -0,0 +1,12 @@
|
||||
[Unit]
|
||||
Description=Hecate Self-Update and Reinstall
|
||||
Wants=network-online.target
|
||||
After=network-online.target
|
||||
|
||||
[Service]
|
||||
Type=oneshot
|
||||
User=root
|
||||
Group=root
|
||||
ExecStart=/usr/local/lib/hecate/hecate-self-update.sh
|
||||
TimeoutStartSec=1800
|
||||
|
||||
12
deploy/systemd/hecate-update.timer
Normal file
12
deploy/systemd/hecate-update.timer
Normal file
@ -0,0 +1,12 @@
|
||||
[Unit]
|
||||
Description=Periodic Hecate Self-Update Timer
|
||||
|
||||
[Timer]
|
||||
OnBootSec=2m
|
||||
OnUnitActiveSec=6h
|
||||
Unit=hecate-update.service
|
||||
Persistent=true
|
||||
|
||||
[Install]
|
||||
WantedBy=timers.target
|
||||
|
||||
@ -9,7 +9,7 @@ Type=simple
|
||||
User=root
|
||||
Group=root
|
||||
ExecStart=/usr/local/bin/hecate daemon --config /etc/hecate/hecate.yaml
|
||||
Restart=always
|
||||
Restart=on-failure
|
||||
RestartSec=5
|
||||
NoNewPrivileges=true
|
||||
|
||||
|
||||
@ -4,7 +4,9 @@ import (
|
||||
"context"
|
||||
"fmt"
|
||||
"log"
|
||||
"os"
|
||||
"path/filepath"
|
||||
"sort"
|
||||
"strings"
|
||||
"time"
|
||||
|
||||
@ -132,6 +134,9 @@ func (o *Orchestrator) Shutdown(ctx context.Context, opts ShutdownOptions) (err
|
||||
|
||||
o.stopWorkers(ctx, workers)
|
||||
o.stopControlPlanes(ctx, o.cfg.ControlPlanes)
|
||||
if o.cfg.Shutdown.PoweroffEnabled {
|
||||
o.bestEffort("poweroff hosts", func() error { return o.poweroffHosts(ctx, workers) })
|
||||
}
|
||||
o.log.Printf("shutdown flow complete")
|
||||
return nil
|
||||
}
|
||||
@ -413,3 +418,60 @@ func (o *Orchestrator) bestEffort(name string, fn func() error) {
|
||||
o.log.Printf("warning: %s: %v", name, err)
|
||||
}
|
||||
}
|
||||
|
||||
func (o *Orchestrator) poweroffHosts(ctx context.Context, workers []string) error {
|
||||
delay := o.cfg.Shutdown.PoweroffDelaySeconds
|
||||
if delay <= 0 {
|
||||
delay = 25
|
||||
}
|
||||
|
||||
localNames := map[string]struct{}{}
|
||||
if hn, err := os.Hostname(); err == nil && strings.TrimSpace(hn) != "" {
|
||||
localNames[strings.TrimSpace(hn)] = struct{}{}
|
||||
}
|
||||
if o.cfg.SSHUser != "" {
|
||||
localNames[o.cfg.SSHUser] = struct{}{}
|
||||
}
|
||||
|
||||
hostSet := map[string]struct{}{}
|
||||
for _, n := range o.cfg.ControlPlanes {
|
||||
hostSet[n] = struct{}{}
|
||||
}
|
||||
for _, n := range workers {
|
||||
hostSet[n] = struct{}{}
|
||||
}
|
||||
for _, n := range o.cfg.Shutdown.ExtraPoweroffHosts {
|
||||
if strings.TrimSpace(n) != "" {
|
||||
hostSet[strings.TrimSpace(n)] = struct{}{}
|
||||
}
|
||||
}
|
||||
|
||||
hosts := make([]string, 0, len(hostSet))
|
||||
for h := range hostSet {
|
||||
hosts = append(hosts, h)
|
||||
}
|
||||
sort.Strings(hosts)
|
||||
|
||||
remoteCmd := fmt.Sprintf(`sudo nohup sh -c 'sleep %d; systemctl poweroff' >/dev/null 2>&1 &`, delay)
|
||||
for _, host := range hosts {
|
||||
host = strings.TrimSpace(host)
|
||||
if host == "" {
|
||||
continue
|
||||
}
|
||||
if _, isLocal := localNames[host]; isLocal {
|
||||
continue
|
||||
}
|
||||
o.bestEffort("schedule poweroff on "+host, func() error {
|
||||
_, err := o.ssh(ctx, host, remoteCmd)
|
||||
return err
|
||||
})
|
||||
}
|
||||
|
||||
if o.cfg.Shutdown.PoweroffLocalHost {
|
||||
o.bestEffort("schedule local host poweroff", func() error {
|
||||
_, err := o.run(ctx, 5*time.Second, "sh", "-c", fmt.Sprintf("nohup sh -c 'sleep %d; systemctl poweroff' >/dev/null 2>&1 &", delay+10))
|
||||
return err
|
||||
})
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
@ -18,6 +18,8 @@ type Config struct {
|
||||
ExcludedNamespaces []string `yaml:"excluded_namespaces"`
|
||||
Shutdown Shutdown `yaml:"shutdown"`
|
||||
UPS UPS `yaml:"ups"`
|
||||
Coordination Coordination `yaml:"coordination"`
|
||||
Metrics Metrics `yaml:"metrics"`
|
||||
State State `yaml:"state"`
|
||||
}
|
||||
|
||||
@ -25,18 +27,42 @@ type Shutdown struct {
|
||||
DefaultBudgetSeconds int `yaml:"default_budget_seconds"`
|
||||
SkipEtcdSnapshot bool `yaml:"skip_etcd_snapshot"`
|
||||
SkipDrain bool `yaml:"skip_drain"`
|
||||
PoweroffEnabled bool `yaml:"poweroff_enabled"`
|
||||
PoweroffDelaySeconds int `yaml:"poweroff_delay_seconds"`
|
||||
PoweroffLocalHost bool `yaml:"poweroff_local_host"`
|
||||
ExtraPoweroffHosts []string `yaml:"extra_poweroff_hosts"`
|
||||
}
|
||||
|
||||
type UPS struct {
|
||||
Enabled bool `yaml:"enabled"`
|
||||
Provider string `yaml:"provider"`
|
||||
Target string `yaml:"target"`
|
||||
Targets []UPSTarget `yaml:"targets"`
|
||||
PollSeconds int `yaml:"poll_seconds"`
|
||||
RuntimeSafetyFactor float64 `yaml:"runtime_safety_factor"`
|
||||
DebounceCount int `yaml:"debounce_count"`
|
||||
TelemetryTimeoutSeconds int `yaml:"telemetry_timeout_seconds"`
|
||||
}
|
||||
|
||||
// UPSTarget is one entry of ups.targets.
type UPSTarget struct {
	// Name labels the source; when empty the daemon assigns "target-N".
	Name string `yaml:"name"`

	// Target is the UPS address handed to the provider, e.g.
	// "atlasups@localhost". It must not be empty.
	Target string `yaml:"target"`
}
|
||||
|
||||
// Coordination configures how a shutdown trigger is handed to another Hecate
// instance.
type Coordination struct {
	// ForwardShutdownHost, when non-empty, makes the daemon forward shutdown
	// triggers to this host over ssh instead of acting locally.
	ForwardShutdownHost string `yaml:"forward_shutdown_host"`

	// ForwardShutdownUser is the ssh user for the forward; when empty the
	// host is used without a user prefix.
	ForwardShutdownUser string `yaml:"forward_shutdown_user"`

	// ForwardShutdownConfig is the --config path passed to the remote hecate
	// (default "/etc/hecate/hecate.yaml").
	ForwardShutdownConfig string `yaml:"forward_shutdown_config"`

	// FallbackLocalShutdown runs the shutdown locally when forwarding fails.
	FallbackLocalShutdown bool `yaml:"fallback_local_shutdown"`

	// CommandTimeoutSeconds bounds the forward command (default 25).
	CommandTimeoutSeconds int `yaml:"command_timeout_seconds"`
}
|
||||
|
||||
// Metrics configures the daemon's Prometheus endpoint.
type Metrics struct {
	// Enabled starts the metrics server when the daemon runs.
	Enabled bool `yaml:"enabled"`

	// BindAddr is the listen address (default "0.0.0.0:9560").
	BindAddr string `yaml:"bind_addr"`

	// Path is the URL path serving the metrics (default "/metrics").
	Path string `yaml:"path"`
}
|
||||
|
||||
type State struct {
|
||||
Dir string `yaml:"dir"`
|
||||
RunHistoryPath string `yaml:"run_history_path"`
|
||||
@ -78,8 +104,18 @@ func (c Config) Validate() error {
|
||||
if c.UPS.Provider == "" {
|
||||
return fmt.Errorf("config.ups.provider must not be empty when ups is enabled")
|
||||
}
|
||||
if c.UPS.Target == "" {
|
||||
return fmt.Errorf("config.ups.target must not be empty when ups is enabled")
|
||||
if c.UPS.Target == "" && len(c.UPS.Targets) == 0 {
|
||||
return fmt.Errorf("config.ups.target or config.ups.targets must be set when ups is enabled")
|
||||
}
|
||||
for _, t := range c.UPS.Targets {
|
||||
if t.Target == "" {
|
||||
return fmt.Errorf("config.ups.targets[].target must not be empty")
|
||||
}
|
||||
}
|
||||
}
|
||||
if c.Coordination.ForwardShutdownHost != "" {
|
||||
if c.Coordination.ForwardShutdownConfig == "" {
|
||||
return fmt.Errorf("config.coordination.forward_shutdown_config must not be empty when forward_shutdown_host is set")
|
||||
}
|
||||
}
|
||||
if c.State.RunHistoryPath == "" || c.State.LockPath == "" {
|
||||
@ -119,6 +155,9 @@ func defaults() Config {
|
||||
},
|
||||
Shutdown: Shutdown{
|
||||
DefaultBudgetSeconds: 300,
|
||||
PoweroffEnabled: true,
|
||||
PoweroffDelaySeconds: 25,
|
||||
PoweroffLocalHost: true,
|
||||
},
|
||||
UPS: UPS{
|
||||
Enabled: true,
|
||||
@ -128,6 +167,16 @@ func defaults() Config {
|
||||
DebounceCount: 3,
|
||||
TelemetryTimeoutSeconds: 90,
|
||||
},
|
||||
Coordination: Coordination{
|
||||
ForwardShutdownConfig: "/etc/hecate/hecate.yaml",
|
||||
FallbackLocalShutdown: true,
|
||||
CommandTimeoutSeconds: 25,
|
||||
},
|
||||
Metrics: Metrics{
|
||||
Enabled: true,
|
||||
BindAddr: "0.0.0.0:9560",
|
||||
Path: "/metrics",
|
||||
},
|
||||
State: State{
|
||||
Dir: "/var/lib/hecate",
|
||||
RunHistoryPath: "/var/lib/hecate/runs.json",
|
||||
@ -148,6 +197,9 @@ func (c *Config) applyDefaults() {
|
||||
if c.Shutdown.DefaultBudgetSeconds <= 0 {
|
||||
c.Shutdown.DefaultBudgetSeconds = 300
|
||||
}
|
||||
if c.Shutdown.PoweroffDelaySeconds <= 0 {
|
||||
c.Shutdown.PoweroffDelaySeconds = 25
|
||||
}
|
||||
if c.UPS.PollSeconds <= 0 {
|
||||
c.UPS.PollSeconds = 5
|
||||
}
|
||||
@ -160,6 +212,18 @@ func (c *Config) applyDefaults() {
|
||||
if c.UPS.TelemetryTimeoutSeconds <= 0 {
|
||||
c.UPS.TelemetryTimeoutSeconds = 90
|
||||
}
|
||||
if c.Coordination.ForwardShutdownConfig == "" {
|
||||
c.Coordination.ForwardShutdownConfig = "/etc/hecate/hecate.yaml"
|
||||
}
|
||||
if c.Coordination.CommandTimeoutSeconds <= 0 {
|
||||
c.Coordination.CommandTimeoutSeconds = 25
|
||||
}
|
||||
if c.Metrics.BindAddr == "" {
|
||||
c.Metrics.BindAddr = "0.0.0.0:9560"
|
||||
}
|
||||
if c.Metrics.Path == "" {
|
||||
c.Metrics.Path = "/metrics"
|
||||
}
|
||||
if c.State.Dir == "" {
|
||||
c.State.Dir = "/var/lib/hecate"
|
||||
}
|
||||
|
||||
49
internal/config/config_test.go
Normal file
49
internal/config/config_test.go
Normal file
@ -0,0 +1,49 @@
|
||||
package config
|
||||
|
||||
import (
|
||||
"os"
|
||||
"path/filepath"
|
||||
"strings"
|
||||
"testing"
|
||||
)
|
||||
|
||||
func TestLoadAcceptsUPSTargets(t *testing.T) {
|
||||
tmp := t.TempDir()
|
||||
cfgPath := filepath.Join(tmp, "hecate.yaml")
|
||||
raw := `
|
||||
control_planes: [titan-0a, titan-0b, titan-0c]
|
||||
expected_flux_branch: main
|
||||
iac_repo_path: /opt/titan-iac
|
||||
ups:
|
||||
enabled: true
|
||||
provider: nut
|
||||
targets:
|
||||
- name: db
|
||||
target: atlasups@localhost
|
||||
shutdown:
|
||||
default_budget_seconds: 300
|
||||
state:
|
||||
run_history_path: /tmp/runs.json
|
||||
lock_path: /tmp/hecate.lock
|
||||
`
|
||||
if err := os.WriteFile(cfgPath, []byte(strings.TrimSpace(raw)), 0o644); err != nil {
|
||||
t.Fatalf("write config: %v", err)
|
||||
}
|
||||
|
||||
cfg, err := Load(cfgPath)
|
||||
if err != nil {
|
||||
t.Fatalf("load config: %v", err)
|
||||
}
|
||||
if len(cfg.UPS.Targets) != 1 || cfg.UPS.Targets[0].Target != "atlasups@localhost" {
|
||||
t.Fatalf("unexpected UPS targets: %#v", cfg.UPS.Targets)
|
||||
}
|
||||
}
|
||||
|
||||
func TestValidateForwardShutdownRequiresConfigPath(t *testing.T) {
|
||||
cfg := defaults()
|
||||
cfg.Coordination.ForwardShutdownHost = "titan-db"
|
||||
cfg.Coordination.ForwardShutdownConfig = ""
|
||||
if err := cfg.Validate(); err == nil {
|
||||
t.Fatalf("expected validation error for missing forward_shutdown_config")
|
||||
}
|
||||
}
|
||||
150
internal/metrics/exporter.go
Normal file
150
internal/metrics/exporter.go
Normal file
@ -0,0 +1,150 @@
|
||||
package metrics
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
"net/http"
|
||||
"sort"
|
||||
"strings"
|
||||
"sync"
|
||||
"time"
|
||||
)
|
||||
|
||||
// Sample is the most recent reading for a single UPS source as published on
// the metrics endpoint.
type Sample struct {
	// Name identifies the UPS source; it keys the exporter's sample map and
	// is exported as the "source" label.
	Name string

	// Target is exported as the "target" label.
	Target string

	// Readings and trigger state, exported as one gauge each.
	OnBattery     bool
	LowBattery    bool
	RuntimeSecond int
	ThresholdSec  int
	Trigger       bool
	BreachCount   int

	// Status is exported as the "status" label.
	Status string

	// LastError, when non-empty, sets hecate_ups_error to 1.
	LastError string

	// UpdatedAt is the sample time; UpdateSample fills it with the current
	// UTC time when it is zero.
	UpdatedAt time.Time
}

// Exporter keeps the latest shutdown and UPS state in memory and renders it
// in the Prometheus text exposition format. All fields are guarded by mu.
type Exporter struct {
	mu                 sync.RWMutex
	shutdownBudgetSec  int
	shutdownTriggers   int
	lastShutdownReason string
	lastShutdownAt     time.Time
	samples            map[string]Sample
}

// New returns an Exporter with no samples recorded.
func New() *Exporter {
	return &Exporter{samples: make(map[string]Sample)}
}

// UpdateBudget records the estimated shutdown budget in seconds.
func (e *Exporter) UpdateBudget(seconds int) {
	e.mu.Lock()
	defer e.mu.Unlock()
	e.shutdownBudgetSec = seconds
}

// UpdateSample stores s as the latest sample for s.Name, replacing any
// previous one. A zero UpdatedAt is replaced with the current UTC time.
func (e *Exporter) UpdateSample(s Sample) {
	e.mu.Lock()
	defer e.mu.Unlock()
	if s.UpdatedAt.IsZero() {
		s.UpdatedAt = time.Now().UTC()
	}
	e.samples[s.Name] = s
}

// MarkShutdown increments the trigger counter and remembers the reason and
// the current UTC time as the last shutdown trigger.
func (e *Exporter) MarkShutdown(reason string) {
	e.mu.Lock()
	defer e.mu.Unlock()
	e.shutdownTriggers++
	e.lastShutdownReason = reason
	e.lastShutdownAt = time.Now().UTC()
}

// Handler returns an http.Handler that serves the metrics at path
// ("/metrics" when path is empty) and a plain "ok" body at /healthz.
func (e *Exporter) Handler(path string) http.Handler {
	metricsPath := path
	if metricsPath == "" {
		metricsPath = "/metrics"
	}

	mux := http.NewServeMux()
	mux.HandleFunc(metricsPath, e.serveMetrics)
	mux.HandleFunc("/healthz", func(w http.ResponseWriter, _ *http.Request) {
		w.WriteHeader(http.StatusOK)
		_, _ = w.Write([]byte("ok\n"))
	})
	return mux
}

// upsGauge describes one per-UPS gauge family: its metric name, its HELP
// text, and how to read its value from a Sample.
type upsGauge struct {
	name  string
	help  string
	value func(Sample) int64
}

// upsGauges lists the per-UPS gauge families in the order they are emitted.
var upsGauges = []upsGauge{
	{"hecate_ups_on_battery", "Whether a UPS source is currently on battery.", func(s Sample) int64 { return int64(boolNum(s.OnBattery)) }},
	{"hecate_ups_low_battery", "Whether a UPS source currently reports low battery.", func(s Sample) int64 { return int64(boolNum(s.LowBattery)) }},
	{"hecate_ups_runtime_seconds", "Battery runtime remaining reported by UPS.", func(s Sample) int64 { return int64(s.RuntimeSecond) }},
	{"hecate_ups_threshold_seconds", "Red-line threshold for runtime-based shutdown.", func(s Sample) int64 { return int64(s.ThresholdSec) }},
	{"hecate_ups_trigger_active", "Whether this UPS source currently breaches shutdown trigger conditions.", func(s Sample) int64 { return int64(boolNum(s.Trigger)) }},
	{"hecate_ups_breach_count", "Current debounce breach count for this UPS source.", func(s Sample) int64 { return int64(s.BreachCount) }},
	{"hecate_ups_last_sample_timestamp_seconds", "Unix timestamp of most recent sample.", func(s Sample) int64 { return unixOrZero(s.UpdatedAt) }},
	{"hecate_ups_error", "Whether the last sample had an error.", func(s Sample) int64 { return int64(boolNum(s.LastError != "")) }},
}

// serveMetrics writes the current state in the Prometheus text format.
//
// Every metric family is written as one contiguous group (HELP, TYPE, then
// one line per UPS source in sorted source order), as the exposition format
// requires. Label values are escaped exactly once by safe.
//
// NOTE(review): "status" and "last_reason" are labels on every per-UPS
// series, so a change in either value produces a new series — confirm the
// dashboards rely on this before changing it.
func (e *Exporter) serveMetrics(w http.ResponseWriter, _ *http.Request) {
	e.mu.RLock()
	defer e.mu.RUnlock()

	w.Header().Set("Content-Type", "text/plain; version=0.0.4; charset=utf-8")

	var b strings.Builder
	writeFamily := func(name, help, typ string) {
		fmt.Fprintf(&b, "# HELP %s %s\n# TYPE %s %s\n", name, help, name, typ)
	}

	writeFamily("hecate_shutdown_budget_seconds", "Estimated graceful shutdown budget in seconds (p95).", "gauge")
	fmt.Fprintf(&b, "hecate_shutdown_budget_seconds %d\n", e.shutdownBudgetSec)

	writeFamily("hecate_shutdown_triggers_total", "Total number of shutdown triggers issued by this instance.", "counter")
	fmt.Fprintf(&b, "hecate_shutdown_triggers_total %d\n", e.shutdownTriggers)

	writeFamily("hecate_shutdown_last_trigger_timestamp_seconds", "Unix timestamp of last shutdown trigger.", "gauge")
	fmt.Fprintf(&b, "hecate_shutdown_last_trigger_timestamp_seconds %d\n", unixOrZero(e.lastShutdownAt))

	// Map iteration order is random; sort so the output is deterministic.
	names := make([]string, 0, len(e.samples))
	for name := range e.samples {
		names = append(names, name)
	}
	sort.Strings(names)

	// The label set is the same for every family of a given source, so
	// build it once per source.
	labels := make([]string, len(names))
	for i, name := range names {
		s := e.samples[name]
		labels[i] = fmt.Sprintf(`{source="%s",target="%s",status="%s",last_reason="%s"}`,
			safe(name), safe(s.Target), safe(s.Status), safe(e.lastShutdownReason))
	}

	for _, g := range upsGauges {
		writeFamily(g.name, g.help, "gauge")
		for i, name := range names {
			fmt.Fprintf(&b, "%s%s %d\n", g.name, labels[i], g.value(e.samples[name]))
		}
	}

	_, _ = w.Write([]byte(b.String()))
}

// boolNum maps true to 1 and false to 0.
func boolNum(v bool) int {
	if v {
		return 1
	}
	return 0
}

// unixOrZero returns t as Unix seconds, or 0 for the zero time (whose Unix
// value would otherwise be a large negative number).
func unixOrZero(t time.Time) int64 {
	if t.IsZero() {
		return 0
	}
	return t.Unix()
}

// labelEscaper applies the three escapes the Prometheus text format defines
// for label values: backslash, double quote and line feed.
var labelEscaper = strings.NewReplacer(`\`, `\\`, `"`, `\"`, "\n", `\n`)

// safe escapes in for use between the double quotes of a label value. The
// result must be written verbatim (with %s, not %q), otherwise it is escaped
// a second time.
func safe(in string) string {
	return labelEscaper.Replace(in)
}
|
||||
44
internal/metrics/exporter_test.go
Normal file
44
internal/metrics/exporter_test.go
Normal file
@ -0,0 +1,44 @@
|
||||
package metrics
|
||||
|
||||
import (
|
||||
"net/http/httptest"
|
||||
"strings"
|
||||
"testing"
|
||||
"time"
|
||||
)
|
||||
|
||||
func TestExporterEmitsCoreMetrics(t *testing.T) {
|
||||
e := New()
|
||||
e.UpdateBudget(321)
|
||||
e.UpdateSample(Sample{
|
||||
Name: "db-ups",
|
||||
Target: "atlasups@localhost",
|
||||
OnBattery: true,
|
||||
LowBattery: false,
|
||||
RuntimeSecond: 412,
|
||||
ThresholdSec: 354,
|
||||
Trigger: true,
|
||||
BreachCount: 2,
|
||||
Status: "OB",
|
||||
UpdatedAt: time.Unix(1710000000, 0).UTC(),
|
||||
})
|
||||
e.MarkShutdown("ups-threshold")
|
||||
|
||||
req := httptest.NewRequest("GET", "/metrics", nil)
|
||||
rr := httptest.NewRecorder()
|
||||
e.Handler("/metrics").ServeHTTP(rr, req)
|
||||
body := rr.Body.String()
|
||||
|
||||
mustContain := []string{
|
||||
"hecate_shutdown_budget_seconds 321",
|
||||
"hecate_shutdown_triggers_total 1",
|
||||
"hecate_ups_on_battery{source=\"db-ups\",target=\"atlasups@localhost\"",
|
||||
"hecate_ups_runtime_seconds{source=\"db-ups\",target=\"atlasups@localhost\"",
|
||||
"hecate_ups_threshold_seconds{source=\"db-ups\",target=\"atlasups@localhost\"",
|
||||
}
|
||||
for _, m := range mustContain {
|
||||
if !strings.Contains(body, m) {
|
||||
t.Fatalf("missing metric fragment %q in output:\n%s", m, body)
|
||||
}
|
||||
}
|
||||
}
|
||||
@ -5,28 +5,48 @@ import (
|
||||
"fmt"
|
||||
"log"
|
||||
"math"
|
||||
"net/http"
|
||||
"os/exec"
|
||||
"strings"
|
||||
"time"
|
||||
|
||||
"scm.bstein.dev/bstein/hecate/internal/cluster"
|
||||
"scm.bstein.dev/bstein/hecate/internal/config"
|
||||
"scm.bstein.dev/bstein/hecate/internal/metrics"
|
||||
"scm.bstein.dev/bstein/hecate/internal/ups"
|
||||
)
|
||||
|
||||
type Target struct {
|
||||
Name string
|
||||
Target string
|
||||
Provider ups.Provider
|
||||
}
|
||||
|
||||
type Daemon struct {
|
||||
cfg config.Config
|
||||
orch *cluster.Orchestrator
|
||||
ups ups.Provider
|
||||
targets []Target
|
||||
log *log.Logger
|
||||
exporter *metrics.Exporter
|
||||
}
|
||||
|
||||
func NewDaemon(cfg config.Config, orch *cluster.Orchestrator, provider ups.Provider, logger *log.Logger) *Daemon {
|
||||
return &Daemon{cfg: cfg, orch: orch, ups: provider, log: logger}
|
||||
func NewDaemon(cfg config.Config, orch *cluster.Orchestrator, targets []Target, logger *log.Logger) *Daemon {
|
||||
return &Daemon{
|
||||
cfg: cfg,
|
||||
orch: orch,
|
||||
targets: targets,
|
||||
log: logger,
|
||||
exporter: metrics.New(),
|
||||
}
|
||||
}
|
||||
|
||||
func (d *Daemon) Run(ctx context.Context) error {
|
||||
if !d.cfg.UPS.Enabled {
|
||||
return fmt.Errorf("ups monitoring is disabled in config")
|
||||
}
|
||||
if len(d.targets) == 0 {
|
||||
return fmt.Errorf("no UPS targets configured")
|
||||
}
|
||||
|
||||
poll := time.Duration(d.cfg.UPS.PollSeconds) * time.Second
|
||||
if poll <= 0 {
|
||||
@ -41,48 +61,85 @@ func (d *Daemon) Run(ctx context.Context) error {
|
||||
debounce = 3
|
||||
}
|
||||
|
||||
lastGood := time.Now()
|
||||
lastOnBattery := false
|
||||
breachCount := 0
|
||||
if d.cfg.Metrics.Enabled {
|
||||
if err := d.startMetricsServer(); err != nil {
|
||||
return err
|
||||
}
|
||||
}
|
||||
|
||||
t := time.NewTicker(poll)
|
||||
defer t.Stop()
|
||||
|
||||
d.log.Printf("hecate daemon started: poll=%s debounce=%d telemetry_timeout=%s", poll, debounce, telemetryTimeout)
|
||||
lastGood := map[string]time.Time{}
|
||||
lastOnBattery := map[string]bool{}
|
||||
breachCount := map[string]int{}
|
||||
for _, t := range d.targets {
|
||||
lastGood[t.Name] = time.Now()
|
||||
}
|
||||
|
||||
d.log.Printf("hecate daemon started: poll=%s debounce=%d telemetry_timeout=%s targets=%s",
|
||||
poll, debounce, telemetryTimeout, d.targetList())
|
||||
|
||||
for {
|
||||
select {
|
||||
case <-ctx.Done():
|
||||
return ctx.Err()
|
||||
case <-t.C:
|
||||
sample, err := d.ups.Read(ctx)
|
||||
budget := d.orch.EstimatedShutdownSeconds()
|
||||
threshold := int(math.Ceil(float64(budget) * d.cfg.UPS.RuntimeSafetyFactor))
|
||||
|
||||
d.exporter.UpdateBudget(budget)
|
||||
|
||||
for _, target := range d.targets {
|
||||
sample, err := target.Provider.Read(ctx)
|
||||
if err != nil {
|
||||
d.log.Printf("warning: ups read failed: %v", err)
|
||||
if lastOnBattery && time.Since(lastGood) > telemetryTimeout {
|
||||
d.log.Printf("ups telemetry timeout while on battery, triggering shutdown")
|
||||
return d.triggerShutdown(ctx, "ups-telemetry-timeout")
|
||||
d.log.Printf("warning: ups read failed target=%s (%s): %v", target.Name, target.Target, err)
|
||||
d.exporter.UpdateSample(metrics.Sample{
|
||||
Name: target.Name,
|
||||
Target: target.Target,
|
||||
ThresholdSec: threshold,
|
||||
BreachCount: breachCount[target.Name],
|
||||
LastError: err.Error(),
|
||||
UpdatedAt: time.Now().UTC(),
|
||||
})
|
||||
if lastOnBattery[target.Name] && time.Since(lastGood[target.Name]) > telemetryTimeout {
|
||||
d.log.Printf("ups telemetry timeout while on battery (target=%s), triggering shutdown", target.Name)
|
||||
reason := fmt.Sprintf("ups-telemetry-timeout target=%s", target.Name)
|
||||
return d.triggerShutdown(ctx, reason)
|
||||
}
|
||||
continue
|
||||
}
|
||||
|
||||
lastGood = time.Now()
|
||||
lastOnBattery = sample.OnBattery
|
||||
lastGood[target.Name] = time.Now()
|
||||
lastOnBattery[target.Name] = sample.OnBattery
|
||||
|
||||
budget := d.orch.EstimatedShutdownSeconds()
|
||||
threshold := int(math.Ceil(float64(budget) * d.cfg.UPS.RuntimeSafetyFactor))
|
||||
trigger := sample.LowBattery || (sample.OnBattery && sample.RuntimeSeconds > 0 && sample.RuntimeSeconds <= threshold)
|
||||
|
||||
d.log.Printf("ups status=%s on_battery=%t runtime_s=%d threshold_s=%d budget_s=%d trigger=%t",
|
||||
sample.RawStatus, sample.OnBattery, sample.RuntimeSeconds, threshold, budget, trigger)
|
||||
|
||||
if trigger {
|
||||
breachCount++
|
||||
if breachCount >= debounce {
|
||||
reason := fmt.Sprintf("ups-threshold runtime=%ds threshold=%ds status=%s", sample.RuntimeSeconds, threshold, sample.RawStatus)
|
||||
breachCount[target.Name]++
|
||||
} else {
|
||||
breachCount[target.Name] = 0
|
||||
}
|
||||
|
||||
d.log.Printf("ups target=%s status=%s on_battery=%t runtime_s=%d threshold_s=%d budget_s=%d trigger=%t breach=%d",
|
||||
target.Name, sample.RawStatus, sample.OnBattery, sample.RuntimeSeconds, threshold, budget, trigger, breachCount[target.Name])
|
||||
|
||||
d.exporter.UpdateSample(metrics.Sample{
|
||||
Name: target.Name,
|
||||
Target: target.Target,
|
||||
OnBattery: sample.OnBattery,
|
||||
LowBattery: sample.LowBattery,
|
||||
RuntimeSecond: sample.RuntimeSeconds,
|
||||
ThresholdSec: threshold,
|
||||
Trigger: trigger,
|
||||
BreachCount: breachCount[target.Name],
|
||||
Status: sample.RawStatus,
|
||||
UpdatedAt: time.Now().UTC(),
|
||||
})
|
||||
|
||||
if breachCount[target.Name] >= debounce {
|
||||
reason := fmt.Sprintf("ups-threshold target=%s runtime=%ds threshold=%ds status=%s", target.Name, sample.RuntimeSeconds, threshold, sample.RawStatus)
|
||||
return d.triggerShutdown(ctx, reason)
|
||||
}
|
||||
} else {
|
||||
breachCount = 0
|
||||
}
|
||||
}
|
||||
}
|
||||
@ -90,5 +147,72 @@ func (d *Daemon) Run(ctx context.Context) error {
|
||||
|
||||
func (d *Daemon) triggerShutdown(ctx context.Context, reason string) error {
|
||||
d.log.Printf("triggering shutdown: %s", reason)
|
||||
d.exporter.MarkShutdown(reason)
|
||||
if d.cfg.Coordination.ForwardShutdownHost != "" {
|
||||
if err := d.forwardShutdown(ctx, reason); err == nil {
|
||||
d.log.Printf("shutdown trigger forwarded to %s", d.cfg.Coordination.ForwardShutdownHost)
|
||||
return nil
|
||||
} else if !d.cfg.Coordination.FallbackLocalShutdown {
|
||||
return fmt.Errorf("forward shutdown failed and local fallback disabled: %w", err)
|
||||
} else {
|
||||
d.log.Printf("warning: forward shutdown failed; falling back to local shutdown: %v", err)
|
||||
}
|
||||
}
|
||||
return d.orch.Shutdown(ctx, cluster.ShutdownOptions{Reason: reason})
|
||||
}
|
||||
|
||||
func (d *Daemon) forwardShutdown(ctx context.Context, reason string) error {
|
||||
userHost := d.cfg.Coordination.ForwardShutdownHost
|
||||
if d.cfg.Coordination.ForwardShutdownUser != "" {
|
||||
userHost = d.cfg.Coordination.ForwardShutdownUser + "@" + userHost
|
||||
}
|
||||
timeout := time.Duration(d.cfg.Coordination.CommandTimeoutSeconds) * time.Second
|
||||
if timeout <= 0 {
|
||||
timeout = 25 * time.Second
|
||||
}
|
||||
runCtx, cancel := context.WithTimeout(ctx, timeout)
|
||||
defer cancel()
|
||||
|
||||
remoteCmd := fmt.Sprintf(
|
||||
"sudo /usr/local/bin/hecate shutdown --config %q --execute --reason %q",
|
||||
d.cfg.Coordination.ForwardShutdownConfig,
|
||||
reason,
|
||||
)
|
||||
cmd := exec.CommandContext(runCtx, "ssh", "-o", "BatchMode=yes", "-o", "ConnectTimeout=8", userHost, remoteCmd)
|
||||
out, err := cmd.CombinedOutput()
|
||||
if err != nil {
|
||||
trimmed := strings.TrimSpace(string(out))
|
||||
if trimmed == "" {
|
||||
return fmt.Errorf("forward shutdown via ssh failed: %w", err)
|
||||
}
|
||||
return fmt.Errorf("forward shutdown via ssh failed: %w: %s", err, trimmed)
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
func (d *Daemon) targetList() string {
|
||||
names := make([]string, 0, len(d.targets))
|
||||
for _, t := range d.targets {
|
||||
names = append(names, t.Name+"="+t.Target)
|
||||
}
|
||||
return strings.Join(names, ",")
|
||||
}
|
||||
|
||||
func (d *Daemon) startMetricsServer() error {
|
||||
if d.cfg.Metrics.BindAddr == "" {
|
||||
return fmt.Errorf("metrics.bind_addr must not be empty when metrics are enabled")
|
||||
}
|
||||
handler := d.exporter.Handler(d.cfg.Metrics.Path)
|
||||
srv := &http.Server{
|
||||
Addr: d.cfg.Metrics.BindAddr,
|
||||
Handler: handler,
|
||||
ReadHeaderTimeout: 5 * time.Second,
|
||||
}
|
||||
go func() {
|
||||
d.log.Printf("metrics server listening on %s%s", d.cfg.Metrics.BindAddr, d.cfg.Metrics.Path)
|
||||
if err := srv.ListenAndServe(); err != nil && err != http.ErrServerClosed {
|
||||
d.log.Printf("warning: metrics server failed: %v", err)
|
||||
}
|
||||
}()
|
||||
return nil
|
||||
}
|
||||
|
||||
26
scripts/hecate-self-update.sh
Normal file
26
scripts/hecate-self-update.sh
Normal file
@ -0,0 +1,26 @@
|
||||
#!/usr/bin/env bash
# hecate-self-update.sh: bring the local hecate checkout in line with the
# remote branch and re-run the installer from it.
#
# Environment overrides:
#   HECATE_REPO_URL     git URL to clone from  (default: scm.bstein.dev repo)
#   HECATE_REPO_BRANCH  branch to track        (default: main)
#   HECATE_REPO_DIR     local checkout path    (default: /opt/hecate)
set -euo pipefail

if [[ "${EUID}" -ne 0 ]]; then
  echo "hecate-self-update.sh must run as root" >&2
  exit 1
fi

REPO_URL="${HECATE_REPO_URL:-https://scm.bstein.dev/bstein/hecate.git}"
BRANCH="${HECATE_REPO_BRANCH:-main}"
REPO_DIR="${HECATE_REPO_DIR:-/opt/hecate}"

# First run: create the checkout. Later runs reuse it.
mkdir -p "$(dirname "${REPO_DIR}")"
if [[ ! -d "${REPO_DIR}/.git" ]]; then
  echo "[self-update] cloning ${REPO_URL} into ${REPO_DIR}"
  git clone "${REPO_URL}" "${REPO_DIR}"
fi

cd "${REPO_DIR}"
echo "[self-update] syncing ${BRANCH}"
git fetch origin --prune
# Create or reset the local branch to the fetched remote tip and switch to it
# in one step. --force discards local modifications to tracked files; a plain
# "git checkout" would abort on them (and, under "set -e", stop the update)
# even though the checkout is meant to mirror the remote exactly.
git checkout --force -B "${BRANCH}" "origin/${BRANCH}"

echo "[self-update] running installer"
"${REPO_DIR}/scripts/install.sh"
|
||||
@ -11,7 +11,10 @@ BIN_DIR="/usr/local/bin"
|
||||
CONF_DIR="/etc/hecate"
|
||||
STATE_DIR="/var/lib/hecate"
|
||||
SYSTEMD_DIR="/etc/systemd/system"
|
||||
LIB_DIR="/usr/local/lib/hecate"
|
||||
START_NOW=1
|
||||
INSTALL_DEPS=1
|
||||
ENABLE_BOOTSTRAP="${HECATE_ENABLE_BOOTSTRAP:-0}"
|
||||
|
||||
while [[ $# -gt 0 ]]; do
|
||||
case "$1" in
|
||||
@ -19,6 +22,10 @@ while [[ $# -gt 0 ]]; do
|
||||
START_NOW=0
|
||||
shift
|
||||
;;
|
||||
--skip-deps)
|
||||
INSTALL_DEPS=0
|
||||
shift
|
||||
;;
|
||||
*)
|
||||
echo "Unknown argument: $1" >&2
|
||||
exit 1
|
||||
@ -26,6 +33,59 @@ while [[ $# -gt 0 ]]; do
|
||||
esac
|
||||
done
|
||||
|
||||
# ensure_apt_packages PKG...
# Installs, via apt-get, whichever of the named packages dpkg does not report
# as installed. Does nothing (and does not touch apt) when all are present.
ensure_apt_packages() {
  # pkg is declared local so the loop variable does not leak into (or
  # clobber a same-named variable in) the rest of the script.
  local pkg
  local missing=()
  for pkg in "$@"; do
    if ! dpkg -s "${pkg}" >/dev/null 2>&1; then
      missing+=("${pkg}")
    fi
  done
  if [[ ${#missing[@]} -eq 0 ]]; then
    return 0
  fi
  echo "[install] apt install: ${missing[*]}"
  # Exported so apt-get and anything it spawns never prompt interactively.
  export DEBIAN_FRONTEND=noninteractive
  apt-get update -y
  apt-get install -y "${missing[@]}"
}
|
||||
|
||||
# install_kubectl_if_missing
# Makes sure a kubectl binary is on PATH. Order of attempts:
#   1. already installed -> nothing to do;
#   2. the distro's kubernetes-client package (best effort);
#   3. the current stable upstream binary from dl.k8s.io, verified against
#      its published SHA-256 checksum before being installed.
# Returns non-zero on an unsupported CPU architecture, a failed download, or
# a checksum mismatch.
install_kubectl_if_missing() {
  if command -v kubectl >/dev/null 2>&1; then
    return 0
  fi

  # The package may not exist on every release; failure here is not fatal
  # because the upstream download below is the fallback.
  ensure_apt_packages kubernetes-client || true
  if command -v kubectl >/dev/null 2>&1; then
    return 0
  fi

  echo "[install] installing kubectl via upstream binary"
  local arch
  arch="$(uname -m)"
  case "${arch}" in
    x86_64) arch="amd64" ;;
    aarch64|arm64) arch="arm64" ;;
    *) echo "Unsupported arch for kubectl install: ${arch}" >&2; return 1 ;;
  esac

  local version base_url tmp_dir
  version="$(curl -fsSL https://dl.k8s.io/release/stable.txt)"
  base_url="https://dl.k8s.io/release/${version}/bin/linux/${arch}"

  # Download into a scratch directory so a truncated or tampered file never
  # lands at the final path.
  tmp_dir="$(mktemp -d)"
  if ! curl -fsSL -o "${tmp_dir}/kubectl" "${base_url}/kubectl" \
    || ! curl -fsSL -o "${tmp_dir}/kubectl.sha256" "${base_url}/kubectl.sha256"; then
    echo "Failed to download kubectl ${version} (${arch})" >&2
    rm -rf "${tmp_dir}"
    return 1
  fi

  # The .sha256 file holds only the hex digest; build the "digest  path" line
  # sha256sum expects.
  if ! echo "$(cat "${tmp_dir}/kubectl.sha256")  ${tmp_dir}/kubectl" | sha256sum --check --status; then
    echo "kubectl checksum verification failed for ${version} (${arch})" >&2
    rm -rf "${tmp_dir}"
    return 1
  fi

  install -m 0755 "${tmp_dir}/kubectl" /usr/local/bin/kubectl
  rm -rf "${tmp_dir}"
}
|
||||
|
||||
# ensure_dependencies
# Installs everything the installer and daemon need unless INSTALL_DEPS is 0
# (set by --skip-deps). Exits the script with status 1 on hosts without
# apt-get.
ensure_dependencies() {
  if [[ "${INSTALL_DEPS}" -eq 0 ]]; then
    echo "[install] skipping dependency installation"
    return 0
  fi

  command -v apt-get >/dev/null 2>&1 || {
    echo "This installer currently supports apt-based hosts only." >&2
    exit 1
  }

  local -a apt_packages=(
    ca-certificates
    curl
    git
    openssh-client
    jq
    nut-client
    nut-server
    nut-monitor
    golang-go
  )
  ensure_apt_packages "${apt_packages[@]}"
  install_kubectl_if_missing
}
|
||||
|
||||
ensure_dependencies
|
||||
|
||||
echo "[install] building hecate"
|
||||
cd "${REPO_DIR}"
|
||||
mkdir -p dist
|
||||
@ -38,6 +98,7 @@ install -m 0755 dist/hecate "${BIN_DIR}/hecate"
|
||||
echo "[install] installing config + state dirs"
|
||||
install -d -m 0750 "${CONF_DIR}"
|
||||
install -d -m 0750 "${STATE_DIR}"
|
||||
install -d -m 0755 "${LIB_DIR}"
|
||||
if [[ ! -f "${CONF_DIR}/hecate.yaml" ]]; then
|
||||
install -m 0640 configs/hecate.example.yaml "${CONF_DIR}/hecate.yaml"
|
||||
echo "[install] wrote default config to ${CONF_DIR}/hecate.yaml"
|
||||
@ -48,12 +109,21 @@ fi
|
||||
echo "[install] installing systemd units"
|
||||
install -m 0644 deploy/systemd/hecate.service "${SYSTEMD_DIR}/hecate.service"
|
||||
install -m 0644 deploy/systemd/hecate-bootstrap.service "${SYSTEMD_DIR}/hecate-bootstrap.service"
|
||||
install -m 0644 deploy/systemd/hecate-update.service "${SYSTEMD_DIR}/hecate-update.service"
|
||||
install -m 0644 deploy/systemd/hecate-update.timer "${SYSTEMD_DIR}/hecate-update.timer"
|
||||
install -m 0755 scripts/hecate-self-update.sh "${LIB_DIR}/hecate-self-update.sh"
|
||||
|
||||
systemctl daemon-reload
|
||||
systemctl enable hecate.service hecate-bootstrap.service
|
||||
systemctl enable hecate.service hecate-update.timer
|
||||
if [[ "${ENABLE_BOOTSTRAP}" == "1" ]]; then
|
||||
systemctl enable hecate-bootstrap.service
|
||||
else
|
||||
systemctl disable hecate-bootstrap.service >/dev/null 2>&1 || true
|
||||
fi
|
||||
|
||||
if [[ "${START_NOW}" -eq 1 ]]; then
|
||||
systemctl restart hecate.service
|
||||
systemctl restart hecate-update.timer
|
||||
echo "[install] hecate.service restarted"
|
||||
fi
|
||||
|
||||
@ -62,4 +132,5 @@ echo "Next steps:"
|
||||
echo " 1. Edit /etc/hecate/hecate.yaml"
|
||||
echo " 2. Run: hecate status --config /etc/hecate/hecate.yaml"
|
||||
echo " 3. Test dry run: hecate startup --config /etc/hecate/hecate.yaml"
|
||||
echo " 4. Trigger bootstrap now: systemctl start hecate-bootstrap.service"
|
||||
echo " 4. Trigger bootstrap now (db host): systemctl start hecate-bootstrap.service"
|
||||
echo " 5. Trigger self-update now: systemctl start hecate-update.service"
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user