hecate: add multi-ups coordination, poweroff, metrics, and declarative self-update install

This commit is contained in:
Brad Stein 2026-04-03 14:46:03 -03:00
parent fbdb2c269b
commit 27c7d119c0
16 changed files with 864 additions and 65 deletions

View File

@ -6,6 +6,8 @@ It runs on `titan-db` and handles:
- Staged **startup** (including Flux/Gitea bootstrap deadlock fallback) - Staged **startup** (including Flux/Gitea bootstrap deadlock fallback)
- Graceful **shutdown** - Graceful **shutdown**
- UPS-driven automatic shutdown decisions based on discharge/runtime - UPS-driven automatic shutdown decisions based on discharge/runtime
- Multi-UPS operation via multiple Hecate instances (for example `titan-db` + `tethys`)
- Full hardware poweroff sequencing after graceful Kubernetes shutdown
## Why host-level ## Why host-level
@ -24,11 +26,17 @@ Hecate runs outside the cluster under systemd, so it can always orchestrate brin
```bash ```bash
git clone git@gitea-admin:bstein/hecate.git git clone git@gitea-admin:bstein/hecate.git
cd hecate cd hecate
sudo ./scripts/install.sh sudo HECATE_ENABLE_BOOTSTRAP=1 ./scripts/install.sh
sudoedit /etc/hecate/hecate.yaml sudoedit /etc/hecate/hecate.yaml
sudo systemctl restart hecate.service sudo systemctl restart hecate.service
``` ```
The installer is idempotent:
- Re-runs safely on every update
- Preserves existing `/etc/hecate/hecate.yaml`
- Ensures required dependencies are installed (`kubectl`, `nut-*`, `ssh`, `go`, etc.)
- Installs/refreshes systemd units and enables boot-time self-update
Bootstrap now (without reboot): Bootstrap now (without reboot):
```bash ```bash
@ -44,6 +52,13 @@ sudo systemctl start hecate-bootstrap.service
- `systemctl start/stop k3s-agent` - `systemctl start/stop k3s-agent`
- UPS telemetry available via NUT (`upsc`) - UPS telemetry available via NUT (`upsc`)
## Multi-UPS topology
Recommended:
- `titan-db` runs Hecate as the shutdown coordinator (local UPS target + local shutdown execution).
- `tethys` runs Hecate with local UPS target and forwards shutdown triggers to `titan-db`.
- If forwarding fails, the forwarding instance can fall back to running the shutdown locally when `coordination.fallback_local_shutdown` is enabled.
## Config ## Config
See `configs/hecate.example.yaml`. See `configs/hecate.example.yaml`.
@ -55,7 +70,13 @@ UPS auto-shutdown trigger uses:
Estimated shutdown budget is derived from historical successful shutdown runs (`/var/lib/hecate/runs.json`) with default fallback from config. Estimated shutdown budget is derived from historical successful shutdown runs (`/var/lib/hecate/runs.json`) with default fallback from config.
Power metrics:
- Hecate exposes Prometheus metrics on `:9560/metrics` by default.
- This is intended for a dedicated Grafana power dashboard and a high-level overview row.
## Notes ## Notes
- Default behavior for `startup` and `shutdown` is dry-run unless `--execute` is set. - Default behavior for `startup` and `shutdown` is dry-run unless `--execute` is set.
- `hecate-bootstrap.service` is enabled to run at host boot and perform staged startup automatically. - `hecate-bootstrap.service` is enabled to run at host boot and perform staged startup automatically.
- `HECATE_ENABLE_BOOTSTRAP=1` enables `hecate-bootstrap.service` (recommended on `titan-db`; keep disabled on non-coordinator hosts).
- `hecate-update.timer` runs on boot and periodically to pull latest `main` and reinstall Hecate declaratively.

View File

@ -84,6 +84,7 @@ func runShutdown(logger *log.Logger, args []string) error {
execute := fs.Bool("execute", false, "Actually execute changes (default dry-run)") execute := fs.Bool("execute", false, "Actually execute changes (default dry-run)")
skipEtcd := fs.Bool("skip-etcd-snapshot", false, "Skip etcd snapshot") skipEtcd := fs.Bool("skip-etcd-snapshot", false, "Skip etcd snapshot")
skipDrain := fs.Bool("skip-drain", false, "Skip worker drain") skipDrain := fs.Bool("skip-drain", false, "Skip worker drain")
reason := fs.String("reason", "manual-shutdown", "Shutdown reason for run history")
_ = fs.Parse(args) _ = fs.Parse(args)
_, orch, err := buildOrchestrator(logger, *configPath, !*execute) _, orch, err := buildOrchestrator(logger, *configPath, !*execute)
@ -96,7 +97,7 @@ func runShutdown(logger *log.Logger, args []string) error {
return orch.Shutdown(ctx, cluster.ShutdownOptions{ return orch.Shutdown(ctx, cluster.ShutdownOptions{
SkipEtcdSnapshot: *skipEtcd, SkipEtcdSnapshot: *skipEtcd,
SkipDrain: *skipDrain, SkipDrain: *skipDrain,
Reason: "manual-shutdown", Reason: *reason,
}) })
} }
@ -113,15 +114,37 @@ func runDaemon(logger *log.Logger, args []string) error {
if !cfg.UPS.Enabled { if !cfg.UPS.Enabled {
return fmt.Errorf("UPS monitoring is disabled in config") return fmt.Errorf("UPS monitoring is disabled in config")
} }
var provider ups.Provider targets := make([]service.Target, 0, len(cfg.UPS.Targets)+1)
switch cfg.UPS.Provider { switch cfg.UPS.Provider {
case "nut": case "nut":
provider = ups.NewNUTProvider(cfg.UPS.Target) if len(cfg.UPS.Targets) == 0 {
target := cfg.UPS.Target
if target == "" {
return fmt.Errorf("ups target must be set")
}
targets = append(targets, service.Target{
Name: "primary",
Target: target,
Provider: ups.NewNUTProvider(target),
})
} else {
for idx, t := range cfg.UPS.Targets {
name := t.Name
if name == "" {
name = fmt.Sprintf("target-%d", idx+1)
}
targets = append(targets, service.Target{
Name: name,
Target: t.Target,
Provider: ups.NewNUTProvider(t.Target),
})
}
}
default: default:
return fmt.Errorf("unsupported UPS provider: %s", cfg.UPS.Provider) return fmt.Errorf("unsupported UPS provider: %s", cfg.UPS.Provider)
} }
d := service.NewDaemon(cfg, orch, provider, logger) d := service.NewDaemon(cfg, orch, targets, logger)
ctx, stop := signal.NotifyContext(context.Background(), os.Interrupt, syscall.SIGTERM) ctx, stop := signal.NotifyContext(context.Background(), os.Interrupt, syscall.SIGTERM)
defer stop() defer stop()
return d.Run(ctx) return d.Run(ctx)
@ -184,7 +207,7 @@ Commands:
Examples: Examples:
hecate startup --config /etc/hecate/hecate.yaml --execute --force-flux-branch main hecate startup --config /etc/hecate/hecate.yaml --execute --force-flux-branch main
hecate shutdown --config /etc/hecate/hecate.yaml --execute hecate shutdown --config /etc/hecate/hecate.yaml --execute --reason "manual-maintenance"
hecate daemon --config /etc/hecate/hecate.yaml hecate daemon --config /etc/hecate/hecate.yaml
hecate status --config /etc/hecate/hecate.yaml hecate status --config /etc/hecate/hecate.yaml
`) `)

View File

@ -34,14 +34,32 @@ shutdown:
default_budget_seconds: 300 default_budget_seconds: 300
skip_etcd_snapshot: false skip_etcd_snapshot: false
skip_drain: false skip_drain: false
poweroff_enabled: true
poweroff_delay_seconds: 25
poweroff_local_host: true
extra_poweroff_hosts:
- titan-db
ups: ups:
enabled: true enabled: true
provider: nut provider: nut
target: atlasups@localhost target: atlasups@localhost
targets:
- name: db-ups
target: atlasups@localhost
poll_seconds: 5 poll_seconds: 5
runtime_safety_factor: 1.10 runtime_safety_factor: 1.10
debounce_count: 3 debounce_count: 3
telemetry_timeout_seconds: 90 telemetry_timeout_seconds: 90
coordination:
forward_shutdown_host: ""
forward_shutdown_user: atlas
forward_shutdown_config: /etc/hecate/hecate.yaml
fallback_local_shutdown: true
command_timeout_seconds: 25
metrics:
enabled: true
bind_addr: 0.0.0.0:9560
path: /metrics
state: state:
dir: /var/lib/hecate dir: /var/lib/hecate
run_history_path: /var/lib/hecate/runs.json run_history_path: /var/lib/hecate/runs.json

View File

@ -0,0 +1,57 @@
# /etc/hecate/hecate.yaml for titan-24 (tethys forwarder)
kubeconfig: /home/tethys/.kube/config
ssh_user: atlas
iac_repo_path: /opt/titan-iac
expected_flux_branch: main
control_planes:
- titan-0a
- titan-0b
- titan-0c
workers: []
local_bootstrap_paths:
- infrastructure/core
excluded_namespaces:
- kube-system
- kube-public
- kube-node-lease
- flux-system
- traefik
- metallb-system
- cert-manager
- longhorn-system
- vault
- postgres
- maintenance
shutdown:
default_budget_seconds: 300
skip_etcd_snapshot: false
skip_drain: false
poweroff_enabled: true
poweroff_delay_seconds: 25
poweroff_local_host: true
extra_poweroff_hosts: []
ups:
enabled: true
provider: nut
targets:
- name: tethys-ups
target: atlasups@localhost
poll_seconds: 5
runtime_safety_factor: 1.10
debounce_count: 3
telemetry_timeout_seconds: 90
coordination:
forward_shutdown_host: titan-db
forward_shutdown_user: atlas
forward_shutdown_config: /etc/hecate/hecate.yaml
fallback_local_shutdown: false
command_timeout_seconds: 25
metrics:
enabled: true
bind_addr: 0.0.0.0:9560
path: /metrics
state:
dir: /var/lib/hecate
run_history_path: /var/lib/hecate/runs.json
lock_path: /var/lib/hecate/hecate.lock

View File

@ -0,0 +1,66 @@
# /etc/hecate/hecate.yaml for titan-db (coordinator)
kubeconfig: /home/atlas/.kube/config
ssh_user: atlas
iac_repo_path: /opt/titan-iac
expected_flux_branch: main
control_planes:
- titan-0a
- titan-0b
- titan-0c
workers: []
local_bootstrap_paths:
- infrastructure/core
- infrastructure/sources/helm
- infrastructure/metallb
- infrastructure/traefik
- infrastructure/vault-csi
- infrastructure/vault-injector
- services/vault
- infrastructure/postgres
- services/gitea
excluded_namespaces:
- kube-system
- kube-public
- kube-node-lease
- flux-system
- traefik
- metallb-system
- cert-manager
- longhorn-system
- vault
- postgres
- maintenance
shutdown:
default_budget_seconds: 300
skip_etcd_snapshot: false
skip_drain: false
poweroff_enabled: true
poweroff_delay_seconds: 25
poweroff_local_host: true
extra_poweroff_hosts:
- titan-db
ups:
enabled: true
provider: nut
targets:
- name: db-ups
target: atlasups@localhost
poll_seconds: 5
runtime_safety_factor: 1.10
debounce_count: 3
telemetry_timeout_seconds: 90
coordination:
forward_shutdown_host: ""
forward_shutdown_user: atlas
forward_shutdown_config: /etc/hecate/hecate.yaml
fallback_local_shutdown: true
command_timeout_seconds: 25
metrics:
enabled: true
bind_addr: 0.0.0.0:9560
path: /metrics
state:
dir: /var/lib/hecate
run_history_path: /var/lib/hecate/runs.json
lock_path: /var/lib/hecate/hecate.lock

View File

@ -0,0 +1,12 @@
[Unit]
Description=Hecate Self-Update and Reinstall
Wants=network-online.target
After=network-online.target
[Service]
Type=oneshot
User=root
Group=root
ExecStart=/usr/local/lib/hecate/hecate-self-update.sh
TimeoutStartSec=1800

View File

@ -0,0 +1,12 @@
[Unit]
Description=Periodic Hecate Self-Update Timer
[Timer]
OnBootSec=2m
OnUnitActiveSec=6h
Unit=hecate-update.service
Persistent=true
[Install]
WantedBy=timers.target

View File

@ -9,7 +9,7 @@ Type=simple
User=root User=root
Group=root Group=root
ExecStart=/usr/local/bin/hecate daemon --config /etc/hecate/hecate.yaml ExecStart=/usr/local/bin/hecate daemon --config /etc/hecate/hecate.yaml
Restart=always Restart=on-failure
RestartSec=5 RestartSec=5
NoNewPrivileges=true NoNewPrivileges=true

View File

@ -4,7 +4,9 @@ import (
"context" "context"
"fmt" "fmt"
"log" "log"
"os"
"path/filepath" "path/filepath"
"sort"
"strings" "strings"
"time" "time"
@ -132,6 +134,9 @@ func (o *Orchestrator) Shutdown(ctx context.Context, opts ShutdownOptions) (err
o.stopWorkers(ctx, workers) o.stopWorkers(ctx, workers)
o.stopControlPlanes(ctx, o.cfg.ControlPlanes) o.stopControlPlanes(ctx, o.cfg.ControlPlanes)
if o.cfg.Shutdown.PoweroffEnabled {
o.bestEffort("poweroff hosts", func() error { return o.poweroffHosts(ctx, workers) })
}
o.log.Printf("shutdown flow complete") o.log.Printf("shutdown flow complete")
return nil return nil
} }
@ -413,3 +418,60 @@ func (o *Orchestrator) bestEffort(name string, fn func() error) {
o.log.Printf("warning: %s: %v", name, err) o.log.Printf("warning: %s: %v", name, err)
} }
} }
// poweroffHosts schedules a delayed `systemctl poweroff` on every known
// cluster host and, when shutdown.poweroff_local_host is set, on the machine
// running Hecate.
//
// Remote hosts are the union of the configured control planes, the supplied
// workers and shutdown.extra_poweroff_hosts, trimmed, de-duplicated and
// processed in sorted order. The local machine is excluded from the remote
// pass (matched by its hostname and, for an FQDN, its short name) and is only
// powered off through the local path, which is scheduled 10 seconds later than
// the remote hosts.
//
// Every step is best-effort: failures are logged by bestEffort and the
// function always returns nil.
func (o *Orchestrator) poweroffHosts(ctx context.Context, workers []string) error {
	delay := o.cfg.Shutdown.PoweroffDelaySeconds
	if delay <= 0 {
		delay = 25
	}

	// Names that refer to this machine. Only host names belong here; the SSH
	// user name is deliberately not included, because it is not a host and
	// would wrongly suppress poweroff of a host that happens to share it.
	localNames := map[string]struct{}{}
	if hn, err := os.Hostname(); err == nil {
		if hn = strings.TrimSpace(hn); hn != "" {
			localNames[hn] = struct{}{}
			// os.Hostname may return an FQDN while the config uses short names.
			if dot := strings.IndexByte(hn, '.'); dot > 0 {
				localNames[hn[:dot]] = struct{}{}
			}
		}
	}

	hostSet := map[string]struct{}{}
	addHost := func(name string) {
		// Trim before inserting so "node" and "node " collapse to one entry.
		if name = strings.TrimSpace(name); name != "" {
			hostSet[name] = struct{}{}
		}
	}
	for _, n := range o.cfg.ControlPlanes {
		addHost(n)
	}
	for _, n := range workers {
		addHost(n)
	}
	for _, n := range o.cfg.Shutdown.ExtraPoweroffHosts {
		addHost(n)
	}

	hosts := make([]string, 0, len(hostSet))
	for h := range hostSet {
		if _, isLocal := localNames[h]; isLocal {
			continue
		}
		hosts = append(hosts, h)
	}
	// Map iteration order is random; sort for a deterministic sequence.
	sort.Strings(hosts)

	// The poweroff is backgrounded on the remote side so the ssh call returns
	// immediately instead of waiting out the delay.
	remoteCmd := fmt.Sprintf(`sudo nohup sh -c 'sleep %d; systemctl poweroff' >/dev/null 2>&1 &`, delay)
	for _, host := range hosts {
		o.bestEffort("schedule poweroff on "+host, func() error {
			_, err := o.ssh(ctx, host, remoteCmd)
			return err
		})
	}

	if o.cfg.Shutdown.PoweroffLocalHost {
		o.bestEffort("schedule local host poweroff", func() error {
			_, err := o.run(ctx, 5*time.Second, "sh", "-c", fmt.Sprintf("nohup sh -c 'sleep %d; systemctl poweroff' >/dev/null 2>&1 &", delay+10))
			return err
		})
	}
	return nil
}

View File

@ -18,6 +18,8 @@ type Config struct {
ExcludedNamespaces []string `yaml:"excluded_namespaces"` ExcludedNamespaces []string `yaml:"excluded_namespaces"`
Shutdown Shutdown `yaml:"shutdown"` Shutdown Shutdown `yaml:"shutdown"`
UPS UPS `yaml:"ups"` UPS UPS `yaml:"ups"`
Coordination Coordination `yaml:"coordination"`
Metrics Metrics `yaml:"metrics"`
State State `yaml:"state"` State State `yaml:"state"`
} }
@ -25,18 +27,42 @@ type Shutdown struct {
DefaultBudgetSeconds int `yaml:"default_budget_seconds"` DefaultBudgetSeconds int `yaml:"default_budget_seconds"`
SkipEtcdSnapshot bool `yaml:"skip_etcd_snapshot"` SkipEtcdSnapshot bool `yaml:"skip_etcd_snapshot"`
SkipDrain bool `yaml:"skip_drain"` SkipDrain bool `yaml:"skip_drain"`
PoweroffEnabled bool `yaml:"poweroff_enabled"`
PoweroffDelaySeconds int `yaml:"poweroff_delay_seconds"`
PoweroffLocalHost bool `yaml:"poweroff_local_host"`
ExtraPoweroffHosts []string `yaml:"extra_poweroff_hosts"`
} }
type UPS struct { type UPS struct {
Enabled bool `yaml:"enabled"` Enabled bool `yaml:"enabled"`
Provider string `yaml:"provider"` Provider string `yaml:"provider"`
Target string `yaml:"target"` Target string `yaml:"target"`
Targets []UPSTarget `yaml:"targets"`
PollSeconds int `yaml:"poll_seconds"` PollSeconds int `yaml:"poll_seconds"`
RuntimeSafetyFactor float64 `yaml:"runtime_safety_factor"` RuntimeSafetyFactor float64 `yaml:"runtime_safety_factor"`
DebounceCount int `yaml:"debounce_count"` DebounceCount int `yaml:"debounce_count"`
TelemetryTimeoutSeconds int `yaml:"telemetry_timeout_seconds"` TelemetryTimeoutSeconds int `yaml:"telemetry_timeout_seconds"`
} }
// UPSTarget is one entry of the ups.targets list: a single UPS to monitor.
type UPSTarget struct {
	// Name labels the target in logs and metrics. The daemon command
	// substitutes "target-N" (1-based list position) when it is empty.
	Name string `yaml:"name"`
	// Target is the UPS address handed to the NUT provider, for example
	// atlasups@localhost. Validate rejects an empty value.
	Target string `yaml:"target"`
}
// Coordination configures how a shutdown trigger raised by this instance is
// handed to another Hecate host instead of (or before) being executed locally.
type Coordination struct {
	// ForwardShutdownHost, when non-empty, is the host the daemon runs the
	// shutdown command on over ssh. Empty means shutdowns are executed locally.
	ForwardShutdownHost string `yaml:"forward_shutdown_host"`
	// ForwardShutdownUser, when non-empty, is prefixed to the host as user@host.
	ForwardShutdownUser string `yaml:"forward_shutdown_user"`
	// ForwardShutdownConfig is the --config path passed to the remote hecate
	// binary. Defaults to /etc/hecate/hecate.yaml.
	ForwardShutdownConfig string `yaml:"forward_shutdown_config"`
	// FallbackLocalShutdown selects what happens when forwarding fails: true
	// runs the shutdown locally, false returns the forwarding error.
	FallbackLocalShutdown bool `yaml:"fallback_local_shutdown"`
	// CommandTimeoutSeconds bounds the ssh forwarding command. Values <= 0 are
	// replaced with 25.
	CommandTimeoutSeconds int `yaml:"command_timeout_seconds"`
}
// Metrics configures the HTTP endpoint the daemon uses to expose its metrics.
type Metrics struct {
	// Enabled controls whether the daemon starts the metrics HTTP server.
	Enabled bool `yaml:"enabled"`
	// BindAddr is the listen address. Defaults to "0.0.0.0:9560".
	BindAddr string `yaml:"bind_addr"`
	// Path is the URL path serving the metrics. Defaults to "/metrics".
	Path string `yaml:"path"`
}
type State struct { type State struct {
Dir string `yaml:"dir"` Dir string `yaml:"dir"`
RunHistoryPath string `yaml:"run_history_path"` RunHistoryPath string `yaml:"run_history_path"`
@ -78,8 +104,18 @@ func (c Config) Validate() error {
if c.UPS.Provider == "" { if c.UPS.Provider == "" {
return fmt.Errorf("config.ups.provider must not be empty when ups is enabled") return fmt.Errorf("config.ups.provider must not be empty when ups is enabled")
} }
if c.UPS.Target == "" { if c.UPS.Target == "" && len(c.UPS.Targets) == 0 {
return fmt.Errorf("config.ups.target must not be empty when ups is enabled") return fmt.Errorf("config.ups.target or config.ups.targets must be set when ups is enabled")
}
for _, t := range c.UPS.Targets {
if t.Target == "" {
return fmt.Errorf("config.ups.targets[].target must not be empty")
}
}
}
if c.Coordination.ForwardShutdownHost != "" {
if c.Coordination.ForwardShutdownConfig == "" {
return fmt.Errorf("config.coordination.forward_shutdown_config must not be empty when forward_shutdown_host is set")
} }
} }
if c.State.RunHistoryPath == "" || c.State.LockPath == "" { if c.State.RunHistoryPath == "" || c.State.LockPath == "" {
@ -119,6 +155,9 @@ func defaults() Config {
}, },
Shutdown: Shutdown{ Shutdown: Shutdown{
DefaultBudgetSeconds: 300, DefaultBudgetSeconds: 300,
PoweroffEnabled: true,
PoweroffDelaySeconds: 25,
PoweroffLocalHost: true,
}, },
UPS: UPS{ UPS: UPS{
Enabled: true, Enabled: true,
@ -128,6 +167,16 @@ func defaults() Config {
DebounceCount: 3, DebounceCount: 3,
TelemetryTimeoutSeconds: 90, TelemetryTimeoutSeconds: 90,
}, },
Coordination: Coordination{
ForwardShutdownConfig: "/etc/hecate/hecate.yaml",
FallbackLocalShutdown: true,
CommandTimeoutSeconds: 25,
},
Metrics: Metrics{
Enabled: true,
BindAddr: "0.0.0.0:9560",
Path: "/metrics",
},
State: State{ State: State{
Dir: "/var/lib/hecate", Dir: "/var/lib/hecate",
RunHistoryPath: "/var/lib/hecate/runs.json", RunHistoryPath: "/var/lib/hecate/runs.json",
@ -148,6 +197,9 @@ func (c *Config) applyDefaults() {
if c.Shutdown.DefaultBudgetSeconds <= 0 { if c.Shutdown.DefaultBudgetSeconds <= 0 {
c.Shutdown.DefaultBudgetSeconds = 300 c.Shutdown.DefaultBudgetSeconds = 300
} }
if c.Shutdown.PoweroffDelaySeconds <= 0 {
c.Shutdown.PoweroffDelaySeconds = 25
}
if c.UPS.PollSeconds <= 0 { if c.UPS.PollSeconds <= 0 {
c.UPS.PollSeconds = 5 c.UPS.PollSeconds = 5
} }
@ -160,6 +212,18 @@ func (c *Config) applyDefaults() {
if c.UPS.TelemetryTimeoutSeconds <= 0 { if c.UPS.TelemetryTimeoutSeconds <= 0 {
c.UPS.TelemetryTimeoutSeconds = 90 c.UPS.TelemetryTimeoutSeconds = 90
} }
if c.Coordination.ForwardShutdownConfig == "" {
c.Coordination.ForwardShutdownConfig = "/etc/hecate/hecate.yaml"
}
if c.Coordination.CommandTimeoutSeconds <= 0 {
c.Coordination.CommandTimeoutSeconds = 25
}
if c.Metrics.BindAddr == "" {
c.Metrics.BindAddr = "0.0.0.0:9560"
}
if c.Metrics.Path == "" {
c.Metrics.Path = "/metrics"
}
if c.State.Dir == "" { if c.State.Dir == "" {
c.State.Dir = "/var/lib/hecate" c.State.Dir = "/var/lib/hecate"
} }

View File

@ -0,0 +1,49 @@
package config
import (
"os"
"path/filepath"
"strings"
"testing"
)
// TestLoadAcceptsUPSTargets writes a config that sets only ups.targets (no
// ups.target) to a temporary file and checks that Load accepts it and exposes
// the single configured target.
func TestLoadAcceptsUPSTargets(t *testing.T) {
	tmp := t.TempDir()
	cfgPath := filepath.Join(tmp, "hecate.yaml")
	raw := `
control_planes: [titan-0a, titan-0b, titan-0c]
expected_flux_branch: main
iac_repo_path: /opt/titan-iac
ups:
enabled: true
provider: nut
targets:
- name: db
target: atlasups@localhost
shutdown:
default_budget_seconds: 300
state:
run_history_path: /tmp/runs.json
lock_path: /tmp/hecate.lock
`
	// The leading/trailing newlines of the raw literal are stripped before writing.
	if err := os.WriteFile(cfgPath, []byte(strings.TrimSpace(raw)), 0o644); err != nil {
		t.Fatalf("write config: %v", err)
	}
	cfg, err := Load(cfgPath)
	if err != nil {
		t.Fatalf("load config: %v", err)
	}
	if len(cfg.UPS.Targets) != 1 || cfg.UPS.Targets[0].Target != "atlasups@localhost" {
		t.Fatalf("unexpected UPS targets: %#v", cfg.UPS.Targets)
	}
}
// TestValidateForwardShutdownRequiresConfigPath checks that Validate rejects a
// configuration that names a forwarding host but leaves the remote config
// path empty.
func TestValidateForwardShutdownRequiresConfigPath(t *testing.T) {
	cfg := defaults()
	cfg.Coordination.ForwardShutdownHost = "titan-db"
	cfg.Coordination.ForwardShutdownConfig = ""

	err := cfg.Validate()
	if err != nil {
		return
	}
	t.Fatalf("expected validation error for missing forward_shutdown_config")
}

View File

@ -0,0 +1,150 @@
package metrics
import (
"fmt"
"net/http"
"sort"
"strings"
"sync"
"time"
)
// Sample is the most recent reading recorded for one UPS source. The exporter
// stores one Sample per Name and renders it as a set of gauges.
type Sample struct {
	// Name keys the sample in the exporter; Target is the UPS address.
	Name, Target string
	// OnBattery and LowBattery mirror the flags of the UPS reading.
	OnBattery, LowBattery bool
	// RuntimeSecond is the reported remaining runtime and ThresholdSec the
	// runtime limit used for the shutdown decision, both in seconds.
	RuntimeSecond, ThresholdSec int
	// Trigger reports whether this reading met the shutdown condition.
	Trigger bool
	// BreachCount is the current debounce counter for this source.
	BreachCount int
	// Status is the raw UPS status string; a non-empty LastError marks the
	// sample as failed.
	Status, LastError string
	// UpdatedAt is when the sample was taken; UpdateSample fills it in when zero.
	UpdatedAt time.Time
}
// Exporter collects shutdown and UPS state and renders it in the Prometheus
// text format. All fields below mu are guarded by it.
type Exporter struct {
	mu sync.RWMutex

	// shutdownBudgetSec is the last budget passed to UpdateBudget;
	// shutdownTriggers counts MarkShutdown calls.
	shutdownBudgetSec, shutdownTriggers int
	// lastShutdownReason and lastShutdownAt describe the latest MarkShutdown call.
	lastShutdownReason string
	lastShutdownAt     time.Time
	// samples holds the latest Sample per source, keyed by Sample.Name.
	samples map[string]Sample
}
// New returns an Exporter with an empty, ready-to-use sample table.
func New() *Exporter {
	e := new(Exporter)
	e.samples = map[string]Sample{}
	return e
}
// UpdateBudget records the current estimated shutdown budget, in seconds.
func (e *Exporter) UpdateBudget(seconds int) {
	e.mu.Lock()
	e.shutdownBudgetSec = seconds
	e.mu.Unlock()
}
// UpdateSample stores s as the latest sample for the source named s.Name,
// replacing any previous one. A zero UpdatedAt is replaced with the current
// UTC time.
func (e *Exporter) UpdateSample(s Sample) {
	if s.UpdatedAt.IsZero() {
		s.UpdatedAt = time.Now().UTC()
	}

	e.mu.Lock()
	e.samples[s.Name] = s
	e.mu.Unlock()
}
// MarkShutdown counts one shutdown trigger and remembers its reason together
// with the current UTC time.
func (e *Exporter) MarkShutdown(reason string) {
	now := time.Now().UTC()

	e.mu.Lock()
	e.shutdownTriggers++
	e.lastShutdownReason, e.lastShutdownAt = reason, now
	e.mu.Unlock()
}
// Handler returns an http.Handler that serves the metrics at path and a plain
// "ok" liveness response at /healthz.
//
// An empty path defaults to "/metrics" and a path without a leading slash gets
// one, since the mux needs a rooted path pattern. If path is "/healthz" the
// metrics handler owns that route and the liveness handler is not registered,
// because registering the same pattern twice makes ServeMux panic.
func (e *Exporter) Handler(path string) http.Handler {
	metricsPath := path
	if metricsPath == "" {
		metricsPath = "/metrics"
	}
	if !strings.HasPrefix(metricsPath, "/") {
		metricsPath = "/" + metricsPath
	}

	mux := http.NewServeMux()
	mux.HandleFunc(metricsPath, e.serveMetrics)
	if metricsPath != "/healthz" {
		mux.HandleFunc("/healthz", func(w http.ResponseWriter, _ *http.Request) {
			w.WriteHeader(http.StatusOK)
			// Best-effort write: nothing useful can be done if the client is gone.
			_, _ = w.Write([]byte("ok\n"))
		})
	}
	return mux
}
// serveMetrics writes the exporter state in the Prometheus text exposition
// format (version 0.0.4): three instance-wide shutdown metrics followed by one
// set of gauges per UPS sample, ordered by source name.
//
// The read lock is held while the response body is built so that the output
// is a consistent snapshot.
func (e *Exporter) serveMetrics(w http.ResponseWriter, _ *http.Request) {
	e.mu.RLock()
	defer e.mu.RUnlock()

	w.Header().Set("Content-Type", "text/plain; version=0.0.4; charset=utf-8")

	var b strings.Builder
	b.WriteString("# HELP hecate_shutdown_budget_seconds Estimated graceful shutdown budget in seconds (p95).\n")
	b.WriteString("# TYPE hecate_shutdown_budget_seconds gauge\n")
	fmt.Fprintf(&b, "hecate_shutdown_budget_seconds %d\n", e.shutdownBudgetSec)
	b.WriteString("# HELP hecate_shutdown_triggers_total Total number of shutdown triggers issued by this instance.\n")
	b.WriteString("# TYPE hecate_shutdown_triggers_total counter\n")
	fmt.Fprintf(&b, "hecate_shutdown_triggers_total %d\n", e.shutdownTriggers)
	b.WriteString("# HELP hecate_shutdown_last_trigger_timestamp_seconds Unix timestamp of last shutdown trigger.\n")
	b.WriteString("# TYPE hecate_shutdown_last_trigger_timestamp_seconds gauge\n")
	// A zero time would render as a large negative Unix value; report 0 instead.
	if e.lastShutdownAt.IsZero() {
		b.WriteString("hecate_shutdown_last_trigger_timestamp_seconds 0\n")
	} else {
		fmt.Fprintf(&b, "hecate_shutdown_last_trigger_timestamp_seconds %d\n", e.lastShutdownAt.Unix())
	}

	b.WriteString("# HELP hecate_ups_on_battery Whether a UPS source is currently on battery.\n")
	b.WriteString("# TYPE hecate_ups_on_battery gauge\n")
	b.WriteString("# HELP hecate_ups_low_battery Whether a UPS source currently reports low battery.\n")
	b.WriteString("# TYPE hecate_ups_low_battery gauge\n")
	b.WriteString("# HELP hecate_ups_runtime_seconds Battery runtime remaining reported by UPS.\n")
	b.WriteString("# TYPE hecate_ups_runtime_seconds gauge\n")
	b.WriteString("# HELP hecate_ups_threshold_seconds Red-line threshold for runtime-based shutdown.\n")
	b.WriteString("# TYPE hecate_ups_threshold_seconds gauge\n")
	b.WriteString("# HELP hecate_ups_trigger_active Whether this UPS source currently breaches shutdown trigger conditions.\n")
	b.WriteString("# TYPE hecate_ups_trigger_active gauge\n")
	b.WriteString("# HELP hecate_ups_breach_count Current debounce breach count for this UPS source.\n")
	b.WriteString("# TYPE hecate_ups_breach_count gauge\n")
	b.WriteString("# HELP hecate_ups_last_sample_timestamp_seconds Unix timestamp of most recent sample.\n")
	b.WriteString("# TYPE hecate_ups_last_sample_timestamp_seconds gauge\n")
	b.WriteString("# HELP hecate_ups_error Whether the last sample had an error.\n")
	b.WriteString("# TYPE hecate_ups_error gauge\n")

	// Label values are escaped exactly once, using the exposition-format rules
	// (backslash, double quote, line feed). Formatting the already-escaped
	// value with %q would escape it a second time and could emit Go-only
	// escape sequences, so the quotes are written literally with %s.
	esc := func(v string) string {
		return strings.ReplaceAll(safe(v), "\n", `\n`)
	}

	// Map iteration order is random; sort for stable output.
	names := make([]string, 0, len(e.samples))
	for name := range e.samples {
		names = append(names, name)
	}
	sort.Strings(names)

	for _, name := range names {
		s := e.samples[name]
		// NOTE(review): status and last_reason are part of the label set, so a
		// change in either starts a new time series for every gauge below —
		// confirm this is what the dashboards expect.
		labels := fmt.Sprintf(`{source="%s",target="%s",status="%s",last_reason="%s"}`,
			esc(name), esc(s.Target), esc(s.Status), esc(e.lastShutdownReason))
		fmt.Fprintf(&b, "hecate_ups_on_battery%s %d\n", labels, boolNum(s.OnBattery))
		fmt.Fprintf(&b, "hecate_ups_low_battery%s %d\n", labels, boolNum(s.LowBattery))
		fmt.Fprintf(&b, "hecate_ups_runtime_seconds%s %d\n", labels, s.RuntimeSecond)
		fmt.Fprintf(&b, "hecate_ups_threshold_seconds%s %d\n", labels, s.ThresholdSec)
		fmt.Fprintf(&b, "hecate_ups_trigger_active%s %d\n", labels, boolNum(s.Trigger))
		fmt.Fprintf(&b, "hecate_ups_breach_count%s %d\n", labels, s.BreachCount)
		if s.UpdatedAt.IsZero() {
			fmt.Fprintf(&b, "hecate_ups_last_sample_timestamp_seconds%s 0\n", labels)
		} else {
			fmt.Fprintf(&b, "hecate_ups_last_sample_timestamp_seconds%s %d\n", labels, s.UpdatedAt.Unix())
		}
		fmt.Fprintf(&b, "hecate_ups_error%s %d\n", labels, boolNum(s.LastError != ""))
	}

	// Best-effort write: a failed write means the scraper disconnected.
	_, _ = w.Write([]byte(b.String()))
}
// boolNum maps a boolean onto the 0/1 value used for gauge output.
func boolNum(v bool) int {
	n := 0
	if v {
		n = 1
	}
	return n
}
// safe backslash-escapes every backslash and double quote in in and leaves
// all other characters untouched.
func safe(in string) string {
	// A Replacer works in a single pass, so the backslashes it inserts in
	// front of quotes are never escaped again.
	escaper := strings.NewReplacer(`\`, `\\`, `"`, `\"`)
	return escaper.Replace(in)
}

View File

@ -0,0 +1,44 @@
package metrics
import (
"net/http/httptest"
"strings"
"testing"
"time"
)
// TestExporterEmitsCoreMetrics feeds one budget, one UPS sample and one
// shutdown mark into a fresh exporter and checks that the scraped output
// contains the shutdown metrics and the per-source gauge prefixes.
func TestExporterEmitsCoreMetrics(t *testing.T) {
	exp := New()
	exp.UpdateBudget(321)
	sample := Sample{
		Name:          "db-ups",
		Target:        "atlasups@localhost",
		OnBattery:     true,
		LowBattery:    false,
		RuntimeSecond: 412,
		ThresholdSec:  354,
		Trigger:       true,
		BreachCount:   2,
		Status:        "OB",
		UpdatedAt:     time.Unix(1710000000, 0).UTC(),
	}
	exp.UpdateSample(sample)
	exp.MarkShutdown("ups-threshold")

	rec := httptest.NewRecorder()
	exp.Handler("/metrics").ServeHTTP(rec, httptest.NewRequest("GET", "/metrics", nil))
	body := rec.Body.String()

	for _, want := range []string{
		"hecate_shutdown_budget_seconds 321",
		"hecate_shutdown_triggers_total 1",
		"hecate_ups_on_battery{source=\"db-ups\",target=\"atlasups@localhost\"",
		"hecate_ups_runtime_seconds{source=\"db-ups\",target=\"atlasups@localhost\"",
		"hecate_ups_threshold_seconds{source=\"db-ups\",target=\"atlasups@localhost\"",
	} {
		if strings.Contains(body, want) {
			continue
		}
		t.Fatalf("missing metric fragment %q in output:\n%s", want, body)
	}
}

View File

@ -5,28 +5,48 @@ import (
"fmt" "fmt"
"log" "log"
"math" "math"
"net/http"
"os/exec"
"strings"
"time" "time"
"scm.bstein.dev/bstein/hecate/internal/cluster" "scm.bstein.dev/bstein/hecate/internal/cluster"
"scm.bstein.dev/bstein/hecate/internal/config" "scm.bstein.dev/bstein/hecate/internal/config"
"scm.bstein.dev/bstein/hecate/internal/metrics"
"scm.bstein.dev/bstein/hecate/internal/ups" "scm.bstein.dev/bstein/hecate/internal/ups"
) )
// Target is one UPS source polled by the daemon.
type Target struct {
	// Name identifies the source in logs, metrics and per-target daemon state.
	Name string
	// Target is the UPS address string, used for logging and metric labels.
	Target string
	// Provider reads the current sample for this UPS.
	Provider ups.Provider
}
type Daemon struct { type Daemon struct {
cfg config.Config cfg config.Config
orch *cluster.Orchestrator orch *cluster.Orchestrator
ups ups.Provider targets []Target
log *log.Logger log *log.Logger
exporter *metrics.Exporter
} }
func NewDaemon(cfg config.Config, orch *cluster.Orchestrator, provider ups.Provider, logger *log.Logger) *Daemon { func NewDaemon(cfg config.Config, orch *cluster.Orchestrator, targets []Target, logger *log.Logger) *Daemon {
return &Daemon{cfg: cfg, orch: orch, ups: provider, log: logger} return &Daemon{
cfg: cfg,
orch: orch,
targets: targets,
log: logger,
exporter: metrics.New(),
}
} }
func (d *Daemon) Run(ctx context.Context) error { func (d *Daemon) Run(ctx context.Context) error {
if !d.cfg.UPS.Enabled { if !d.cfg.UPS.Enabled {
return fmt.Errorf("ups monitoring is disabled in config") return fmt.Errorf("ups monitoring is disabled in config")
} }
if len(d.targets) == 0 {
return fmt.Errorf("no UPS targets configured")
}
poll := time.Duration(d.cfg.UPS.PollSeconds) * time.Second poll := time.Duration(d.cfg.UPS.PollSeconds) * time.Second
if poll <= 0 { if poll <= 0 {
@ -41,48 +61,85 @@ func (d *Daemon) Run(ctx context.Context) error {
debounce = 3 debounce = 3
} }
lastGood := time.Now() if d.cfg.Metrics.Enabled {
lastOnBattery := false if err := d.startMetricsServer(); err != nil {
breachCount := 0 return err
}
}
t := time.NewTicker(poll) t := time.NewTicker(poll)
defer t.Stop() defer t.Stop()
d.log.Printf("hecate daemon started: poll=%s debounce=%d telemetry_timeout=%s", poll, debounce, telemetryTimeout) lastGood := map[string]time.Time{}
lastOnBattery := map[string]bool{}
breachCount := map[string]int{}
for _, t := range d.targets {
lastGood[t.Name] = time.Now()
}
d.log.Printf("hecate daemon started: poll=%s debounce=%d telemetry_timeout=%s targets=%s",
poll, debounce, telemetryTimeout, d.targetList())
for { for {
select { select {
case <-ctx.Done(): case <-ctx.Done():
return ctx.Err() return ctx.Err()
case <-t.C: case <-t.C:
sample, err := d.ups.Read(ctx) budget := d.orch.EstimatedShutdownSeconds()
threshold := int(math.Ceil(float64(budget) * d.cfg.UPS.RuntimeSafetyFactor))
d.exporter.UpdateBudget(budget)
for _, target := range d.targets {
sample, err := target.Provider.Read(ctx)
if err != nil { if err != nil {
d.log.Printf("warning: ups read failed: %v", err) d.log.Printf("warning: ups read failed target=%s (%s): %v", target.Name, target.Target, err)
if lastOnBattery && time.Since(lastGood) > telemetryTimeout { d.exporter.UpdateSample(metrics.Sample{
d.log.Printf("ups telemetry timeout while on battery, triggering shutdown") Name: target.Name,
return d.triggerShutdown(ctx, "ups-telemetry-timeout") Target: target.Target,
ThresholdSec: threshold,
BreachCount: breachCount[target.Name],
LastError: err.Error(),
UpdatedAt: time.Now().UTC(),
})
if lastOnBattery[target.Name] && time.Since(lastGood[target.Name]) > telemetryTimeout {
d.log.Printf("ups telemetry timeout while on battery (target=%s), triggering shutdown", target.Name)
reason := fmt.Sprintf("ups-telemetry-timeout target=%s", target.Name)
return d.triggerShutdown(ctx, reason)
} }
continue continue
} }
lastGood = time.Now() lastGood[target.Name] = time.Now()
lastOnBattery = sample.OnBattery lastOnBattery[target.Name] = sample.OnBattery
budget := d.orch.EstimatedShutdownSeconds()
threshold := int(math.Ceil(float64(budget) * d.cfg.UPS.RuntimeSafetyFactor))
trigger := sample.LowBattery || (sample.OnBattery && sample.RuntimeSeconds > 0 && sample.RuntimeSeconds <= threshold) trigger := sample.LowBattery || (sample.OnBattery && sample.RuntimeSeconds > 0 && sample.RuntimeSeconds <= threshold)
d.log.Printf("ups status=%s on_battery=%t runtime_s=%d threshold_s=%d budget_s=%d trigger=%t",
sample.RawStatus, sample.OnBattery, sample.RuntimeSeconds, threshold, budget, trigger)
if trigger { if trigger {
breachCount++ breachCount[target.Name]++
if breachCount >= debounce { } else {
reason := fmt.Sprintf("ups-threshold runtime=%ds threshold=%ds status=%s", sample.RuntimeSeconds, threshold, sample.RawStatus) breachCount[target.Name] = 0
}
d.log.Printf("ups target=%s status=%s on_battery=%t runtime_s=%d threshold_s=%d budget_s=%d trigger=%t breach=%d",
target.Name, sample.RawStatus, sample.OnBattery, sample.RuntimeSeconds, threshold, budget, trigger, breachCount[target.Name])
d.exporter.UpdateSample(metrics.Sample{
Name: target.Name,
Target: target.Target,
OnBattery: sample.OnBattery,
LowBattery: sample.LowBattery,
RuntimeSecond: sample.RuntimeSeconds,
ThresholdSec: threshold,
Trigger: trigger,
BreachCount: breachCount[target.Name],
Status: sample.RawStatus,
UpdatedAt: time.Now().UTC(),
})
if breachCount[target.Name] >= debounce {
reason := fmt.Sprintf("ups-threshold target=%s runtime=%ds threshold=%ds status=%s", target.Name, sample.RuntimeSeconds, threshold, sample.RawStatus)
return d.triggerShutdown(ctx, reason) return d.triggerShutdown(ctx, reason)
} }
} else {
breachCount = 0
} }
} }
} }
@ -90,5 +147,72 @@ func (d *Daemon) Run(ctx context.Context) error {
func (d *Daemon) triggerShutdown(ctx context.Context, reason string) error { func (d *Daemon) triggerShutdown(ctx context.Context, reason string) error {
d.log.Printf("triggering shutdown: %s", reason) d.log.Printf("triggering shutdown: %s", reason)
d.exporter.MarkShutdown(reason)
if d.cfg.Coordination.ForwardShutdownHost != "" {
if err := d.forwardShutdown(ctx, reason); err == nil {
d.log.Printf("shutdown trigger forwarded to %s", d.cfg.Coordination.ForwardShutdownHost)
return nil
} else if !d.cfg.Coordination.FallbackLocalShutdown {
return fmt.Errorf("forward shutdown failed and local fallback disabled: %w", err)
} else {
d.log.Printf("warning: forward shutdown failed; falling back to local shutdown: %v", err)
}
}
return d.orch.Shutdown(ctx, cluster.ShutdownOptions{Reason: reason}) return d.orch.Shutdown(ctx, cluster.ShutdownOptions{Reason: reason})
} }
// forwardShutdown asks the configured coordinator host to run the shutdown by
// invoking the hecate CLI there over ssh.
//
// The ssh destination is Coordination.ForwardShutdownHost, prefixed with
// "<ForwardShutdownUser>@" when a user is configured. The whole ssh invocation
// is bounded by Coordination.CommandTimeoutSeconds, defaulting to 25 seconds
// when that value is zero or negative.
//
// It returns nil when ssh exits successfully. Otherwise the returned error
// wraps the ssh failure and, when ssh produced any output, appends the trimmed
// combined stdout/stderr.
func (d *Daemon) forwardShutdown(ctx context.Context, reason string) error {
	userHost := d.cfg.Coordination.ForwardShutdownHost
	if d.cfg.Coordination.ForwardShutdownUser != "" {
		userHost = d.cfg.Coordination.ForwardShutdownUser + "@" + userHost
	}

	timeout := time.Duration(d.cfg.Coordination.CommandTimeoutSeconds) * time.Second
	if timeout <= 0 {
		timeout = 25 * time.Second
	}
	runCtx, cancel := context.WithTimeout(ctx, timeout)
	defer cancel()

	// The command string is interpreted by a shell on the remote side, so the
	// arguments must be quoted for the shell rather than for Go. Go's %q emits
	// a double-quoted Go literal, inside which a shell still expands $,
	// backticks and backslashes. Single quotes suppress every expansion; an
	// embedded single quote is written as '\'' (close, escaped quote, reopen).
	// NOTE(review): assumes the remote login shell is POSIX-compatible.
	shellQuote := func(s string) string {
		return "'" + strings.ReplaceAll(s, "'", `'\''`) + "'"
	}
	remoteCmd := fmt.Sprintf(
		"sudo /usr/local/bin/hecate shutdown --config %s --execute --reason %s",
		shellQuote(d.cfg.Coordination.ForwardShutdownConfig),
		shellQuote(reason),
	)

	// BatchMode=yes makes ssh fail instead of prompting for credentials.
	cmd := exec.CommandContext(runCtx, "ssh", "-o", "BatchMode=yes", "-o", "ConnectTimeout=8", userHost, remoteCmd)
	out, err := cmd.CombinedOutput()
	if err != nil {
		trimmed := strings.TrimSpace(string(out))
		if trimmed == "" {
			return fmt.Errorf("forward shutdown via ssh failed: %w", err)
		}
		return fmt.Errorf("forward shutdown via ssh failed: %w: %s", err, trimmed)
	}
	return nil
}
// targetList renders the daemon's UPS targets as a comma-separated list of
// "name=target" pairs, in iteration order. It returns the empty string when
// there are no targets.
func (d *Daemon) targetList() string {
	var b strings.Builder
	for _, target := range d.targets {
		// Every entry contains at least "=", so a non-empty builder means
		// one or more entries have already been written.
		if b.Len() > 0 {
			b.WriteByte(',')
		}
		b.WriteString(target.Name)
		b.WriteByte('=')
		b.WriteString(target.Target)
	}
	return b.String()
}
// startMetricsServer launches the HTTP metrics endpoint in a background
// goroutine, serving the exporter's handler on Metrics.BindAddr.
//
// It returns an error only when Metrics.BindAddr is empty. Failures from
// ListenAndServe happen after this function has returned and are logged as
// warnings rather than reported to the caller.
func (d *Daemon) startMetricsServer() error {
	addr := d.cfg.Metrics.BindAddr
	if addr == "" {
		return fmt.Errorf("metrics.bind_addr must not be empty when metrics are enabled")
	}
	path := d.cfg.Metrics.Path

	server := &http.Server{
		Addr:              addr,
		Handler:           d.exporter.Handler(path),
		ReadHeaderTimeout: 5 * time.Second,
	}

	serve := func() {
		d.log.Printf("metrics server listening on %s%s", addr, path)
		serveErr := server.ListenAndServe()
		// http.ErrServerClosed is not treated as a failure.
		if serveErr == nil || serveErr == http.ErrServerClosed {
			return
		}
		d.log.Printf("warning: metrics server failed: %v", serveErr)
	}
	go serve()
	return nil
}

View File

@ -0,0 +1,26 @@
#!/usr/bin/env bash
# hecate-self-update.sh
#
# Syncs a local checkout of the hecate repository to the tip of a remote
# branch and then runs the repository's installer from that checkout.
#
# Environment overrides:
#   HECATE_REPO_URL     clone URL      (default https://scm.bstein.dev/bstein/hecate.git)
#   HECATE_REPO_BRANCH  branch to sync (default main)
#   HECATE_REPO_DIR     checkout path  (default /opt/hecate)
#
# Must run as root; exits 1 otherwise.
set -euo pipefail

if [[ "${EUID}" -ne 0 ]]; then
  echo "hecate-self-update.sh must run as root" >&2
  exit 1
fi

REPO_URL="${HECATE_REPO_URL:-https://scm.bstein.dev/bstein/hecate.git}"
BRANCH="${HECATE_REPO_BRANCH:-main}"
REPO_DIR="${HECATE_REPO_DIR:-/opt/hecate}"

mkdir -p "$(dirname "${REPO_DIR}")"

# Clone only on first run; later runs reuse the existing checkout.
if [[ ! -d "${REPO_DIR}/.git" ]]; then
  echo "[self-update] cloning ${REPO_URL} into ${REPO_DIR}"
  git clone "${REPO_URL}" "${REPO_DIR}"
fi

cd "${REPO_DIR}"
echo "[self-update] syncing ${BRANCH}"
git fetch origin --prune

# Create or reset the local branch to the fetched remote tip and discard local
# modifications in one step. A plain `git checkout` followed by
# `git reset --hard` can abort at the checkout when the working tree is dirty,
# so the reset that would have cleaned it up is never reached.
git checkout --force -B "${BRANCH}" "origin/${BRANCH}"

echo "[self-update] running installer"
"${REPO_DIR}/scripts/install.sh"

View File

@ -11,7 +11,10 @@ BIN_DIR="/usr/local/bin"
CONF_DIR="/etc/hecate" CONF_DIR="/etc/hecate"
STATE_DIR="/var/lib/hecate" STATE_DIR="/var/lib/hecate"
SYSTEMD_DIR="/etc/systemd/system" SYSTEMD_DIR="/etc/systemd/system"
LIB_DIR="/usr/local/lib/hecate"
START_NOW=1 START_NOW=1
INSTALL_DEPS=1
ENABLE_BOOTSTRAP="${HECATE_ENABLE_BOOTSTRAP:-0}"
while [[ $# -gt 0 ]]; do while [[ $# -gt 0 ]]; do
case "$1" in case "$1" in
@ -19,6 +22,10 @@ while [[ $# -gt 0 ]]; do
START_NOW=0 START_NOW=0
shift shift
;; ;;
--skip-deps)
INSTALL_DEPS=0
shift
;;
*) *)
echo "Unknown argument: $1" >&2 echo "Unknown argument: $1" >&2
exit 1 exit 1
@ -26,6 +33,59 @@ while [[ $# -gt 0 ]]; do
esac esac
done done
# ensure_apt_packages PKG...
#
# Installs every named apt package that is not currently installed. When all
# of them are already installed, returns 0 without running apt-get at all.
# Otherwise runs `apt-get update` followed by a single non-interactive
# `apt-get install` for the missing packages only.
ensure_apt_packages() {
  local pkg status
  local missing=()
  for pkg in "$@"; do
    # `dpkg -s` also exits 0 for packages that were removed but still have
    # config files on disk, so check the Status field itself. Only a state that
    # ends in " installed" counts; "half-installed" and "not-installed" are
    # preceded by "-" and so do not match.
    status="$(dpkg-query -W -f='${Status}\n' "${pkg}" 2>/dev/null || true)"
    if ! grep -q ' installed$' <<<"${status}"; then
      missing+=("${pkg}")
    fi
  done

  if [[ ${#missing[@]} -eq 0 ]]; then
    return 0
  fi

  echo "[install] apt install: ${missing[*]}"
  # Exported so apt-get (and maintainer scripts it runs) never prompt.
  export DEBIAN_FRONTEND=noninteractive
  apt-get update -y
  apt-get install -y "${missing[@]}"
}
# install_kubectl_if_missing
#
# Makes sure a `kubectl` command is available. Order of attempts:
#   1. Use whatever kubectl is already on PATH.
#   2. Try the distro's `kubernetes-client` apt package (failure is tolerated).
#   3. Download the latest stable upstream binary for this CPU architecture
#      into /usr/local/bin/kubectl.
#
# Returns 1 on an unsupported architecture, when the stable version cannot be
# determined, when a download fails, or when the downloaded binary does not
# match its published SHA-256 checksum.
install_kubectl_if_missing() {
  if command -v kubectl >/dev/null 2>&1; then
    return 0
  fi

  # Best effort: not every apt source ships this package, so fall through to
  # the upstream download when it is unavailable.
  ensure_apt_packages kubernetes-client || true
  if command -v kubectl >/dev/null 2>&1; then
    return 0
  fi

  echo "[install] installing kubectl via upstream binary"
  local arch
  arch="$(uname -m)"
  case "${arch}" in
    x86_64) arch="amd64" ;;
    aarch64|arm64) arch="arm64" ;;
    *) echo "Unsupported arch for kubectl install: ${arch}" >&2; return 1 ;;
  esac

  local version
  if ! version="$(curl -fsSL https://dl.k8s.io/release/stable.txt)" || [[ -z "${version}" ]]; then
    echo "Unable to determine the latest stable kubectl version" >&2
    return 1
  fi

  # Download into a scratch directory so a failed or corrupt transfer never
  # leaves a partial file at the final path.
  local base_url="https://dl.k8s.io/release/${version}/bin/linux/${arch}"
  local workdir
  workdir="$(mktemp -d)"
  if ! curl -fsSL -o "${workdir}/kubectl" "${base_url}/kubectl" \
    || ! curl -fsSL -o "${workdir}/kubectl.sha256" "${base_url}/kubectl.sha256"; then
    rm -rf "${workdir}"
    echo "Failed to download kubectl ${version} for ${arch}" >&2
    return 1
  fi

  # The .sha256 file holds only the hex digest; build the "<digest>  <path>"
  # line that sha256sum --check expects.
  if ! echo "$(cat "${workdir}/kubectl.sha256")  ${workdir}/kubectl" | sha256sum --check --status; then
    rm -rf "${workdir}"
    echo "kubectl checksum verification failed for ${version} (${arch})" >&2
    return 1
  fi

  install -m 0755 "${workdir}/kubectl" /usr/local/bin/kubectl
  rm -rf "${workdir}"
}
# ensure_dependencies
#
# Installs the host packages and the kubectl binary the installer relies on.
# Does nothing when INSTALL_DEPS is 0 (set by --skip-deps). Exits the script
# with status 1 on hosts that have no apt-get.
ensure_dependencies() {
  local -a required=(
    ca-certificates
    curl
    git
    openssh-client
    jq
    nut-client
    nut-server
    nut-monitor
    golang-go
  )

  if [[ "${INSTALL_DEPS}" -eq 0 ]]; then
    echo "[install] skipping dependency installation"
    return 0
  fi

  command -v apt-get >/dev/null 2>&1 || {
    echo "This installer currently supports apt-based hosts only." >&2
    exit 1
  }

  ensure_apt_packages "${required[@]}"
  install_kubectl_if_missing
}
ensure_dependencies
echo "[install] building hecate" echo "[install] building hecate"
cd "${REPO_DIR}" cd "${REPO_DIR}"
mkdir -p dist mkdir -p dist
@ -38,6 +98,7 @@ install -m 0755 dist/hecate "${BIN_DIR}/hecate"
echo "[install] installing config + state dirs" echo "[install] installing config + state dirs"
install -d -m 0750 "${CONF_DIR}" install -d -m 0750 "${CONF_DIR}"
install -d -m 0750 "${STATE_DIR}" install -d -m 0750 "${STATE_DIR}"
install -d -m 0755 "${LIB_DIR}"
if [[ ! -f "${CONF_DIR}/hecate.yaml" ]]; then if [[ ! -f "${CONF_DIR}/hecate.yaml" ]]; then
install -m 0640 configs/hecate.example.yaml "${CONF_DIR}/hecate.yaml" install -m 0640 configs/hecate.example.yaml "${CONF_DIR}/hecate.yaml"
echo "[install] wrote default config to ${CONF_DIR}/hecate.yaml" echo "[install] wrote default config to ${CONF_DIR}/hecate.yaml"
@ -48,12 +109,21 @@ fi
echo "[install] installing systemd units" echo "[install] installing systemd units"
install -m 0644 deploy/systemd/hecate.service "${SYSTEMD_DIR}/hecate.service" install -m 0644 deploy/systemd/hecate.service "${SYSTEMD_DIR}/hecate.service"
install -m 0644 deploy/systemd/hecate-bootstrap.service "${SYSTEMD_DIR}/hecate-bootstrap.service" install -m 0644 deploy/systemd/hecate-bootstrap.service "${SYSTEMD_DIR}/hecate-bootstrap.service"
install -m 0644 deploy/systemd/hecate-update.service "${SYSTEMD_DIR}/hecate-update.service"
install -m 0644 deploy/systemd/hecate-update.timer "${SYSTEMD_DIR}/hecate-update.timer"
install -m 0755 scripts/hecate-self-update.sh "${LIB_DIR}/hecate-self-update.sh"
systemctl daemon-reload systemctl daemon-reload
systemctl enable hecate.service hecate-bootstrap.service systemctl enable hecate.service hecate-update.timer
if [[ "${ENABLE_BOOTSTRAP}" == "1" ]]; then
systemctl enable hecate-bootstrap.service
else
systemctl disable hecate-bootstrap.service >/dev/null 2>&1 || true
fi
if [[ "${START_NOW}" -eq 1 ]]; then if [[ "${START_NOW}" -eq 1 ]]; then
systemctl restart hecate.service systemctl restart hecate.service
systemctl restart hecate-update.timer
echo "[install] hecate.service restarted" echo "[install] hecate.service restarted"
fi fi
@ -62,4 +132,5 @@ echo "Next steps:"
echo " 1. Edit /etc/hecate/hecate.yaml" echo " 1. Edit /etc/hecate/hecate.yaml"
echo " 2. Run: hecate status --config /etc/hecate/hecate.yaml" echo " 2. Run: hecate status --config /etc/hecate/hecate.yaml"
echo " 3. Test dry run: hecate startup --config /etc/hecate/hecate.yaml" echo " 3. Test dry run: hecate startup --config /etc/hecate/hecate.yaml"
echo " 4. Trigger bootstrap now: systemctl start hecate-bootstrap.service" echo " 4. Trigger bootstrap now (db host): systemctl start hecate-bootstrap.service"
echo " 5. Trigger self-update now: systemctl start hecate-update.service"