2026-04-03 01:43:16 -03:00
|
|
|
package service
|
|
|
|
|
|
|
|
|
|
import (
|
|
|
|
|
"context"
|
|
|
|
|
"fmt"
|
|
|
|
|
"log"
|
|
|
|
|
"math"
|
2026-04-03 14:46:03 -03:00
|
|
|
"net/http"
|
|
|
|
|
"os/exec"
|
|
|
|
|
"strings"
|
2026-04-03 01:43:16 -03:00
|
|
|
"time"
|
|
|
|
|
|
|
|
|
|
"scm.bstein.dev/bstein/hecate/internal/cluster"
|
|
|
|
|
"scm.bstein.dev/bstein/hecate/internal/config"
|
2026-04-03 14:46:03 -03:00
|
|
|
"scm.bstein.dev/bstein/hecate/internal/metrics"
|
2026-04-03 01:43:16 -03:00
|
|
|
"scm.bstein.dev/bstein/hecate/internal/ups"
|
|
|
|
|
)
|
|
|
|
|
|
2026-04-03 14:46:03 -03:00
|
|
|
// Target pairs a human-readable UPS name with the address polled for
// telemetry and the provider implementation that reads it.
type Target struct {
	// Name identifies this UPS in log lines and metrics samples.
	Name string
	// Target is the address/identifier handed to the provider and echoed
	// in logs/metrics. NOTE(review): exact format depends on the
	// ups.Provider implementation — confirm against the provider in use.
	Target string
	// Provider reads a telemetry sample for this UPS on each poll tick.
	Provider ups.Provider
}
|
|
|
|
|
|
2026-04-03 01:43:16 -03:00
|
|
|
// Daemon runs the UPS monitoring loop: it polls each configured target,
// publishes telemetry to the metrics exporter, and triggers a cluster
// shutdown when battery runtime falls below the estimated shutdown budget.
type Daemon struct {
	cfg      config.Config         // daemon configuration; UPS, Metrics, and Coordination sections are read here
	orch     *cluster.Orchestrator // supplies the shutdown-time budget and performs the local cluster shutdown
	targets  []Target              // UPS units polled on every tick
	log      *log.Logger           // destination for operational log output
	exporter *metrics.Exporter     // metrics sink backing the optional HTTP endpoint started by startMetricsServer
}
|
|
|
|
|
|
2026-04-03 14:46:03 -03:00
|
|
|
func NewDaemon(cfg config.Config, orch *cluster.Orchestrator, targets []Target, logger *log.Logger) *Daemon {
|
|
|
|
|
return &Daemon{
|
|
|
|
|
cfg: cfg,
|
|
|
|
|
orch: orch,
|
|
|
|
|
targets: targets,
|
|
|
|
|
log: logger,
|
|
|
|
|
exporter: metrics.New(),
|
|
|
|
|
}
|
2026-04-03 01:43:16 -03:00
|
|
|
}
|
|
|
|
|
|
|
|
|
|
// Run executes the UPS monitoring loop until ctx is cancelled or a
// shutdown is triggered. On every poll tick it reads each target's
// telemetry, counts consecutive threshold breaches (debounced), and
// returns via triggerShutdown when a target breaches long enough or
// telemetry goes silent while on battery. Returns ctx.Err() on
// cancellation, a config error up front, or the shutdown result.
func (d *Daemon) Run(ctx context.Context) error {
	// Refuse to run when monitoring is disabled or there is nothing to watch.
	if !d.cfg.UPS.Enabled {
		return fmt.Errorf("ups monitoring is disabled in config")
	}
	if len(d.targets) == 0 {
		return fmt.Errorf("no UPS targets configured")
	}

	// Config knobs with defensive defaults for zero/negative values:
	// poll interval (5s), telemetry silence allowance (90s), and the
	// number of consecutive breaches required before shutting down (3).
	poll := time.Duration(d.cfg.UPS.PollSeconds) * time.Second
	if poll <= 0 {
		poll = 5 * time.Second
	}
	telemetryTimeout := time.Duration(d.cfg.UPS.TelemetryTimeoutSeconds) * time.Second
	if telemetryTimeout <= 0 {
		telemetryTimeout = 90 * time.Second
	}
	debounce := d.cfg.UPS.DebounceCount
	if debounce <= 0 {
		debounce = 3
	}

	// Optionally expose metrics over HTTP; a bad bind config aborts startup.
	if d.cfg.Metrics.Enabled {
		if err := d.startMetricsServer(); err != nil {
			return err
		}
	}

	t := time.NewTicker(poll)
	defer t.Stop()

	// Per-target state, keyed by Target.Name:
	//   lastGood      — last time a telemetry read succeeded
	//   lastOnBattery — whether the most recent successful sample was on battery
	//   breachCount   — consecutive ticks the shutdown condition held
	lastGood := map[string]time.Time{}
	lastOnBattery := map[string]bool{}
	breachCount := map[string]int{}
	// Seed lastGood so a target that never answers doesn't time out
	// instantly. (This range variable `t` shadows the ticker inside the
	// loop body only.)
	for _, t := range d.targets {
		lastGood[t.Name] = time.Now()
	}

	d.log.Printf("hecate daemon started: poll=%s debounce=%d telemetry_timeout=%s targets=%s",
		poll, debounce, telemetryTimeout, d.targetList())

	for {
		select {
		case <-ctx.Done():
			return ctx.Err()
		case <-t.C:
			// Recompute the shutdown budget each tick: the orchestrator's
			// estimate scaled by the configured safety factor, rounded up.
			budget := d.orch.EstimatedShutdownSeconds()
			threshold := int(math.Ceil(float64(budget) * d.cfg.UPS.RuntimeSafetyFactor))

			d.exporter.UpdateBudget(budget)

			for _, target := range d.targets {
				sample, err := target.Provider.Read(ctx)
				if err != nil {
					// Telemetry read failed: record the error in metrics,
					// and if the UPS was last seen on battery and has been
					// silent past the timeout, assume the worst and shut down.
					d.log.Printf("warning: ups read failed target=%s (%s): %v", target.Name, target.Target, err)
					d.exporter.UpdateSample(metrics.Sample{
						Name:         target.Name,
						Target:       target.Target,
						ThresholdSec: threshold,
						BreachCount:  breachCount[target.Name],
						LastError:    err.Error(),
						UpdatedAt:    time.Now().UTC(),
					})
					if lastOnBattery[target.Name] && time.Since(lastGood[target.Name]) > telemetryTimeout {
						d.log.Printf("ups telemetry timeout while on battery (target=%s), triggering shutdown", target.Name)
						reason := fmt.Sprintf("ups-telemetry-timeout target=%s", target.Name)
						return d.triggerShutdown(ctx, reason)
					}
					continue
				}

				// Successful read: refresh liveness and battery state.
				lastGood[target.Name] = time.Now()
				lastOnBattery[target.Name] = sample.OnBattery

				// Shutdown condition: explicit low-battery flag, or on
				// battery with a known (>0) runtime at or under threshold.
				// Debounced via breachCount; any clean tick resets it.
				trigger := sample.LowBattery || (sample.OnBattery && sample.RuntimeSeconds > 0 && sample.RuntimeSeconds <= threshold)
				if trigger {
					breachCount[target.Name]++
				} else {
					breachCount[target.Name] = 0
				}

				d.log.Printf("ups target=%s status=%s on_battery=%t runtime_s=%d threshold_s=%d budget_s=%d trigger=%t breach=%d",
					target.Name, sample.RawStatus, sample.OnBattery, sample.RuntimeSeconds, threshold, budget, trigger, breachCount[target.Name])

				// NOTE(review): metrics.Sample spells this field
				// RuntimeSecond (singular) while the source field is
				// RuntimeSeconds — confirm the struct definition and
				// consider renaming for consistency with ThresholdSec.
				d.exporter.UpdateSample(metrics.Sample{
					Name:          target.Name,
					Target:        target.Target,
					OnBattery:     sample.OnBattery,
					LowBattery:    sample.LowBattery,
					RuntimeSecond: sample.RuntimeSeconds,
					BatteryCharge: sample.BatteryCharge,
					LoadPercent:   sample.LoadPercent,
					ThresholdSec:  threshold,
					Trigger:       trigger,
					BreachCount:   breachCount[target.Name],
					Status:        sample.RawStatus,
					UpdatedAt:     time.Now().UTC(),
				})

				// Enough consecutive breaches: shut the cluster down.
				if breachCount[target.Name] >= debounce {
					reason := fmt.Sprintf("ups-threshold target=%s runtime=%ds threshold=%ds status=%s", target.Name, sample.RuntimeSeconds, threshold, sample.RawStatus)
					return d.triggerShutdown(ctx, reason)
				}
			}
		}
	}
}
|
|
|
|
|
|
|
|
|
|
func (d *Daemon) triggerShutdown(ctx context.Context, reason string) error {
|
|
|
|
|
d.log.Printf("triggering shutdown: %s", reason)
|
2026-04-03 14:46:03 -03:00
|
|
|
d.exporter.MarkShutdown(reason)
|
|
|
|
|
if d.cfg.Coordination.ForwardShutdownHost != "" {
|
|
|
|
|
if err := d.forwardShutdown(ctx, reason); err == nil {
|
|
|
|
|
d.log.Printf("shutdown trigger forwarded to %s", d.cfg.Coordination.ForwardShutdownHost)
|
|
|
|
|
return nil
|
|
|
|
|
} else if !d.cfg.Coordination.FallbackLocalShutdown {
|
|
|
|
|
return fmt.Errorf("forward shutdown failed and local fallback disabled: %w", err)
|
|
|
|
|
} else {
|
|
|
|
|
d.log.Printf("warning: forward shutdown failed; falling back to local shutdown: %v", err)
|
|
|
|
|
}
|
|
|
|
|
}
|
2026-04-03 01:43:16 -03:00
|
|
|
return d.orch.Shutdown(ctx, cluster.ShutdownOptions{Reason: reason})
|
|
|
|
|
}
|
2026-04-03 14:46:03 -03:00
|
|
|
|
|
|
|
|
func (d *Daemon) forwardShutdown(ctx context.Context, reason string) error {
|
|
|
|
|
userHost := d.cfg.Coordination.ForwardShutdownHost
|
|
|
|
|
if d.cfg.Coordination.ForwardShutdownUser != "" {
|
|
|
|
|
userHost = d.cfg.Coordination.ForwardShutdownUser + "@" + userHost
|
|
|
|
|
}
|
|
|
|
|
timeout := time.Duration(d.cfg.Coordination.CommandTimeoutSeconds) * time.Second
|
|
|
|
|
if timeout <= 0 {
|
|
|
|
|
timeout = 25 * time.Second
|
|
|
|
|
}
|
|
|
|
|
runCtx, cancel := context.WithTimeout(ctx, timeout)
|
|
|
|
|
defer cancel()
|
|
|
|
|
|
|
|
|
|
remoteCmd := fmt.Sprintf(
|
|
|
|
|
"sudo /usr/local/bin/hecate shutdown --config %q --execute --reason %q",
|
|
|
|
|
d.cfg.Coordination.ForwardShutdownConfig,
|
|
|
|
|
reason,
|
|
|
|
|
)
|
|
|
|
|
cmd := exec.CommandContext(runCtx, "ssh", "-o", "BatchMode=yes", "-o", "ConnectTimeout=8", userHost, remoteCmd)
|
|
|
|
|
out, err := cmd.CombinedOutput()
|
|
|
|
|
if err != nil {
|
|
|
|
|
trimmed := strings.TrimSpace(string(out))
|
|
|
|
|
if trimmed == "" {
|
|
|
|
|
return fmt.Errorf("forward shutdown via ssh failed: %w", err)
|
|
|
|
|
}
|
|
|
|
|
return fmt.Errorf("forward shutdown via ssh failed: %w: %s", err, trimmed)
|
|
|
|
|
}
|
|
|
|
|
return nil
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
func (d *Daemon) targetList() string {
|
|
|
|
|
names := make([]string, 0, len(d.targets))
|
|
|
|
|
for _, t := range d.targets {
|
|
|
|
|
names = append(names, t.Name+"="+t.Target)
|
|
|
|
|
}
|
|
|
|
|
return strings.Join(names, ",")
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
func (d *Daemon) startMetricsServer() error {
|
|
|
|
|
if d.cfg.Metrics.BindAddr == "" {
|
|
|
|
|
return fmt.Errorf("metrics.bind_addr must not be empty when metrics are enabled")
|
|
|
|
|
}
|
|
|
|
|
handler := d.exporter.Handler(d.cfg.Metrics.Path)
|
|
|
|
|
srv := &http.Server{
|
|
|
|
|
Addr: d.cfg.Metrics.BindAddr,
|
|
|
|
|
Handler: handler,
|
|
|
|
|
ReadHeaderTimeout: 5 * time.Second,
|
|
|
|
|
}
|
|
|
|
|
go func() {
|
|
|
|
|
d.log.Printf("metrics server listening on %s%s", d.cfg.Metrics.BindAddr, d.cfg.Metrics.Path)
|
|
|
|
|
if err := srv.ListenAndServe(); err != nil && err != http.ErrServerClosed {
|
|
|
|
|
d.log.Printf("warning: metrics server failed: %v", err)
|
|
|
|
|
}
|
|
|
|
|
}()
|
|
|
|
|
return nil
|
|
|
|
|
}
|