95 lines
2.5 KiB
Go
95 lines
2.5 KiB
Go
|
|
package service
|
||
|
|
|
||
|
|
import (
|
||
|
|
"context"
|
||
|
|
"fmt"
|
||
|
|
"log"
|
||
|
|
"math"
|
||
|
|
"time"
|
||
|
|
|
||
|
|
"scm.bstein.dev/bstein/hecate/internal/cluster"
|
||
|
|
"scm.bstein.dev/bstein/hecate/internal/config"
|
||
|
|
"scm.bstein.dev/bstein/hecate/internal/ups"
|
||
|
|
)
|
||
|
|
|
||
|
|
// Daemon bundles the dependencies of the UPS monitoring loop implemented by
// Run: the configuration, the cluster orchestrator, the UPS provider and a
// logger.
type Daemon struct {
	cfg  config.Config         // configuration; Run reads its UPS settings
	orch *cluster.Orchestrator // supplies the shutdown time estimate and performs the shutdown
	ups  ups.Provider          // source of UPS samples, read once per poll tick
	log  *log.Logger           // destination for the daemon's log lines
}
|
||
|
|
|
||
|
|
func NewDaemon(cfg config.Config, orch *cluster.Orchestrator, provider ups.Provider, logger *log.Logger) *Daemon {
|
||
|
|
return &Daemon{cfg: cfg, orch: orch, ups: provider, log: logger}
|
||
|
|
}
|
||
|
|
|
||
|
|
func (d *Daemon) Run(ctx context.Context) error {
|
||
|
|
if !d.cfg.UPS.Enabled {
|
||
|
|
return fmt.Errorf("ups monitoring is disabled in config")
|
||
|
|
}
|
||
|
|
|
||
|
|
poll := time.Duration(d.cfg.UPS.PollSeconds) * time.Second
|
||
|
|
if poll <= 0 {
|
||
|
|
poll = 5 * time.Second
|
||
|
|
}
|
||
|
|
telemetryTimeout := time.Duration(d.cfg.UPS.TelemetryTimeoutSeconds) * time.Second
|
||
|
|
if telemetryTimeout <= 0 {
|
||
|
|
telemetryTimeout = 90 * time.Second
|
||
|
|
}
|
||
|
|
debounce := d.cfg.UPS.DebounceCount
|
||
|
|
if debounce <= 0 {
|
||
|
|
debounce = 3
|
||
|
|
}
|
||
|
|
|
||
|
|
lastGood := time.Now()
|
||
|
|
lastOnBattery := false
|
||
|
|
breachCount := 0
|
||
|
|
|
||
|
|
t := time.NewTicker(poll)
|
||
|
|
defer t.Stop()
|
||
|
|
|
||
|
|
d.log.Printf("hecate daemon started: poll=%s debounce=%d telemetry_timeout=%s", poll, debounce, telemetryTimeout)
|
||
|
|
|
||
|
|
for {
|
||
|
|
select {
|
||
|
|
case <-ctx.Done():
|
||
|
|
return ctx.Err()
|
||
|
|
case <-t.C:
|
||
|
|
sample, err := d.ups.Read(ctx)
|
||
|
|
if err != nil {
|
||
|
|
d.log.Printf("warning: ups read failed: %v", err)
|
||
|
|
if lastOnBattery && time.Since(lastGood) > telemetryTimeout {
|
||
|
|
d.log.Printf("ups telemetry timeout while on battery, triggering shutdown")
|
||
|
|
return d.triggerShutdown(ctx, "ups-telemetry-timeout")
|
||
|
|
}
|
||
|
|
continue
|
||
|
|
}
|
||
|
|
|
||
|
|
lastGood = time.Now()
|
||
|
|
lastOnBattery = sample.OnBattery
|
||
|
|
|
||
|
|
budget := d.orch.EstimatedShutdownSeconds()
|
||
|
|
threshold := int(math.Ceil(float64(budget) * d.cfg.UPS.RuntimeSafetyFactor))
|
||
|
|
trigger := sample.LowBattery || (sample.OnBattery && sample.RuntimeSeconds > 0 && sample.RuntimeSeconds <= threshold)
|
||
|
|
|
||
|
|
d.log.Printf("ups status=%s on_battery=%t runtime_s=%d threshold_s=%d budget_s=%d trigger=%t",
|
||
|
|
sample.RawStatus, sample.OnBattery, sample.RuntimeSeconds, threshold, budget, trigger)
|
||
|
|
|
||
|
|
if trigger {
|
||
|
|
breachCount++
|
||
|
|
if breachCount >= debounce {
|
||
|
|
reason := fmt.Sprintf("ups-threshold runtime=%ds threshold=%ds status=%s", sample.RuntimeSeconds, threshold, sample.RawStatus)
|
||
|
|
return d.triggerShutdown(ctx, reason)
|
||
|
|
}
|
||
|
|
} else {
|
||
|
|
breachCount = 0
|
||
|
|
}
|
||
|
|
}
|
||
|
|
}
|
||
|
|
}
|
||
|
|
|
||
|
|
func (d *Daemon) triggerShutdown(ctx context.Context, reason string) error {
|
||
|
|
d.log.Printf("triggering shutdown: %s", reason)
|
||
|
|
return d.orch.Shutdown(ctx, cluster.ShutdownOptions{Reason: reason})
|
||
|
|
}
|