package service import ( "context" "fmt" "log" "math" "time" "scm.bstein.dev/bstein/hecate/internal/cluster" "scm.bstein.dev/bstein/hecate/internal/config" "scm.bstein.dev/bstein/hecate/internal/ups" ) type Daemon struct { cfg config.Config orch *cluster.Orchestrator ups ups.Provider log *log.Logger } func NewDaemon(cfg config.Config, orch *cluster.Orchestrator, provider ups.Provider, logger *log.Logger) *Daemon { return &Daemon{cfg: cfg, orch: orch, ups: provider, log: logger} } func (d *Daemon) Run(ctx context.Context) error { if !d.cfg.UPS.Enabled { return fmt.Errorf("ups monitoring is disabled in config") } poll := time.Duration(d.cfg.UPS.PollSeconds) * time.Second if poll <= 0 { poll = 5 * time.Second } telemetryTimeout := time.Duration(d.cfg.UPS.TelemetryTimeoutSeconds) * time.Second if telemetryTimeout <= 0 { telemetryTimeout = 90 * time.Second } debounce := d.cfg.UPS.DebounceCount if debounce <= 0 { debounce = 3 } lastGood := time.Now() lastOnBattery := false breachCount := 0 t := time.NewTicker(poll) defer t.Stop() d.log.Printf("hecate daemon started: poll=%s debounce=%d telemetry_timeout=%s", poll, debounce, telemetryTimeout) for { select { case <-ctx.Done(): return ctx.Err() case <-t.C: sample, err := d.ups.Read(ctx) if err != nil { d.log.Printf("warning: ups read failed: %v", err) if lastOnBattery && time.Since(lastGood) > telemetryTimeout { d.log.Printf("ups telemetry timeout while on battery, triggering shutdown") return d.triggerShutdown(ctx, "ups-telemetry-timeout") } continue } lastGood = time.Now() lastOnBattery = sample.OnBattery budget := d.orch.EstimatedShutdownSeconds() threshold := int(math.Ceil(float64(budget) * d.cfg.UPS.RuntimeSafetyFactor)) trigger := sample.LowBattery || (sample.OnBattery && sample.RuntimeSeconds > 0 && sample.RuntimeSeconds <= threshold) d.log.Printf("ups status=%s on_battery=%t runtime_s=%d threshold_s=%d budget_s=%d trigger=%t", sample.RawStatus, sample.OnBattery, sample.RuntimeSeconds, threshold, budget, trigger) if trigger { breachCount++ if breachCount >= debounce { reason := fmt.Sprintf("ups-threshold runtime=%ds threshold=%ds status=%s", sample.RuntimeSeconds, threshold, sample.RawStatus) return d.triggerShutdown(ctx, reason) } } else { breachCount = 0 } } } } func (d *Daemon) triggerShutdown(ctx context.Context, reason string) error { d.log.Printf("triggering shutdown: %s", reason) return d.orch.Shutdown(ctx, cluster.ShutdownOptions{Reason: reason}) }