// Package service implements the hecate daemon loop: it polls the configured
// UPS targets, publishes their samples to the metrics exporter, and triggers a
// cluster shutdown when a target breaches its runtime threshold.
package service

import (
	"context"
	"fmt"
	"log"
	"math"
	"net/http"
	"os/exec"
	"strings"
	"time"

	"scm.bstein.dev/bstein/hecate/internal/cluster"
	"scm.bstein.dev/bstein/hecate/internal/config"
	"scm.bstein.dev/bstein/hecate/internal/metrics"
	"scm.bstein.dev/bstein/hecate/internal/ups"
)

// Target pairs a UPS provider with the labels used for it in logs and metrics.
type Target struct {
	// Name identifies the target; it keys the daemon's per-target state.
	Name string
	// Target is the provider-specific address, reported next to Name.
	Target string
	// Provider is polled once per tick via Read.
	Provider ups.Provider
}

// Daemon polls a set of UPS targets and initiates a shutdown when one of them
// reports a low battery or insufficient remaining runtime.
type Daemon struct {
	cfg      config.Config
	orch     *cluster.Orchestrator
	targets  []Target
	log      *log.Logger
	exporter *metrics.Exporter
}

// NewDaemon builds a Daemon over the given configuration, orchestrator and
// targets. All log output goes to logger; a fresh metrics exporter is created.
func NewDaemon(cfg config.Config, orch *cluster.Orchestrator, targets []Target, logger *log.Logger) *Daemon {
	return &Daemon{
		cfg:      cfg,
		orch:     orch,
		targets:  targets,
		log:      logger,
		exporter: metrics.New(),
	}
}

// Run polls every target once per poll interval until ctx is cancelled or a
// shutdown is triggered.
//
// It returns an error immediately when UPS monitoring is disabled, when no
// targets are configured, or when the metrics server cannot be set up. When
// ctx is cancelled it returns ctx.Err(). Otherwise it returns the result of
// the shutdown it triggered.
//
// A shutdown is triggered when either:
//   - a target breaches (low battery, or on battery with a positive runtime at
//     or below the threshold) for debounce consecutive successful reads, or
//   - reads from a target that was last seen on battery have been failing for
//     longer than the telemetry timeout.
func (d *Daemon) Run(ctx context.Context) error {
	if !d.cfg.UPS.Enabled {
		return fmt.Errorf("ups monitoring is disabled in config")
	}
	if len(d.targets) == 0 {
		return fmt.Errorf("no UPS targets configured")
	}

	// Non-positive settings fall back to built-in defaults.
	poll := time.Duration(d.cfg.UPS.PollSeconds) * time.Second
	if poll <= 0 {
		poll = 5 * time.Second
	}
	telemetryTimeout := time.Duration(d.cfg.UPS.TelemetryTimeoutSeconds) * time.Second
	if telemetryTimeout <= 0 {
		telemetryTimeout = 90 * time.Second
	}
	debounce := d.cfg.UPS.DebounceCount
	if debounce <= 0 {
		debounce = 3
	}

	if d.cfg.Metrics.Enabled {
		if err := d.startMetricsServer(); err != nil {
			return err
		}
	}

	ticker := time.NewTicker(poll)
	defer ticker.Stop()

	// Per-target state, keyed by Target.Name.
	lastGood := make(map[string]time.Time, len(d.targets))
	lastOnBattery := make(map[string]bool, len(d.targets))
	breachCount := make(map[string]int, len(d.targets))
	for _, target := range d.targets {
		// Start the telemetry clock now so a target that never answers is
		// measured from daemon start.
		lastGood[target.Name] = time.Now()
	}

	d.log.Printf("hecate daemon started: poll=%s debounce=%d telemetry_timeout=%s targets=%s",
		poll, debounce, telemetryTimeout, d.targetList())

	for {
		select {
		case <-ctx.Done():
			return ctx.Err()
		case <-ticker.C:
		}

		// The threshold is recomputed every tick from the current shutdown
		// budget, scaled by the configured safety factor.
		// NOTE(review): a non-positive RuntimeSafetyFactor yields a threshold
		// <= 0, which disables the runtime-based trigger; confirm the config
		// layer validates or defaults this value.
		budget := d.orch.EstimatedShutdownSeconds()
		threshold := int(math.Ceil(float64(budget) * d.cfg.UPS.RuntimeSafetyFactor))
		d.exporter.UpdateBudget(budget)

		for _, target := range d.targets {
			sample, err := target.Provider.Read(ctx)
			if err != nil {
				d.log.Printf("warning: ups read failed target=%s (%s): %v", target.Name, target.Target, err)
				d.exporter.UpdateSample(metrics.Sample{
					Name:         target.Name,
					Target:       target.Target,
					ThresholdSec: threshold,
					BreachCount:  breachCount[target.Name],
					LastError:    err.Error(),
					UpdatedAt:    time.Now().UTC(),
				})
				// Losing telemetry is only fatal if the last successful read
				// said the UPS was on battery.
				if lastOnBattery[target.Name] && time.Since(lastGood[target.Name]) > telemetryTimeout {
					d.log.Printf("ups telemetry timeout while on battery (target=%s), triggering shutdown", target.Name)
					reason := fmt.Sprintf("ups-telemetry-timeout target=%s", target.Name)
					return d.triggerShutdown(ctx, reason)
				}
				continue
			}

			lastGood[target.Name] = time.Now()
			lastOnBattery[target.Name] = sample.OnBattery

			// A runtime of zero or less never counts as a runtime breach.
			trigger := sample.LowBattery ||
				(sample.OnBattery && sample.RuntimeSeconds > 0 && sample.RuntimeSeconds <= threshold)
			if trigger {
				breachCount[target.Name]++
			} else {
				breachCount[target.Name] = 0
			}

			d.log.Printf("ups target=%s status=%s on_battery=%t runtime_s=%d threshold_s=%d budget_s=%d trigger=%t breach=%d",
				target.Name, sample.RawStatus, sample.OnBattery, sample.RuntimeSeconds, threshold, budget, trigger, breachCount[target.Name])
			d.exporter.UpdateSample(metrics.Sample{
				Name:          target.Name,
				Target:        target.Target,
				OnBattery:     sample.OnBattery,
				LowBattery:    sample.LowBattery,
				RuntimeSecond: sample.RuntimeSeconds,
				BatteryCharge: sample.BatteryCharge,
				LoadPercent:   sample.LoadPercent,
				PowerNominalW: sample.NominalPowerW,
				ThresholdSec:  threshold,
				Trigger:       trigger,
				BreachCount:   breachCount[target.Name],
				Status:        sample.RawStatus,
				UpdatedAt:     time.Now().UTC(),
			})

			if breachCount[target.Name] >= debounce {
				reason := fmt.Sprintf("ups-threshold target=%s runtime=%ds threshold=%ds status=%s",
					target.Name, sample.RuntimeSeconds, threshold, sample.RawStatus)
				return d.triggerShutdown(ctx, reason)
			}
		}
	}
}

// triggerShutdown records the shutdown on the exporter and then either
// forwards it to the configured coordination host or runs it locally.
//
// With a forward host configured, a successful forward returns nil without a
// local shutdown. A failed forward returns an error unless local fallback is
// enabled, in which case the local orchestrator shutdown runs instead.
func (d *Daemon) triggerShutdown(ctx context.Context, reason string) error {
	d.log.Printf("triggering shutdown: %s", reason)
	d.exporter.MarkShutdown(reason)

	if host := d.cfg.Coordination.ForwardShutdownHost; host != "" {
		err := d.forwardShutdown(ctx, reason)
		if err == nil {
			d.log.Printf("shutdown trigger forwarded to %s", host)
			return nil
		}
		if !d.cfg.Coordination.FallbackLocalShutdown {
			return fmt.Errorf("forward shutdown failed and local fallback disabled: %w", err)
		}
		d.log.Printf("warning: forward shutdown failed; falling back to local shutdown: %v", err)
	}

	return d.orch.Shutdown(ctx, cluster.ShutdownOptions{Reason: reason})
}

// forwardShutdown runs "hecate shutdown" on the coordination host over ssh and
// waits for it, bounded by the configured command timeout (25s when the
// setting is non-positive). On failure the error carries any trimmed combined
// output from ssh.
func (d *Daemon) forwardShutdown(ctx context.Context, reason string) error {
	userHost := d.cfg.Coordination.ForwardShutdownHost
	if user := d.cfg.Coordination.ForwardShutdownUser; user != "" {
		userHost = user + "@" + userHost
	}

	timeout := time.Duration(d.cfg.Coordination.CommandTimeoutSeconds) * time.Second
	if timeout <= 0 {
		timeout = 25 * time.Second
	}
	runCtx, cancel := context.WithTimeout(ctx, timeout)
	defer cancel()

	// ssh passes the command string to the remote user's shell. The reason
	// embeds UPS-reported status text, so both arguments are single-quoted:
	// Go's %q output is not shell-safe, because $(...) and backticks are
	// still expanded inside double quotes.
	remoteCmd := fmt.Sprintf(
		"sudo /usr/local/bin/hecate shutdown --config %s --execute --reason %s",
		shellSingleQuote(d.cfg.Coordination.ForwardShutdownConfig),
		shellSingleQuote(reason),
	)

	cmd := exec.CommandContext(runCtx, "ssh", "-o", "BatchMode=yes", "-o", "ConnectTimeout=8", userHost, remoteCmd)
	out, err := cmd.CombinedOutput()
	if err == nil {
		return nil
	}
	if trimmed := strings.TrimSpace(string(out)); trimmed != "" {
		return fmt.Errorf("forward shutdown via ssh failed: %w: %s", err, trimmed)
	}
	return fmt.Errorf("forward shutdown via ssh failed: %w", err)
}

// shellSingleQuote wraps s in single quotes for a POSIX shell so that it is
// parsed as exactly one literal word. Embedded single quotes are emitted as
// '\'' (close quote, escaped quote, reopen quote).
func shellSingleQuote(s string) string {
	return "'" + strings.ReplaceAll(s, "'", `'\''`) + "'"
}

// targetList renders the configured targets as comma-separated "name=target"
// pairs, in configuration order.
func (d *Daemon) targetList() string {
	pairs := make([]string, 0, len(d.targets))
	for _, target := range d.targets {
		pairs = append(pairs, target.Name+"="+target.Target)
	}
	return strings.Join(pairs, ",")
}

// startMetricsServer starts the metrics HTTP server in a background goroutine.
//
// It returns an error only when the bind address is empty. Listen and serve
// failures happen asynchronously and are logged as warnings, so a metrics
// problem never stops UPS monitoring.
func (d *Daemon) startMetricsServer() error {
	if d.cfg.Metrics.BindAddr == "" {
		return fmt.Errorf("metrics.bind_addr must not be empty when metrics are enabled")
	}

	srv := &http.Server{
		Addr:              d.cfg.Metrics.BindAddr,
		Handler:           d.exporter.Handler(d.cfg.Metrics.Path),
		ReadHeaderTimeout: 5 * time.Second,
	}

	go func() {
		d.log.Printf("metrics server listening on %s%s", d.cfg.Metrics.BindAddr, d.cfg.Metrics.Path)
		if err := srv.ListenAndServe(); err != nil && err != http.ErrServerClosed {
			d.log.Printf("warning: metrics server failed: %v", err)
		}
	}()
	return nil
}