From 27c7d119c09450b44b09beac3aae3dc158562327 Mon Sep 17 00:00:00 2001 From: Brad Stein Date: Fri, 3 Apr 2026 14:46:03 -0300 Subject: [PATCH] hecate: add multi-ups coordination, poweroff, metrics, and declarative self-update install --- README.md | 23 +++- cmd/hecate/main.go | 33 ++++- configs/hecate.example.yaml | 18 +++ configs/hecate.tethys.yaml | 57 ++++++++ configs/hecate.titan-db.yaml | 66 ++++++++++ deploy/systemd/hecate-update.service | 12 ++ deploy/systemd/hecate-update.timer | 12 ++ deploy/systemd/hecate.service | 2 +- internal/cluster/orchestrator.go | 62 +++++++++ internal/config/config.go | 110 ++++++++++++---- internal/config/config_test.go | 49 +++++++ internal/metrics/exporter.go | 150 +++++++++++++++++++++ internal/metrics/exporter_test.go | 44 +++++++ internal/service/daemon.go | 190 ++++++++++++++++++++++----- scripts/hecate-self-update.sh | 26 ++++ scripts/install.sh | 75 ++++++++++- 16 files changed, 864 insertions(+), 65 deletions(-) create mode 100644 configs/hecate.tethys.yaml create mode 100644 configs/hecate.titan-db.yaml create mode 100644 deploy/systemd/hecate-update.service create mode 100644 deploy/systemd/hecate-update.timer create mode 100644 internal/config/config_test.go create mode 100644 internal/metrics/exporter.go create mode 100644 internal/metrics/exporter_test.go create mode 100644 scripts/hecate-self-update.sh diff --git a/README.md b/README.md index a4a6e87..84948ce 100644 --- a/README.md +++ b/README.md @@ -6,6 +6,8 @@ It runs on `titan-db` and handles: - Staged **startup** (including Flux/Gitea bootstrap deadlock fallback) - Graceful **shutdown** - UPS-driven automatic shutdown decisions based on discharge/runtime +- Multi-UPS operation via multiple Hecate instances (for example `titan-db` + `tethys`) +- Full hardware poweroff sequencing after graceful Kubernetes shutdown ## Why host-level @@ -24,11 +26,17 @@ Hecate runs outside the cluster under systemd, so it can always orchestrate brin ```bash git clone git@gitea-admin:bstein/hecate.git cd hecate -sudo ./scripts/install.sh +sudo HECATE_ENABLE_BOOTSTRAP=1 ./scripts/install.sh sudoedit /etc/hecate/hecate.yaml sudo systemctl restart hecate.service ``` +The installer is idempotent: +- Re-runs safely on every update +- Preserves existing `/etc/hecate/hecate.yaml` +- Ensures required dependencies are installed (`kubectl`, `nut-*`, `ssh`, `go`, etc.) +- Installs/refreshes systemd units and enables boot-time self-update + Bootstrap now (without reboot): ```bash @@ -44,6 +52,13 @@ sudo systemctl start hecate-bootstrap.service - `systemctl start/stop k3s-agent` - UPS telemetry available via NUT (`upsc`) +## Multi-UPS topology + +Recommended: +- `titan-db` runs Hecate as the shutdown coordinator (local UPS target + local shutdown execution). +- `tethys` runs Hecate with local UPS target and forwards shutdown triggers to `titan-db`. +- If forwarding fails, fallback local shutdown can remain enabled. + ## Config See `configs/hecate.example.yaml`. @@ -55,7 +70,13 @@ UPS auto-shutdown trigger uses: Estimated shutdown budget is derived from historical successful shutdown runs (`/var/lib/hecate/runs.json`) with default fallback from config. +Power metrics: +- Hecate exposes Prometheus metrics on `:9560/metrics` by default. +- This is intended for a dedicated Grafana power dashboard and a high-level overview row. + ## Notes - Default behavior for `startup` and `shutdown` is dry-run unless `--execute` is set. - `hecate-bootstrap.service` is enabled to run at host boot and perform staged startup automatically. +- `HECATE_ENABLE_BOOTSTRAP=1` enables `hecate-bootstrap.service` (recommended on `titan-db`; keep disabled on non-coordinator hosts). +- `hecate-update.timer` runs on boot and periodically to pull latest `main` and reinstall Hecate declaratively. diff --git a/cmd/hecate/main.go b/cmd/hecate/main.go index b8694a5..327666a 100644 --- a/cmd/hecate/main.go +++ b/cmd/hecate/main.go @@ -84,6 +84,7 @@ func runShutdown(logger *log.Logger, args []string) error { execute := fs.Bool("execute", false, "Actually execute changes (default dry-run)") skipEtcd := fs.Bool("skip-etcd-snapshot", false, "Skip etcd snapshot") skipDrain := fs.Bool("skip-drain", false, "Skip worker drain") + reason := fs.String("reason", "manual-shutdown", "Shutdown reason for run history") _ = fs.Parse(args) _, orch, err := buildOrchestrator(logger, *configPath, !*execute) @@ -96,7 +97,7 @@ func runShutdown(logger *log.Logger, args []string) error { return orch.Shutdown(ctx, cluster.ShutdownOptions{ SkipEtcdSnapshot: *skipEtcd, SkipDrain: *skipDrain, - Reason: "manual-shutdown", + Reason: *reason, }) } @@ -113,15 +114,37 @@ func runDaemon(logger *log.Logger, args []string) error { if !cfg.UPS.Enabled { return fmt.Errorf("UPS monitoring is disabled in config") } - var provider ups.Provider + targets := make([]service.Target, 0, len(cfg.UPS.Targets)+1) switch cfg.UPS.Provider { case "nut": - provider = ups.NewNUTProvider(cfg.UPS.Target) + if len(cfg.UPS.Targets) == 0 { + target := cfg.UPS.Target + if target == "" { + return fmt.Errorf("ups target must be set") + } + targets = append(targets, service.Target{ + Name: "primary", + Target: target, + Provider: ups.NewNUTProvider(target), + }) + } else { + for idx, t := range cfg.UPS.Targets { + name := t.Name + if name == "" { + name = fmt.Sprintf("target-%d", idx+1) + } + targets = append(targets, service.Target{ + Name: name, + Target: t.Target, + Provider: ups.NewNUTProvider(t.Target), + }) + } + } default: return fmt.Errorf("unsupported UPS provider: %s", cfg.UPS.Provider) } - d := service.NewDaemon(cfg, orch, provider, logger) + d := service.NewDaemon(cfg, orch, targets, logger) ctx, stop := signal.NotifyContext(context.Background(), os.Interrupt, syscall.SIGTERM) defer stop() return d.Run(ctx) @@ -184,7 +207,7 @@ Commands: Examples: hecate startup --config /etc/hecate/hecate.yaml --execute --force-flux-branch main - hecate shutdown --config /etc/hecate/hecate.yaml --execute + hecate shutdown --config /etc/hecate/hecate.yaml --execute --reason "manual-maintenance" hecate daemon --config /etc/hecate/hecate.yaml hecate status --config /etc/hecate/hecate.yaml `) diff --git a/configs/hecate.example.yaml b/configs/hecate.example.yaml index d27b369..1288ea3 100644 --- a/configs/hecate.example.yaml +++ b/configs/hecate.example.yaml @@ -34,14 +34,32 @@ shutdown: default_budget_seconds: 300 skip_etcd_snapshot: false skip_drain: false + poweroff_enabled: true + poweroff_delay_seconds: 25 + poweroff_local_host: true + extra_poweroff_hosts: + - titan-db ups: enabled: true provider: nut target: atlasups@localhost + targets: + - name: db-ups + target: atlasups@localhost poll_seconds: 5 runtime_safety_factor: 1.10 debounce_count: 3 telemetry_timeout_seconds: 90 +coordination: + forward_shutdown_host: "" + forward_shutdown_user: atlas + forward_shutdown_config: /etc/hecate/hecate.yaml + fallback_local_shutdown: true + command_timeout_seconds: 25 +metrics: + enabled: true + bind_addr: 0.0.0.0:9560 + path: /metrics state: dir: /var/lib/hecate run_history_path: /var/lib/hecate/runs.json diff --git a/configs/hecate.tethys.yaml b/configs/hecate.tethys.yaml new file mode 100644 index 0000000..ecee181 --- /dev/null +++ b/configs/hecate.tethys.yaml @@ -0,0 +1,57 @@ +# /etc/hecate/hecate.yaml for titan-24 (tethys forwarder) +kubeconfig: /home/tethys/.kube/config +ssh_user: atlas +iac_repo_path: /opt/titan-iac +expected_flux_branch: main +control_planes: + - titan-0a + - titan-0b + - titan-0c +workers: [] +local_bootstrap_paths: + - infrastructure/core +excluded_namespaces: + - kube-system + - kube-public + - kube-node-lease + - flux-system + - traefik + - metallb-system + - cert-manager + - longhorn-system + - vault + - postgres + - maintenance +shutdown: + default_budget_seconds: 300 + skip_etcd_snapshot: false + skip_drain: false + poweroff_enabled: true + poweroff_delay_seconds: 25 + poweroff_local_host: true + extra_poweroff_hosts: [] +ups: + enabled: true + provider: nut + targets: + - name: tethys-ups + target: atlasups@localhost + poll_seconds: 5 + runtime_safety_factor: 1.10 + debounce_count: 3 + telemetry_timeout_seconds: 90 +coordination: + forward_shutdown_host: titan-db + forward_shutdown_user: atlas + forward_shutdown_config: /etc/hecate/hecate.yaml + fallback_local_shutdown: false + command_timeout_seconds: 25 +metrics: + enabled: true + bind_addr: 0.0.0.0:9560 + path: /metrics +state: + dir: /var/lib/hecate + run_history_path: /var/lib/hecate/runs.json + lock_path: /var/lib/hecate/hecate.lock + diff --git a/configs/hecate.titan-db.yaml b/configs/hecate.titan-db.yaml new file mode 100644 index 0000000..67c40bd --- /dev/null +++ b/configs/hecate.titan-db.yaml @@ -0,0 +1,66 @@ +# /etc/hecate/hecate.yaml for titan-db (coordinator) +kubeconfig: /home/atlas/.kube/config +ssh_user: atlas +iac_repo_path: /opt/titan-iac +expected_flux_branch: main +control_planes: + - titan-0a + - titan-0b + - titan-0c +workers: [] +local_bootstrap_paths: + - infrastructure/core + - infrastructure/sources/helm + - infrastructure/metallb + - infrastructure/traefik + - infrastructure/vault-csi + - infrastructure/vault-injector + - services/vault + - infrastructure/postgres + - services/gitea +excluded_namespaces: + - kube-system + - kube-public + - kube-node-lease + - flux-system + - traefik + - metallb-system + - cert-manager + - longhorn-system + - vault + - postgres + - maintenance +shutdown: + default_budget_seconds: 300 + skip_etcd_snapshot: false + skip_drain: false + poweroff_enabled: true + poweroff_delay_seconds: 25 + poweroff_local_host: true + extra_poweroff_hosts: + - titan-db +ups: + enabled: true + provider: nut + targets: + - name: db-ups + target: atlasups@localhost + poll_seconds: 5 + runtime_safety_factor: 1.10 + debounce_count: 3 + telemetry_timeout_seconds: 90 +coordination: + forward_shutdown_host: "" + forward_shutdown_user: atlas + forward_shutdown_config: /etc/hecate/hecate.yaml + fallback_local_shutdown: true + command_timeout_seconds: 25 +metrics: + enabled: true + bind_addr: 0.0.0.0:9560 + path: /metrics +state: + dir: /var/lib/hecate + run_history_path: /var/lib/hecate/runs.json + lock_path: /var/lib/hecate/hecate.lock + diff --git a/deploy/systemd/hecate-update.service b/deploy/systemd/hecate-update.service new file mode 100644 index 0000000..2d42896 --- /dev/null +++ b/deploy/systemd/hecate-update.service @@ -0,0 +1,12 @@ +[Unit] +Description=Hecate Self-Update and Reinstall +Wants=network-online.target +After=network-online.target + +[Service] +Type=oneshot +User=root +Group=root +ExecStart=/usr/local/lib/hecate/hecate-self-update.sh +TimeoutStartSec=1800 + diff --git a/deploy/systemd/hecate-update.timer b/deploy/systemd/hecate-update.timer new file mode 100644 index 0000000..ce7d193 --- /dev/null +++ b/deploy/systemd/hecate-update.timer @@ -0,0 +1,12 @@ +[Unit] +Description=Periodic Hecate Self-Update Timer + +[Timer] +OnBootSec=2m +OnUnitActiveSec=6h +Unit=hecate-update.service +Persistent=true + +[Install] +WantedBy=timers.target + diff --git a/deploy/systemd/hecate.service b/deploy/systemd/hecate.service index cc78d20..c19e55d 100644 --- a/deploy/systemd/hecate.service +++ b/deploy/systemd/hecate.service @@ -9,7 +9,7 @@ Type=simple User=root Group=root ExecStart=/usr/local/bin/hecate daemon --config /etc/hecate/hecate.yaml -Restart=always +Restart=on-failure RestartSec=5 NoNewPrivileges=true diff --git a/internal/cluster/orchestrator.go b/internal/cluster/orchestrator.go index 5dcfa20..02a7194 100644 --- a/internal/cluster/orchestrator.go +++ b/internal/cluster/orchestrator.go @@ -4,7 +4,9 @@ import ( "context" "fmt" "log" + "os" "path/filepath" + "sort" "strings" "time" @@ -132,6 +134,9 @@ func (o *Orchestrator) Shutdown(ctx context.Context, opts ShutdownOptions) (err o.stopWorkers(ctx, workers) o.stopControlPlanes(ctx, o.cfg.ControlPlanes) + if o.cfg.Shutdown.PoweroffEnabled { + o.bestEffort("poweroff hosts", func() error { return o.poweroffHosts(ctx, workers) }) + } o.log.Printf("shutdown flow complete") return nil } @@ -413,3 +418,60 @@ func (o *Orchestrator) bestEffort(name string, fn func() error) { o.log.Printf("warning: %s: %v", name, err) } } + +func (o *Orchestrator) poweroffHosts(ctx context.Context, workers []string) error { + delay := o.cfg.Shutdown.PoweroffDelaySeconds + if delay <= 0 { + delay = 25 + } + + localNames := map[string]struct{}{} + if hn, err := os.Hostname(); err == nil && strings.TrimSpace(hn) != "" { + localNames[strings.TrimSpace(hn)] = struct{}{} + } + if o.cfg.SSHUser != "" { + localNames[o.cfg.SSHUser] = struct{}{} + } + + hostSet := map[string]struct{}{} + for _, n := range o.cfg.ControlPlanes { + hostSet[n] = struct{}{} + } + for _, n := range workers { + hostSet[n] = struct{}{} + } + for _, n := range o.cfg.Shutdown.ExtraPoweroffHosts { + if strings.TrimSpace(n) != "" { + hostSet[strings.TrimSpace(n)] = struct{}{} + } + } + + hosts := make([]string, 0, len(hostSet)) + for h := range hostSet { + hosts = append(hosts, h) + } + sort.Strings(hosts) + + remoteCmd := fmt.Sprintf(`sudo nohup sh -c 'sleep %d; systemctl poweroff' >/dev/null 2>&1 &`, delay) + for _, host := range hosts { + host = strings.TrimSpace(host) + if host == "" { + continue + } + if _, isLocal := localNames[host]; isLocal { + continue + } + o.bestEffort("schedule poweroff on "+host, func() error { + _, err := o.ssh(ctx, host, remoteCmd) + return err + }) + } + + if o.cfg.Shutdown.PoweroffLocalHost { + o.bestEffort("schedule local host poweroff", func() error { + _, err := o.run(ctx, 5*time.Second, "sh", "-c", fmt.Sprintf("nohup sh -c 'sleep %d; systemctl poweroff' >/dev/null 2>&1 &", delay+10)) + return err + }) + } + return nil +} diff --git a/internal/config/config.go b/internal/config/config.go index 3e32a42..4b33084 100644 --- a/internal/config/config.go +++ b/internal/config/config.go @@ -8,33 +8,59 @@ import ( ) type Config struct { - Kubeconfig string `yaml:"kubeconfig"` - SSHUser string `yaml:"ssh_user"` - IACRepoPath string `yaml:"iac_repo_path"` - ExpectedFluxBranch string `yaml:"expected_flux_branch"` - ControlPlanes []string `yaml:"control_planes"` - Workers []string `yaml:"workers"` - LocalBootstrapPaths []string `yaml:"local_bootstrap_paths"` - ExcludedNamespaces []string `yaml:"excluded_namespaces"` - Shutdown Shutdown `yaml:"shutdown"` - UPS UPS `yaml:"ups"` - State State `yaml:"state"` + Kubeconfig string `yaml:"kubeconfig"` + SSHUser string `yaml:"ssh_user"` + IACRepoPath string `yaml:"iac_repo_path"` + ExpectedFluxBranch string `yaml:"expected_flux_branch"` + ControlPlanes []string `yaml:"control_planes"` + Workers []string `yaml:"workers"` + LocalBootstrapPaths []string `yaml:"local_bootstrap_paths"` + ExcludedNamespaces []string `yaml:"excluded_namespaces"` + Shutdown Shutdown `yaml:"shutdown"` + UPS UPS `yaml:"ups"` + Coordination Coordination `yaml:"coordination"` + Metrics Metrics `yaml:"metrics"` + State State `yaml:"state"` } type Shutdown struct { - DefaultBudgetSeconds int `yaml:"default_budget_seconds"` - SkipEtcdSnapshot bool `yaml:"skip_etcd_snapshot"` - SkipDrain bool `yaml:"skip_drain"` + DefaultBudgetSeconds int `yaml:"default_budget_seconds"` + SkipEtcdSnapshot bool `yaml:"skip_etcd_snapshot"` + SkipDrain bool `yaml:"skip_drain"` + PoweroffEnabled bool `yaml:"poweroff_enabled"` + PoweroffDelaySeconds int `yaml:"poweroff_delay_seconds"` + PoweroffLocalHost bool `yaml:"poweroff_local_host"` + ExtraPoweroffHosts []string `yaml:"extra_poweroff_hosts"` } type UPS struct { - Enabled bool `yaml:"enabled"` - Provider string `yaml:"provider"` - Target string `yaml:"target"` - PollSeconds int `yaml:"poll_seconds"` - RuntimeSafetyFactor float64 `yaml:"runtime_safety_factor"` - DebounceCount int `yaml:"debounce_count"` - TelemetryTimeoutSeconds int `yaml:"telemetry_timeout_seconds"` + Enabled bool `yaml:"enabled"` + Provider string `yaml:"provider"` + Target string `yaml:"target"` + Targets []UPSTarget `yaml:"targets"` + PollSeconds int `yaml:"poll_seconds"` + RuntimeSafetyFactor float64 `yaml:"runtime_safety_factor"` + DebounceCount int `yaml:"debounce_count"` + TelemetryTimeoutSeconds int `yaml:"telemetry_timeout_seconds"` +} + +type UPSTarget struct { + Name string `yaml:"name"` + Target string `yaml:"target"` +} + +type Coordination struct { + ForwardShutdownHost string `yaml:"forward_shutdown_host"` + ForwardShutdownUser string `yaml:"forward_shutdown_user"` + ForwardShutdownConfig string `yaml:"forward_shutdown_config"` + FallbackLocalShutdown bool `yaml:"fallback_local_shutdown"` + CommandTimeoutSeconds int `yaml:"command_timeout_seconds"` +} + +type Metrics struct { + Enabled bool `yaml:"enabled"` + BindAddr string `yaml:"bind_addr"` + Path string `yaml:"path"` } type State struct { @@ -78,8 +104,18 @@ func (c Config) Validate() error { if c.UPS.Provider == "" { return fmt.Errorf("config.ups.provider must not be empty when ups is enabled") } - if c.UPS.Target == "" { - return fmt.Errorf("config.ups.target must not be empty when ups is enabled") + if c.UPS.Target == "" && len(c.UPS.Targets) == 0 { + return fmt.Errorf("config.ups.target or config.ups.targets must be set when ups is enabled") + } + for _, t := range c.UPS.Targets { + if t.Target == "" { + return fmt.Errorf("config.ups.targets[].target must not be empty") + } + } + } + if c.Coordination.ForwardShutdownHost != "" { + if c.Coordination.ForwardShutdownConfig == "" { + return fmt.Errorf("config.coordination.forward_shutdown_config must not be empty when forward_shutdown_host is set") } } if c.State.RunHistoryPath == "" || c.State.LockPath == "" { @@ -119,6 +155,9 @@ func defaults() Config { }, Shutdown: Shutdown{ DefaultBudgetSeconds: 300, + PoweroffEnabled: true, + PoweroffDelaySeconds: 25, + PoweroffLocalHost: true, }, UPS: UPS{ Enabled: true, @@ -128,6 +167,16 @@ func defaults() Config { DebounceCount: 3, TelemetryTimeoutSeconds: 90, }, + Coordination: Coordination{ + ForwardShutdownConfig: "/etc/hecate/hecate.yaml", + FallbackLocalShutdown: true, + CommandTimeoutSeconds: 25, + }, + Metrics: Metrics{ + Enabled: true, + BindAddr: "0.0.0.0:9560", + Path: "/metrics", + }, State: State{ Dir: "/var/lib/hecate", RunHistoryPath: "/var/lib/hecate/runs.json", @@ -148,6 +197,9 @@ func (c *Config) applyDefaults() { if c.Shutdown.DefaultBudgetSeconds <= 0 { c.Shutdown.DefaultBudgetSeconds = 300 } + if c.Shutdown.PoweroffDelaySeconds <= 0 { + c.Shutdown.PoweroffDelaySeconds = 25 + } if c.UPS.PollSeconds <= 0 { c.UPS.PollSeconds = 5 } @@ -160,6 +212,18 @@ func (c *Config) applyDefaults() { if c.UPS.TelemetryTimeoutSeconds <= 0 { c.UPS.TelemetryTimeoutSeconds = 90 } + if c.Coordination.ForwardShutdownConfig == "" { + c.Coordination.ForwardShutdownConfig = "/etc/hecate/hecate.yaml" + } + if c.Coordination.CommandTimeoutSeconds <= 0 { + c.Coordination.CommandTimeoutSeconds = 25 + } + if c.Metrics.BindAddr == "" { + c.Metrics.BindAddr = "0.0.0.0:9560" + } + if c.Metrics.Path == "" { + c.Metrics.Path = "/metrics" + } if c.State.Dir == "" { c.State.Dir = "/var/lib/hecate" } diff --git a/internal/config/config_test.go b/internal/config/config_test.go new file mode 100644 index 0000000..c10e12b --- /dev/null +++ b/internal/config/config_test.go @@ -0,0 +1,49 @@ +package config + +import ( + "os" + "path/filepath" + "strings" + "testing" +) + +func TestLoadAcceptsUPSTargets(t *testing.T) { + tmp := t.TempDir() + cfgPath := filepath.Join(tmp, "hecate.yaml") + raw := ` +control_planes: [titan-0a, titan-0b, titan-0c] +expected_flux_branch: main +iac_repo_path: /opt/titan-iac +ups: + enabled: true + provider: nut + targets: + - name: db + target: atlasups@localhost +shutdown: + default_budget_seconds: 300 +state: + run_history_path: /tmp/runs.json + lock_path: /tmp/hecate.lock +` + if err := os.WriteFile(cfgPath, []byte(strings.TrimSpace(raw)), 0o644); err != nil { + t.Fatalf("write config: %v", err) + } + + cfg, err := Load(cfgPath) + if err != nil { + t.Fatalf("load config: %v", err) + } + if len(cfg.UPS.Targets) != 1 || cfg.UPS.Targets[0].Target != "atlasups@localhost" { + t.Fatalf("unexpected UPS targets: %#v", cfg.UPS.Targets) + } +} + +func TestValidateForwardShutdownRequiresConfigPath(t *testing.T) { + cfg := defaults() + cfg.Coordination.ForwardShutdownHost = "titan-db" + cfg.Coordination.ForwardShutdownConfig = "" + if err := cfg.Validate(); err == nil { + t.Fatalf("expected validation error for missing forward_shutdown_config") + } +} diff --git a/internal/metrics/exporter.go b/internal/metrics/exporter.go new file mode 100644 index 0000000..4ab2b84 --- /dev/null +++ b/internal/metrics/exporter.go @@ -0,0 +1,150 @@ +package metrics + +import ( + "fmt" + "net/http" + "sort" + "strings" + "sync" + "time" +) + +type Sample struct { + Name string + Target string + OnBattery bool + LowBattery bool + RuntimeSecond int + ThresholdSec int + Trigger bool + BreachCount int + Status string + LastError string + UpdatedAt time.Time +} + +type Exporter struct { + mu sync.RWMutex + shutdownBudgetSec int + shutdownTriggers int + lastShutdownReason string + lastShutdownAt time.Time + samples map[string]Sample +} + +func New() *Exporter { + return &Exporter{ + samples: make(map[string]Sample), + } +} + +func (e *Exporter) UpdateBudget(seconds int) { + e.mu.Lock() + defer e.mu.Unlock() + e.shutdownBudgetSec = seconds +} + +func (e *Exporter) UpdateSample(s Sample) { + e.mu.Lock() + defer e.mu.Unlock() + if s.UpdatedAt.IsZero() { + s.UpdatedAt = time.Now().UTC() + } + e.samples[s.Name] = s +} + +func (e *Exporter) MarkShutdown(reason string) { + e.mu.Lock() + defer e.mu.Unlock() + e.shutdownTriggers++ + e.lastShutdownReason = reason + e.lastShutdownAt = time.Now().UTC() +} + +func (e *Exporter) Handler(path string) http.Handler { + mux := http.NewServeMux() + metricsPath := path + if metricsPath == "" { + metricsPath = "/metrics" + } + mux.HandleFunc(metricsPath, e.serveMetrics) + mux.HandleFunc("/healthz", func(w http.ResponseWriter, _ *http.Request) { + w.WriteHeader(http.StatusOK) + _, _ = w.Write([]byte("ok\n")) + }) + return mux +} + +func (e *Exporter) serveMetrics(w http.ResponseWriter, _ *http.Request) { + e.mu.RLock() + defer e.mu.RUnlock() + + w.Header().Set("Content-Type", "text/plain; version=0.0.4; charset=utf-8") + var b strings.Builder + b.WriteString("# HELP hecate_shutdown_budget_seconds Estimated graceful shutdown budget in seconds (p95).\n") + b.WriteString("# TYPE hecate_shutdown_budget_seconds gauge\n") + b.WriteString(fmt.Sprintf("hecate_shutdown_budget_seconds %d\n", e.shutdownBudgetSec)) + b.WriteString("# HELP hecate_shutdown_triggers_total Total number of shutdown triggers issued by this instance.\n") + b.WriteString("# TYPE hecate_shutdown_triggers_total counter\n") + b.WriteString(fmt.Sprintf("hecate_shutdown_triggers_total %d\n", e.shutdownTriggers)) + b.WriteString("# HELP hecate_shutdown_last_trigger_timestamp_seconds Unix timestamp of last shutdown trigger.\n") + b.WriteString("# TYPE hecate_shutdown_last_trigger_timestamp_seconds gauge\n") + if e.lastShutdownAt.IsZero() { + b.WriteString("hecate_shutdown_last_trigger_timestamp_seconds 0\n") + } else { + b.WriteString(fmt.Sprintf("hecate_shutdown_last_trigger_timestamp_seconds %d\n", e.lastShutdownAt.Unix())) + } + b.WriteString("# HELP hecate_ups_on_battery Whether a UPS source is currently on battery.\n") + b.WriteString("# TYPE hecate_ups_on_battery gauge\n") + b.WriteString("# HELP hecate_ups_low_battery Whether a UPS source currently reports low battery.\n") + b.WriteString("# TYPE hecate_ups_low_battery gauge\n") + b.WriteString("# HELP hecate_ups_runtime_seconds Battery runtime remaining reported by UPS.\n") + b.WriteString("# TYPE hecate_ups_runtime_seconds gauge\n") + b.WriteString("# HELP hecate_ups_threshold_seconds Red-line threshold for runtime-based shutdown.\n") + b.WriteString("# TYPE hecate_ups_threshold_seconds gauge\n") + b.WriteString("# HELP hecate_ups_trigger_active Whether this UPS source currently breaches shutdown trigger conditions.\n") + b.WriteString("# TYPE hecate_ups_trigger_active gauge\n") + b.WriteString("# HELP hecate_ups_breach_count Current debounce breach count for this UPS source.\n") + b.WriteString("# TYPE hecate_ups_breach_count gauge\n") + b.WriteString("# HELP hecate_ups_last_sample_timestamp_seconds Unix timestamp of most recent sample.\n") + b.WriteString("# TYPE hecate_ups_last_sample_timestamp_seconds gauge\n") + b.WriteString("# HELP hecate_ups_error Whether the last sample had an error.\n") + b.WriteString("# TYPE hecate_ups_error gauge\n") + + names := make([]string, 0, len(e.samples)) + for name := range e.samples { + names = append(names, name) + } + sort.Strings(names) + for _, name := range names { + s := e.samples[name] + labels := fmt.Sprintf("{source=%q,target=%q,status=%q,last_reason=%q}", + safe(name), safe(s.Target), safe(s.Status), safe(e.lastShutdownReason)) + b.WriteString(fmt.Sprintf("hecate_ups_on_battery%s %d\n", labels, boolNum(s.OnBattery))) + b.WriteString(fmt.Sprintf("hecate_ups_low_battery%s %d\n", labels, boolNum(s.LowBattery))) + b.WriteString(fmt.Sprintf("hecate_ups_runtime_seconds%s %d\n", labels, s.RuntimeSecond)) + b.WriteString(fmt.Sprintf("hecate_ups_threshold_seconds%s %d\n", labels, s.ThresholdSec)) + b.WriteString(fmt.Sprintf("hecate_ups_trigger_active%s %d\n", labels, boolNum(s.Trigger))) + b.WriteString(fmt.Sprintf("hecate_ups_breach_count%s %d\n", labels, s.BreachCount)) + if s.UpdatedAt.IsZero() { + b.WriteString(fmt.Sprintf("hecate_ups_last_sample_timestamp_seconds%s 0\n", labels)) + } else { + b.WriteString(fmt.Sprintf("hecate_ups_last_sample_timestamp_seconds%s %d\n", labels, s.UpdatedAt.Unix())) + } + b.WriteString(fmt.Sprintf("hecate_ups_error%s %d\n", labels, boolNum(s.LastError != ""))) + } + + _, _ = w.Write([]byte(b.String())) +} + +func boolNum(v bool) int { + if v { + return 1 + } + return 0 +} + +func safe(in string) string { + out := strings.ReplaceAll(in, "\\", "\\\\") + return strings.ReplaceAll(out, "\"", "\\\"") +} diff --git a/internal/metrics/exporter_test.go b/internal/metrics/exporter_test.go new file mode 100644 index 0000000..3e27ad5 --- /dev/null +++ b/internal/metrics/exporter_test.go @@ -0,0 +1,44 @@ +package metrics + +import ( + "net/http/httptest" + "strings" + "testing" + "time" +) + +func TestExporterEmitsCoreMetrics(t *testing.T) { + e := New() + e.UpdateBudget(321) + e.UpdateSample(Sample{ + Name: "db-ups", + Target: "atlasups@localhost", + OnBattery: true, + LowBattery: false, + RuntimeSecond: 412, + ThresholdSec: 354, + Trigger: true, + BreachCount: 2, + Status: "OB", + UpdatedAt: time.Unix(1710000000, 0).UTC(), + }) + e.MarkShutdown("ups-threshold") + + req := httptest.NewRequest("GET", "/metrics", nil) + rr := httptest.NewRecorder() + e.Handler("/metrics").ServeHTTP(rr, req) + body := rr.Body.String() + + mustContain := []string{ + "hecate_shutdown_budget_seconds 321", + "hecate_shutdown_triggers_total 1", + "hecate_ups_on_battery{source=\"db-ups\",target=\"atlasups@localhost\"", + "hecate_ups_runtime_seconds{source=\"db-ups\",target=\"atlasups@localhost\"", + "hecate_ups_threshold_seconds{source=\"db-ups\",target=\"atlasups@localhost\"", + } + for _, m := range mustContain { + if !strings.Contains(body, m) { + t.Fatalf("missing metric fragment %q in output:\n%s", m, body) + } + } +} diff --git a/internal/service/daemon.go b/internal/service/daemon.go index 91d8018..d04e3ab 100644 --- a/internal/service/daemon.go +++ b/internal/service/daemon.go @@ -5,28 +5,48 @@ import ( "fmt" "log" "math" + "net/http" + "os/exec" + "strings" "time" "scm.bstein.dev/bstein/hecate/internal/cluster" "scm.bstein.dev/bstein/hecate/internal/config" + "scm.bstein.dev/bstein/hecate/internal/metrics" "scm.bstein.dev/bstein/hecate/internal/ups" ) -type Daemon struct { - cfg config.Config - orch *cluster.Orchestrator - ups ups.Provider - log *log.Logger +type Target struct { + Name string + Target string + Provider ups.Provider } -func NewDaemon(cfg config.Config, orch *cluster.Orchestrator, provider ups.Provider, logger *log.Logger) *Daemon { - return &Daemon{cfg: cfg, orch: orch, ups: provider, log: logger} +type Daemon struct { + cfg config.Config + orch *cluster.Orchestrator + targets []Target + log *log.Logger + exporter *metrics.Exporter +} + +func NewDaemon(cfg config.Config, orch *cluster.Orchestrator, targets []Target, logger *log.Logger) *Daemon { + return &Daemon{ + cfg: cfg, + orch: orch, + targets: targets, + log: logger, + exporter: metrics.New(), + } } func (d *Daemon) Run(ctx context.Context) error { if !d.cfg.UPS.Enabled { return fmt.Errorf("ups monitoring is disabled in config") } + if len(d.targets) == 0 { + return fmt.Errorf("no UPS targets configured") + } poll := time.Duration(d.cfg.UPS.PollSeconds) * time.Second if poll <= 0 { @@ -41,48 +61,85 @@ func (d *Daemon) Run(ctx context.Context) error { debounce = 3 } - lastGood := time.Now() - lastOnBattery := false - breachCount := 0 + if d.cfg.Metrics.Enabled { + if err := d.startMetricsServer(); err != nil { + return err + } + } t := time.NewTicker(poll) defer t.Stop() - d.log.Printf("hecate daemon started: poll=%s debounce=%d telemetry_timeout=%s", poll, debounce, telemetryTimeout) + lastGood := map[string]time.Time{} + lastOnBattery := map[string]bool{} + breachCount := map[string]int{} + for _, t := range d.targets { + lastGood[t.Name] = time.Now() + } + + d.log.Printf("hecate daemon started: poll=%s debounce=%d telemetry_timeout=%s targets=%s", + poll, debounce, telemetryTimeout, d.targetList()) for { select { case <-ctx.Done(): return ctx.Err() case <-t.C: - sample, err := d.ups.Read(ctx) - if err != nil { - d.log.Printf("warning: ups read failed: %v", err) - if lastOnBattery && time.Since(lastGood) > telemetryTimeout { - d.log.Printf("ups telemetry timeout while on battery, triggering shutdown") - return d.triggerShutdown(ctx, "ups-telemetry-timeout") - } - continue - } - - lastGood = time.Now() - lastOnBattery = sample.OnBattery - budget := d.orch.EstimatedShutdownSeconds() threshold := int(math.Ceil(float64(budget) * d.cfg.UPS.RuntimeSafetyFactor)) - trigger := sample.LowBattery || (sample.OnBattery && sample.RuntimeSeconds > 0 && sample.RuntimeSeconds <= threshold) - d.log.Printf("ups status=%s on_battery=%t runtime_s=%d threshold_s=%d budget_s=%d trigger=%t", - sample.RawStatus, sample.OnBattery, sample.RuntimeSeconds, threshold, budget, trigger) + d.exporter.UpdateBudget(budget) - if trigger { - breachCount++ - if breachCount >= debounce { - reason := fmt.Sprintf("ups-threshold runtime=%ds threshold=%ds status=%s", sample.RuntimeSeconds, threshold, sample.RawStatus) + for _, target := range d.targets { + sample, err := target.Provider.Read(ctx) + if err != nil { + d.log.Printf("warning: ups read failed target=%s (%s): %v", target.Name, target.Target, err) + d.exporter.UpdateSample(metrics.Sample{ + Name: target.Name, + Target: target.Target, + ThresholdSec: threshold, + BreachCount: breachCount[target.Name], + LastError: err.Error(), + UpdatedAt: time.Now().UTC(), + }) + if lastOnBattery[target.Name] && time.Since(lastGood[target.Name]) > telemetryTimeout { + d.log.Printf("ups telemetry timeout while on battery (target=%s), triggering shutdown", target.Name) + reason := fmt.Sprintf("ups-telemetry-timeout target=%s", target.Name) + return d.triggerShutdown(ctx, reason) + } + continue + } + + lastGood[target.Name] = time.Now() + lastOnBattery[target.Name] = sample.OnBattery + + trigger := sample.LowBattery || (sample.OnBattery && sample.RuntimeSeconds > 0 && sample.RuntimeSeconds <= threshold) + if trigger { + breachCount[target.Name]++ + } else { + breachCount[target.Name] = 0 + } + + d.log.Printf("ups target=%s status=%s on_battery=%t runtime_s=%d threshold_s=%d budget_s=%d trigger=%t breach=%d", + target.Name, sample.RawStatus, sample.OnBattery, sample.RuntimeSeconds, threshold, budget, trigger, breachCount[target.Name]) + + d.exporter.UpdateSample(metrics.Sample{ + Name: target.Name, + Target: target.Target, + OnBattery: sample.OnBattery, + LowBattery: sample.LowBattery, + RuntimeSecond: sample.RuntimeSeconds, + ThresholdSec: threshold, + Trigger: trigger, + BreachCount: breachCount[target.Name], + Status: sample.RawStatus, + UpdatedAt: time.Now().UTC(), + }) + + if breachCount[target.Name] >= debounce { + reason := fmt.Sprintf("ups-threshold target=%s runtime=%ds threshold=%ds status=%s", target.Name, sample.RuntimeSeconds, threshold, sample.RawStatus) return d.triggerShutdown(ctx, reason) } - } else { - breachCount = 0 } } } @@ -90,5 +147,72 @@ func (d *Daemon) Run(ctx context.Context) error { func (d *Daemon) triggerShutdown(ctx context.Context, reason string) error { d.log.Printf("triggering shutdown: %s", reason) + d.exporter.MarkShutdown(reason) + if d.cfg.Coordination.ForwardShutdownHost != "" { + if err := d.forwardShutdown(ctx, reason); err == nil { + d.log.Printf("shutdown trigger forwarded to %s", d.cfg.Coordination.ForwardShutdownHost) + return nil + } else if !d.cfg.Coordination.FallbackLocalShutdown { + return fmt.Errorf("forward shutdown failed and local fallback disabled: %w", err) + } else { + d.log.Printf("warning: forward shutdown failed; falling back to local shutdown: %v", err) + } + } return d.orch.Shutdown(ctx, cluster.ShutdownOptions{Reason: reason}) } + +func (d *Daemon) forwardShutdown(ctx context.Context, reason string) error { + userHost := d.cfg.Coordination.ForwardShutdownHost + if d.cfg.Coordination.ForwardShutdownUser != "" { + userHost = d.cfg.Coordination.ForwardShutdownUser + "@" + userHost + } + timeout := time.Duration(d.cfg.Coordination.CommandTimeoutSeconds) * time.Second + if timeout <= 0 { + timeout = 25 * time.Second + } + runCtx, cancel := context.WithTimeout(ctx, timeout) + defer cancel() + + remoteCmd := fmt.Sprintf( + "sudo /usr/local/bin/hecate shutdown --config %q --execute --reason %q", + d.cfg.Coordination.ForwardShutdownConfig, + reason, + ) + cmd := exec.CommandContext(runCtx, "ssh", "-o", "BatchMode=yes", "-o", "ConnectTimeout=8", userHost, remoteCmd) + out, err := cmd.CombinedOutput() + if err != nil { + trimmed := strings.TrimSpace(string(out)) + if trimmed == "" { + return fmt.Errorf("forward shutdown via ssh failed: %w", err) + } + return fmt.Errorf("forward shutdown via ssh failed: %w: %s", err, trimmed) + } + return nil +} + +func (d *Daemon) targetList() string { + names := make([]string, 0, len(d.targets)) + for _, t := range d.targets { + names = append(names, t.Name+"="+t.Target) + } + return strings.Join(names, ",") +} + +func (d *Daemon) startMetricsServer() error { + if d.cfg.Metrics.BindAddr == "" { + return fmt.Errorf("metrics.bind_addr must not be empty when metrics are enabled") + } + handler := d.exporter.Handler(d.cfg.Metrics.Path) + srv := &http.Server{ + Addr: d.cfg.Metrics.BindAddr, + Handler: handler, + ReadHeaderTimeout: 5 * time.Second, + } + go func() { + d.log.Printf("metrics server listening on %s%s", d.cfg.Metrics.BindAddr, d.cfg.Metrics.Path) + if err := srv.ListenAndServe(); err != nil && err != http.ErrServerClosed { + d.log.Printf("warning: metrics server failed: %v", err) + } + }() + return nil +} diff --git a/scripts/hecate-self-update.sh b/scripts/hecate-self-update.sh new file mode 100644 index 0000000..ff66f80 --- /dev/null +++ b/scripts/hecate-self-update.sh @@ -0,0 +1,26 @@ +#!/usr/bin/env bash +set -euo pipefail + +if [[ "${EUID}" -ne 0 ]]; then + echo "hecate-self-update.sh must run as root" >&2 + exit 1 +fi + +REPO_URL="${HECATE_REPO_URL:-https://scm.bstein.dev/bstein/hecate.git}" +BRANCH="${HECATE_REPO_BRANCH:-main}" +REPO_DIR="${HECATE_REPO_DIR:-/opt/hecate}" + +mkdir -p "$(dirname "${REPO_DIR}")" +if [[ ! -d "${REPO_DIR}/.git" ]]; then + echo "[self-update] cloning ${REPO_URL} into ${REPO_DIR}" + git clone "${REPO_URL}" "${REPO_DIR}" +fi + +cd "${REPO_DIR}" +echo "[self-update] syncing ${BRANCH}" +git fetch origin --prune +git checkout "${BRANCH}" +git reset --hard "origin/${BRANCH}" + +echo "[self-update] running installer" +"${REPO_DIR}/scripts/install.sh" diff --git a/scripts/install.sh b/scripts/install.sh index 7a845c0..3a1ac01 100755 --- a/scripts/install.sh +++ b/scripts/install.sh @@ -11,7 +11,10 @@ BIN_DIR="/usr/local/bin" CONF_DIR="/etc/hecate" STATE_DIR="/var/lib/hecate" SYSTEMD_DIR="/etc/systemd/system" +LIB_DIR="/usr/local/lib/hecate" START_NOW=1 +INSTALL_DEPS=1 +ENABLE_BOOTSTRAP="${HECATE_ENABLE_BOOTSTRAP:-0}" while [[ $# -gt 0 ]]; do case "$1" in @@ -19,6 +22,10 @@ while [[ $# -gt 0 ]]; do START_NOW=0 shift ;; + --skip-deps) + INSTALL_DEPS=0 + shift + ;; *) echo "Unknown argument: $1" >&2 exit 1 @@ -26,6 +33,59 @@ while [[ $# -gt 0 ]]; do esac done +ensure_apt_packages() { + local missing=() + for pkg in "$@"; do + if ! dpkg -s "${pkg}" >/dev/null 2>&1; then + missing+=("${pkg}") + fi + done + if [[ ${#missing[@]} -eq 0 ]]; then + return 0 + fi + echo "[install] apt install: ${missing[*]}" + export DEBIAN_FRONTEND=noninteractive + apt-get update -y + apt-get install -y "${missing[@]}" +} + +install_kubectl_if_missing() { + if command -v kubectl >/dev/null 2>&1; then + return 0 + fi + ensure_apt_packages kubernetes-client || true + if command -v kubectl >/dev/null 2>&1; then + return 0 + fi + echo "[install] installing kubectl via upstream binary" + local arch + arch="$(uname -m)" + case "${arch}" in + x86_64) arch="amd64" ;; + aarch64|arm64) arch="arm64" ;; + *) echo "Unsupported arch for kubectl install: ${arch}" >&2; return 1 ;; + esac + local version + version="$(curl -fsSL https://dl.k8s.io/release/stable.txt)" + curl -fsSL -o /usr/local/bin/kubectl "https://dl.k8s.io/release/${version}/bin/linux/${arch}/kubectl" + chmod 0755 /usr/local/bin/kubectl +} + +ensure_dependencies() { + if [[ "${INSTALL_DEPS}" -eq 0 ]]; then + echo "[install] skipping dependency installation" + return 0 + fi + if ! command -v apt-get >/dev/null 2>&1; then + echo "This installer currently supports apt-based hosts only." >&2 + exit 1 + fi + ensure_apt_packages ca-certificates curl git openssh-client jq nut-client nut-server nut-monitor golang-go + install_kubectl_if_missing +} + +ensure_dependencies + echo "[install] building hecate" cd "${REPO_DIR}" mkdir -p dist @@ -38,6 +98,7 @@ install -m 0755 dist/hecate "${BIN_DIR}/hecate" echo "[install] installing config + state dirs" install -d -m 0750 "${CONF_DIR}" install -d -m 0750 "${STATE_DIR}" +install -d -m 0755 "${LIB_DIR}" if [[ ! -f "${CONF_DIR}/hecate.yaml" ]]; then install -m 0640 configs/hecate.example.yaml "${CONF_DIR}/hecate.yaml" echo "[install] wrote default config to ${CONF_DIR}/hecate.yaml" @@ -48,12 +109,21 @@ fi echo "[install] installing systemd units" install -m 0644 deploy/systemd/hecate.service "${SYSTEMD_DIR}/hecate.service" install -m 0644 deploy/systemd/hecate-bootstrap.service "${SYSTEMD_DIR}/hecate-bootstrap.service" +install -m 0644 deploy/systemd/hecate-update.service "${SYSTEMD_DIR}/hecate-update.service" +install -m 0644 deploy/systemd/hecate-update.timer "${SYSTEMD_DIR}/hecate-update.timer" +install -m 0755 scripts/hecate-self-update.sh "${LIB_DIR}/hecate-self-update.sh" systemctl daemon-reload -systemctl enable hecate.service hecate-bootstrap.service +systemctl enable hecate.service hecate-update.timer +if [[ "${ENABLE_BOOTSTRAP}" == "1" ]]; then + systemctl enable hecate-bootstrap.service +else + systemctl disable hecate-bootstrap.service >/dev/null 2>&1 || true +fi if [[ "${START_NOW}" -eq 1 ]]; then systemctl restart hecate.service + systemctl restart hecate-update.timer echo "[install] hecate.service restarted" fi @@ -62,4 +132,5 @@ echo "Next steps:" echo " 1. Edit /etc/hecate/hecate.yaml" echo " 2. Run: hecate status --config /etc/hecate/hecate.yaml" echo " 3. Test dry run: hecate startup --config /etc/hecate/hecate.yaml" -echo " 4. Trigger bootstrap now: systemctl start hecate-bootstrap.service" +echo " 4. Trigger bootstrap now (db host): systemctl start hecate-bootstrap.service" +echo " 5. Trigger self-update now: systemctl start hecate-update.service"