diff --git a/configs/ananke.example.yaml b/configs/ananke.example.yaml index e12a1bf..1472e48 100644 --- a/configs/ananke.example.yaml +++ b/configs/ananke.example.yaml @@ -175,6 +175,7 @@ ups: target: pyrphoros@localhost poll_seconds: 5 runtime_safety_factor: 1.25 + on_battery_grace_seconds: 90 debounce_count: 3 telemetry_timeout_seconds: 90 coordination: diff --git a/configs/ananke.tethys.yaml b/configs/ananke.tethys.yaml index e8dfcba..2c08b0e 100644 --- a/configs/ananke.tethys.yaml +++ b/configs/ananke.tethys.yaml @@ -122,7 +122,47 @@ startup: - titan-0b - titan-0c required_node_labels: + titan-04: + node-role.kubernetes.io/worker: "true" + longhorn-host: "true" + titan-05: + node-role.kubernetes.io/worker: "true" + longhorn-host: "true" + titan-06: + node-role.kubernetes.io/worker: "true" + longhorn-host: "true" + titan-07: + node-role.kubernetes.io/worker: "true" + longhorn-host: "true" + titan-08: + node-role.kubernetes.io/worker: "true" + longhorn-host: "true" + titan-11: + node-role.kubernetes.io/worker: "true" + longhorn-host: "true" + titan-12: + node-role.kubernetes.io/worker: "true" + longhorn-host: "true" + titan-13: + node-role.kubernetes.io/worker: "true" + longhorn-host: "true" + titan-14: + node-role.kubernetes.io/worker: "true" + longhorn-host: "true" + titan-15: + node-role.kubernetes.io/worker: "true" + longhorn-host: "true" + titan-17: + node-role.kubernetes.io/worker: "true" + longhorn-host: "true" + titan-18: + node-role.kubernetes.io/worker: "true" + longhorn-host: "true" + titan-19: + node-role.kubernetes.io/worker: "true" + longhorn-host: "true" titan-09: + node-role.kubernetes.io/worker: "true" ananke.bstein.dev/harbor-bootstrap: "true" require_time_sync: true time_sync_wait_seconds: 240 @@ -266,6 +306,7 @@ ups: target: statera@localhost poll_seconds: 5 runtime_safety_factor: 1.25 + on_battery_grace_seconds: 90 debounce_count: 3 telemetry_timeout_seconds: 90 coordination: diff --git a/configs/ananke.titan-db.yaml 
b/configs/ananke.titan-db.yaml index 680c7ac..bd61fb6 100644 --- a/configs/ananke.titan-db.yaml +++ b/configs/ananke.titan-db.yaml @@ -122,7 +122,47 @@ startup: - titan-0b - titan-0c required_node_labels: + titan-04: + node-role.kubernetes.io/worker: "true" + longhorn-host: "true" + titan-05: + node-role.kubernetes.io/worker: "true" + longhorn-host: "true" + titan-06: + node-role.kubernetes.io/worker: "true" + longhorn-host: "true" + titan-07: + node-role.kubernetes.io/worker: "true" + longhorn-host: "true" + titan-08: + node-role.kubernetes.io/worker: "true" + longhorn-host: "true" + titan-11: + node-role.kubernetes.io/worker: "true" + longhorn-host: "true" + titan-12: + node-role.kubernetes.io/worker: "true" + longhorn-host: "true" + titan-13: + node-role.kubernetes.io/worker: "true" + longhorn-host: "true" + titan-14: + node-role.kubernetes.io/worker: "true" + longhorn-host: "true" + titan-15: + node-role.kubernetes.io/worker: "true" + longhorn-host: "true" + titan-17: + node-role.kubernetes.io/worker: "true" + longhorn-host: "true" + titan-18: + node-role.kubernetes.io/worker: "true" + longhorn-host: "true" + titan-19: + node-role.kubernetes.io/worker: "true" + longhorn-host: "true" titan-09: + node-role.kubernetes.io/worker: "true" ananke.bstein.dev/harbor-bootstrap: "true" require_time_sync: true time_sync_wait_seconds: 240 @@ -266,6 +306,7 @@ ups: target: pyrphoros@localhost poll_seconds: 5 runtime_safety_factor: 1.25 + on_battery_grace_seconds: 90 debounce_count: 3 telemetry_timeout_seconds: 90 coordination: diff --git a/internal/config/types.go b/internal/config/types.go index 1ae797c..cd8eea1 100644 --- a/internal/config/types.go +++ b/internal/config/types.go @@ -141,6 +141,7 @@ type UPS struct { Targets []UPSTarget `yaml:"targets"` PollSeconds int `yaml:"poll_seconds"` RuntimeSafetyFactor float64 `yaml:"runtime_safety_factor"` + OnBatteryGraceSeconds int `yaml:"on_battery_grace_seconds"` DebounceCount int `yaml:"debounce_count"` TelemetryTimeoutSeconds 
int `yaml:"telemetry_timeout_seconds"` } diff --git a/internal/config/validate.go b/internal/config/validate.go index e1123e9..1db2690 100644 --- a/internal/config/validate.go +++ b/internal/config/validate.go @@ -326,6 +326,9 @@ func (c Config) Validate() error { if c.UPS.Provider == "" { return fmt.Errorf("config.ups.provider must not be empty when ups is enabled") } + if c.UPS.OnBatteryGraceSeconds < 0 { + return fmt.Errorf("config.ups.on_battery_grace_seconds must be >= 0") + } if c.UPS.Target == "" && len(c.UPS.Targets) == 0 { return fmt.Errorf("config.ups.target or config.ups.targets must be set when ups is enabled") } diff --git a/internal/config/validate_matrix_test.go b/internal/config/validate_matrix_test.go index 1d746f9..ff01346 100644 --- a/internal/config/validate_matrix_test.go +++ b/internal/config/validate_matrix_test.go @@ -94,6 +94,7 @@ func TestValidateRejectsInvalidFieldsMatrix(t *testing.T) { {"bad_empty_vault_key_file", func(c *Config) { c.Startup.VaultUnsealKeyFile = "" }}, {"bad_ssh_port_low", func(c *Config) { c.SSHPort = 0 }}, {"bad_ups_provider", func(c *Config) { c.UPS.Enabled = true; c.UPS.Provider = "" }}, + {"bad_ups_on_battery_grace_negative", func(c *Config) { c.UPS.Enabled = true; c.UPS.OnBatteryGraceSeconds = -1 }}, {"bad_ups_target_empty", func(c *Config) { c.UPS.Enabled = true; c.UPS.Provider = "nut"; c.UPS.Target = ""; c.UPS.Targets = nil }}, {"bad_ups_targets_item_empty", func(c *Config) { c.UPS.Enabled = true diff --git a/internal/service/daemon.go b/internal/service/daemon.go index 8649aa1..0c48fb9 100644 --- a/internal/service/daemon.go +++ b/internal/service/daemon.go @@ -92,6 +92,7 @@ func (d *Daemon) Run(ctx context.Context) error { lastGood := map[string]time.Time{} lastOnBattery := map[string]bool{} + onBatterySince := map[string]time.Time{} breachCount := map[string]int{} for _, t := range d.targets { lastGood[t.Name] = time.Now() @@ -131,17 +132,42 @@ func (d *Daemon) Run(ctx context.Context) error { } 
lastGood[target.Name] = time.Now() + wasOnBattery := lastOnBattery[target.Name] + if sample.OnBattery { + if !wasOnBattery || onBatterySince[target.Name].IsZero() { + onBatterySince[target.Name] = time.Now() + } + } else { + onBatterySince[target.Name] = time.Time{} + } lastOnBattery[target.Name] = sample.OnBattery - trigger := sample.LowBattery || (sample.OnBattery && sample.RuntimeSeconds > 0 && sample.RuntimeSeconds <= threshold) + onBatteryElapsed := 0 + if sample.OnBattery && !onBatterySince[target.Name].IsZero() { + onBatteryElapsed = int(time.Since(onBatterySince[target.Name]).Seconds()) + } + + trigger := false + triggerReason := "" + switch { + case sample.LowBattery: + trigger = true + triggerReason = fmt.Sprintf("ups-low-battery target=%s status=%s", target.Name, sample.RawStatus) + case sample.OnBattery && sample.RuntimeSeconds > 0 && sample.RuntimeSeconds <= threshold: + trigger = true + triggerReason = fmt.Sprintf("ups-threshold target=%s runtime=%ds threshold=%ds status=%s", target.Name, sample.RuntimeSeconds, threshold, sample.RawStatus) + case sample.OnBattery && d.cfg.UPS.OnBatteryGraceSeconds > 0 && onBatteryElapsed >= d.cfg.UPS.OnBatteryGraceSeconds: + trigger = true + triggerReason = fmt.Sprintf("ups-on-battery target=%s elapsed=%ds grace=%ds status=%s", target.Name, onBatteryElapsed, d.cfg.UPS.OnBatteryGraceSeconds, sample.RawStatus) + } if trigger { breachCount[target.Name]++ } else { breachCount[target.Name] = 0 } - d.log.Printf("ups target=%s status=%s on_battery=%t runtime_s=%d threshold_s=%d budget_s=%d trigger=%t breach=%d", - target.Name, sample.RawStatus, sample.OnBattery, sample.RuntimeSeconds, threshold, budget, trigger, breachCount[target.Name]) + d.log.Printf("ups target=%s status=%s on_battery=%t on_battery_s=%d grace_s=%d runtime_s=%d threshold_s=%d budget_s=%d trigger=%t breach=%d", + target.Name, sample.RawStatus, sample.OnBattery, onBatteryElapsed, d.cfg.UPS.OnBatteryGraceSeconds, sample.RuntimeSeconds, threshold, budget, 
trigger, breachCount[target.Name]) d.exporter.UpdateSample(metrics.Sample{ Name: target.Name, @@ -160,8 +186,7 @@ func (d *Daemon) Run(ctx context.Context) error { }) if breachCount[target.Name] >= debounce { - reason := fmt.Sprintf("ups-threshold target=%s runtime=%ds threshold=%ds status=%s", target.Name, sample.RuntimeSeconds, threshold, sample.RawStatus) - return d.triggerShutdown(ctx, reason) + return d.triggerShutdown(ctx, triggerReason) } } } diff --git a/internal/service/daemon_additional_test.go b/internal/service/daemon_additional_test.go index acd5c76..f6c2c6a 100644 --- a/internal/service/daemon_additional_test.go +++ b/internal/service/daemon_additional_test.go @@ -165,6 +165,50 @@ func TestDaemonRunTriggersShutdownOnTelemetryTimeout(t *testing.T) { } } +// TestDaemonRunTriggersShutdownAfterOnBatteryGrace checks that Run completes the sustained-on-battery shutdown path once the on-battery grace period elapses. +// Signature: TestDaemonRunTriggersShutdownAfterOnBatteryGrace(t *testing.T). +// Why: covers the sustained-on-battery trigger so short runtime estimates are not +// the only path to a graceful shutdown during abrupt power loss. 
+func TestDaemonRunTriggersShutdownAfterOnBatteryGrace(t *testing.T) { + stateDir := t.TempDir() + orch := newDaemonTestOrchestrator(t, stateDir) + d := &Daemon{ + cfg: config.Config{ + UPS: config.UPS{ + Enabled: true, + PollSeconds: 1, + DebounceCount: 1, + RuntimeSafetyFactor: 1.0, + OnBatteryGraceSeconds: 1, + }, + State: config.State{ + IntentPath: filepath.Join(stateDir, "intent.json"), + }, + Shutdown: config.Shutdown{ + EmergencySkipDrain: true, + EmergencySkipEtcd: true, + }, + }, + orch: orch, + targets: []Target{ + { + Name: "Pyrphoros", + Target: "pyrphoros@localhost", + Provider: &daemonFakeProvider{ + samples: []ups.Sample{{OnBattery: true, LowBattery: false, RuntimeSeconds: 9999, RawStatus: "OB"}}, + }, + }, + }, + log: log.New(io.Discard, "", 0), + exporter: metrics.New(), + } + ctx, cancel := context.WithTimeout(context.Background(), 5*time.Second) + defer cancel() + if err := d.Run(ctx); err != nil { + t.Fatalf("expected sustained-on-battery shutdown path to complete, got %v", err) + } +} + // TestForwardShutdownSucceedsWithSSHShim runs one orchestration or CLI step. // Signature: TestForwardShutdownSucceedsWithSSHShim(t *testing.T). // Why: covers forward-shutdown SSH execution path.