recovery(ananke): trigger earlier on battery outages

This commit is contained in:
codex 2026-05-05 10:53:00 -03:00
parent 1f656de5df
commit b7f7486350
8 changed files with 162 additions and 5 deletions

View File

@ -175,6 +175,7 @@ ups:
target: pyrphoros@localhost target: pyrphoros@localhost
poll_seconds: 5 poll_seconds: 5
runtime_safety_factor: 1.25 runtime_safety_factor: 1.25
on_battery_grace_seconds: 90
debounce_count: 3 debounce_count: 3
telemetry_timeout_seconds: 90 telemetry_timeout_seconds: 90
coordination: coordination:

View File

@ -122,7 +122,47 @@ startup:
- titan-0b - titan-0b
- titan-0c - titan-0c
required_node_labels: required_node_labels:
titan-04:
node-role.kubernetes.io/worker: "true"
longhorn-host: "true"
titan-05:
node-role.kubernetes.io/worker: "true"
longhorn-host: "true"
titan-06:
node-role.kubernetes.io/worker: "true"
longhorn-host: "true"
titan-07:
node-role.kubernetes.io/worker: "true"
longhorn-host: "true"
titan-08:
node-role.kubernetes.io/worker: "true"
longhorn-host: "true"
titan-11:
node-role.kubernetes.io/worker: "true"
longhorn-host: "true"
titan-12:
node-role.kubernetes.io/worker: "true"
longhorn-host: "true"
titan-13:
node-role.kubernetes.io/worker: "true"
longhorn-host: "true"
titan-14:
node-role.kubernetes.io/worker: "true"
longhorn-host: "true"
titan-15:
node-role.kubernetes.io/worker: "true"
longhorn-host: "true"
titan-17:
node-role.kubernetes.io/worker: "true"
longhorn-host: "true"
titan-18:
node-role.kubernetes.io/worker: "true"
longhorn-host: "true"
titan-19:
node-role.kubernetes.io/worker: "true"
longhorn-host: "true"
titan-09: titan-09:
node-role.kubernetes.io/worker: "true"
ananke.bstein.dev/harbor-bootstrap: "true" ananke.bstein.dev/harbor-bootstrap: "true"
require_time_sync: true require_time_sync: true
time_sync_wait_seconds: 240 time_sync_wait_seconds: 240
@ -266,6 +306,7 @@ ups:
target: statera@localhost target: statera@localhost
poll_seconds: 5 poll_seconds: 5
runtime_safety_factor: 1.25 runtime_safety_factor: 1.25
on_battery_grace_seconds: 90
debounce_count: 3 debounce_count: 3
telemetry_timeout_seconds: 90 telemetry_timeout_seconds: 90
coordination: coordination:

View File

@ -122,7 +122,47 @@ startup:
- titan-0b - titan-0b
- titan-0c - titan-0c
required_node_labels: required_node_labels:
titan-04:
node-role.kubernetes.io/worker: "true"
longhorn-host: "true"
titan-05:
node-role.kubernetes.io/worker: "true"
longhorn-host: "true"
titan-06:
node-role.kubernetes.io/worker: "true"
longhorn-host: "true"
titan-07:
node-role.kubernetes.io/worker: "true"
longhorn-host: "true"
titan-08:
node-role.kubernetes.io/worker: "true"
longhorn-host: "true"
titan-11:
node-role.kubernetes.io/worker: "true"
longhorn-host: "true"
titan-12:
node-role.kubernetes.io/worker: "true"
longhorn-host: "true"
titan-13:
node-role.kubernetes.io/worker: "true"
longhorn-host: "true"
titan-14:
node-role.kubernetes.io/worker: "true"
longhorn-host: "true"
titan-15:
node-role.kubernetes.io/worker: "true"
longhorn-host: "true"
titan-17:
node-role.kubernetes.io/worker: "true"
longhorn-host: "true"
titan-18:
node-role.kubernetes.io/worker: "true"
longhorn-host: "true"
titan-19:
node-role.kubernetes.io/worker: "true"
longhorn-host: "true"
titan-09: titan-09:
node-role.kubernetes.io/worker: "true"
ananke.bstein.dev/harbor-bootstrap: "true" ananke.bstein.dev/harbor-bootstrap: "true"
require_time_sync: true require_time_sync: true
time_sync_wait_seconds: 240 time_sync_wait_seconds: 240
@ -266,6 +306,7 @@ ups:
target: pyrphoros@localhost target: pyrphoros@localhost
poll_seconds: 5 poll_seconds: 5
runtime_safety_factor: 1.25 runtime_safety_factor: 1.25
on_battery_grace_seconds: 90
debounce_count: 3 debounce_count: 3
telemetry_timeout_seconds: 90 telemetry_timeout_seconds: 90
coordination: coordination:

View File

@ -141,6 +141,7 @@ type UPS struct {
Targets []UPSTarget `yaml:"targets"` Targets []UPSTarget `yaml:"targets"`
PollSeconds int `yaml:"poll_seconds"` PollSeconds int `yaml:"poll_seconds"`
RuntimeSafetyFactor float64 `yaml:"runtime_safety_factor"` RuntimeSafetyFactor float64 `yaml:"runtime_safety_factor"`
OnBatteryGraceSeconds int `yaml:"on_battery_grace_seconds"`
DebounceCount int `yaml:"debounce_count"` DebounceCount int `yaml:"debounce_count"`
TelemetryTimeoutSeconds int `yaml:"telemetry_timeout_seconds"` TelemetryTimeoutSeconds int `yaml:"telemetry_timeout_seconds"`
} }

View File

@ -326,6 +326,9 @@ func (c Config) Validate() error {
if c.UPS.Provider == "" { if c.UPS.Provider == "" {
return fmt.Errorf("config.ups.provider must not be empty when ups is enabled") return fmt.Errorf("config.ups.provider must not be empty when ups is enabled")
} }
if c.UPS.OnBatteryGraceSeconds < 0 {
return fmt.Errorf("config.ups.on_battery_grace_seconds must be >= 0")
}
if c.UPS.Target == "" && len(c.UPS.Targets) == 0 { if c.UPS.Target == "" && len(c.UPS.Targets) == 0 {
return fmt.Errorf("config.ups.target or config.ups.targets must be set when ups is enabled") return fmt.Errorf("config.ups.target or config.ups.targets must be set when ups is enabled")
} }

View File

@ -94,6 +94,7 @@ func TestValidateRejectsInvalidFieldsMatrix(t *testing.T) {
{"bad_empty_vault_key_file", func(c *Config) { c.Startup.VaultUnsealKeyFile = "" }}, {"bad_empty_vault_key_file", func(c *Config) { c.Startup.VaultUnsealKeyFile = "" }},
{"bad_ssh_port_low", func(c *Config) { c.SSHPort = 0 }}, {"bad_ssh_port_low", func(c *Config) { c.SSHPort = 0 }},
{"bad_ups_provider", func(c *Config) { c.UPS.Enabled = true; c.UPS.Provider = "" }}, {"bad_ups_provider", func(c *Config) { c.UPS.Enabled = true; c.UPS.Provider = "" }},
{"bad_ups_on_battery_grace_negative", func(c *Config) { c.UPS.Enabled = true; c.UPS.OnBatteryGraceSeconds = -1 }},
{"bad_ups_target_empty", func(c *Config) { c.UPS.Enabled = true; c.UPS.Provider = "nut"; c.UPS.Target = ""; c.UPS.Targets = nil }}, {"bad_ups_target_empty", func(c *Config) { c.UPS.Enabled = true; c.UPS.Provider = "nut"; c.UPS.Target = ""; c.UPS.Targets = nil }},
{"bad_ups_targets_item_empty", func(c *Config) { {"bad_ups_targets_item_empty", func(c *Config) {
c.UPS.Enabled = true c.UPS.Enabled = true

View File

@ -92,6 +92,7 @@ func (d *Daemon) Run(ctx context.Context) error {
lastGood := map[string]time.Time{} lastGood := map[string]time.Time{}
lastOnBattery := map[string]bool{} lastOnBattery := map[string]bool{}
onBatterySince := map[string]time.Time{}
breachCount := map[string]int{} breachCount := map[string]int{}
for _, t := range d.targets { for _, t := range d.targets {
lastGood[t.Name] = time.Now() lastGood[t.Name] = time.Now()
@ -131,17 +132,42 @@ func (d *Daemon) Run(ctx context.Context) error {
} }
lastGood[target.Name] = time.Now() lastGood[target.Name] = time.Now()
wasOnBattery := lastOnBattery[target.Name]
if sample.OnBattery {
if !wasOnBattery || onBatterySince[target.Name].IsZero() {
onBatterySince[target.Name] = time.Now()
}
} else {
onBatterySince[target.Name] = time.Time{}
}
lastOnBattery[target.Name] = sample.OnBattery lastOnBattery[target.Name] = sample.OnBattery
trigger := sample.LowBattery || (sample.OnBattery && sample.RuntimeSeconds > 0 && sample.RuntimeSeconds <= threshold) onBatteryElapsed := 0
if sample.OnBattery && !onBatterySince[target.Name].IsZero() {
onBatteryElapsed = int(time.Since(onBatterySince[target.Name]).Seconds())
}
trigger := false
triggerReason := ""
switch {
case sample.LowBattery:
trigger = true
triggerReason = fmt.Sprintf("ups-low-battery target=%s status=%s", target.Name, sample.RawStatus)
case sample.OnBattery && sample.RuntimeSeconds > 0 && sample.RuntimeSeconds <= threshold:
trigger = true
triggerReason = fmt.Sprintf("ups-threshold target=%s runtime=%ds threshold=%ds status=%s", target.Name, sample.RuntimeSeconds, threshold, sample.RawStatus)
case sample.OnBattery && d.cfg.UPS.OnBatteryGraceSeconds > 0 && onBatteryElapsed >= d.cfg.UPS.OnBatteryGraceSeconds:
trigger = true
triggerReason = fmt.Sprintf("ups-on-battery target=%s elapsed=%ds grace=%ds status=%s", target.Name, onBatteryElapsed, d.cfg.UPS.OnBatteryGraceSeconds, sample.RawStatus)
}
if trigger { if trigger {
breachCount[target.Name]++ breachCount[target.Name]++
} else { } else {
breachCount[target.Name] = 0 breachCount[target.Name] = 0
} }
d.log.Printf("ups target=%s status=%s on_battery=%t runtime_s=%d threshold_s=%d budget_s=%d trigger=%t breach=%d", d.log.Printf("ups target=%s status=%s on_battery=%t on_battery_s=%d grace_s=%d runtime_s=%d threshold_s=%d budget_s=%d trigger=%t breach=%d",
target.Name, sample.RawStatus, sample.OnBattery, sample.RuntimeSeconds, threshold, budget, trigger, breachCount[target.Name]) target.Name, sample.RawStatus, sample.OnBattery, onBatteryElapsed, d.cfg.UPS.OnBatteryGraceSeconds, sample.RuntimeSeconds, threshold, budget, trigger, breachCount[target.Name])
d.exporter.UpdateSample(metrics.Sample{ d.exporter.UpdateSample(metrics.Sample{
Name: target.Name, Name: target.Name,
@ -160,8 +186,7 @@ func (d *Daemon) Run(ctx context.Context) error {
}) })
if breachCount[target.Name] >= debounce { if breachCount[target.Name] >= debounce {
reason := fmt.Sprintf("ups-threshold target=%s runtime=%ds threshold=%ds status=%s", target.Name, sample.RuntimeSeconds, threshold, sample.RawStatus) return d.triggerShutdown(ctx, triggerReason)
return d.triggerShutdown(ctx, reason)
} }
} }
} }

View File

@ -165,6 +165,50 @@ func TestDaemonRunTriggersShutdownOnTelemetryTimeout(t *testing.T) {
} }
} }
// TestDaemonRunTriggersShutdownAfterOnBatteryGrace checks that Daemon.Run returns
// without error when a UPS target keeps reporting on-battery with a 1s grace window.
// Signature: TestDaemonRunTriggersShutdownAfterOnBatteryGrace(t *testing.T).
// Why: covers the sustained-on-battery trigger so short runtime estimates are not
// the only path to a graceful shutdown during abrupt power loss.
func TestDaemonRunTriggersShutdownAfterOnBatteryGrace(t *testing.T) {
	stateDir := t.TempDir()
	orch := newDaemonTestOrchestrator(t, stateDir)

	// Grace and debounce are both set to their smallest positive values so the
	// on-battery path can fire well inside the 5s context deadline below.
	upsCfg := config.UPS{
		Enabled:               true,
		PollSeconds:           1,
		DebounceCount:         1,
		RuntimeSafetyFactor:   1.0,
		OnBatteryGraceSeconds: 1,
	}
	cfg := config.Config{
		UPS: upsCfg,
		State: config.State{
			IntentPath: filepath.Join(stateDir, "intent.json"),
		},
		Shutdown: config.Shutdown{
			EmergencySkipDrain: true,
			EmergencySkipEtcd:  true,
		},
	}

	// The sample is on battery, not low-battery, with a large runtime estimate;
	// the intent is that only the grace-period trigger applies (the runtime
	// threshold is computed inside Run and is assumed to be far below 9999s).
	onBattery := ups.Sample{
		OnBattery:      true,
		LowBattery:     false,
		RuntimeSeconds: 9999,
		RawStatus:      "OB",
	}
	provider := &daemonFakeProvider{
		samples: []ups.Sample{onBattery},
	}
	target := Target{
		Name:     "Pyrphoros",
		Target:   "pyrphoros@localhost",
		Provider: provider,
	}

	d := &Daemon{
		cfg:      cfg,
		orch:     orch,
		targets:  []Target{target},
		log:      log.New(io.Discard, "", 0),
		exporter: metrics.New(),
	}

	ctx, cancel := context.WithTimeout(context.Background(), 5*time.Second)
	defer cancel()

	err := d.Run(ctx)
	if err != nil {
		t.Fatalf("expected sustained-on-battery shutdown path to complete, got %v", err)
	}
}
// TestForwardShutdownSucceedsWithSSHShim runs one orchestration or CLI step. // TestForwardShutdownSucceedsWithSSHShim runs one orchestration or CLI step.
// Signature: TestForwardShutdownSucceedsWithSSHShim(t *testing.T). // Signature: TestForwardShutdownSucceedsWithSSHShim(t *testing.T).
// Why: covers forward-shutdown SSH execution path. // Why: covers forward-shutdown SSH execution path.