recovery(ananke): trigger earlier on battery outages
This commit is contained in:
parent
1f656de5df
commit
b7f7486350
@ -175,6 +175,7 @@ ups:
|
||||
target: pyrphoros@localhost
|
||||
poll_seconds: 5
|
||||
runtime_safety_factor: 1.25
|
||||
on_battery_grace_seconds: 90
|
||||
debounce_count: 3
|
||||
telemetry_timeout_seconds: 90
|
||||
coordination:
|
||||
|
||||
@ -122,7 +122,47 @@ startup:
|
||||
- titan-0b
|
||||
- titan-0c
|
||||
required_node_labels:
|
||||
titan-04:
|
||||
node-role.kubernetes.io/worker: "true"
|
||||
longhorn-host: "true"
|
||||
titan-05:
|
||||
node-role.kubernetes.io/worker: "true"
|
||||
longhorn-host: "true"
|
||||
titan-06:
|
||||
node-role.kubernetes.io/worker: "true"
|
||||
longhorn-host: "true"
|
||||
titan-07:
|
||||
node-role.kubernetes.io/worker: "true"
|
||||
longhorn-host: "true"
|
||||
titan-08:
|
||||
node-role.kubernetes.io/worker: "true"
|
||||
longhorn-host: "true"
|
||||
titan-11:
|
||||
node-role.kubernetes.io/worker: "true"
|
||||
longhorn-host: "true"
|
||||
titan-12:
|
||||
node-role.kubernetes.io/worker: "true"
|
||||
longhorn-host: "true"
|
||||
titan-13:
|
||||
node-role.kubernetes.io/worker: "true"
|
||||
longhorn-host: "true"
|
||||
titan-14:
|
||||
node-role.kubernetes.io/worker: "true"
|
||||
longhorn-host: "true"
|
||||
titan-15:
|
||||
node-role.kubernetes.io/worker: "true"
|
||||
longhorn-host: "true"
|
||||
titan-17:
|
||||
node-role.kubernetes.io/worker: "true"
|
||||
longhorn-host: "true"
|
||||
titan-18:
|
||||
node-role.kubernetes.io/worker: "true"
|
||||
longhorn-host: "true"
|
||||
titan-19:
|
||||
node-role.kubernetes.io/worker: "true"
|
||||
longhorn-host: "true"
|
||||
titan-09:
|
||||
node-role.kubernetes.io/worker: "true"
|
||||
ananke.bstein.dev/harbor-bootstrap: "true"
|
||||
require_time_sync: true
|
||||
time_sync_wait_seconds: 240
|
||||
@ -266,6 +306,7 @@ ups:
|
||||
target: statera@localhost
|
||||
poll_seconds: 5
|
||||
runtime_safety_factor: 1.25
|
||||
on_battery_grace_seconds: 90
|
||||
debounce_count: 3
|
||||
telemetry_timeout_seconds: 90
|
||||
coordination:
|
||||
|
||||
@ -122,7 +122,47 @@ startup:
|
||||
- titan-0b
|
||||
- titan-0c
|
||||
required_node_labels:
|
||||
titan-04:
|
||||
node-role.kubernetes.io/worker: "true"
|
||||
longhorn-host: "true"
|
||||
titan-05:
|
||||
node-role.kubernetes.io/worker: "true"
|
||||
longhorn-host: "true"
|
||||
titan-06:
|
||||
node-role.kubernetes.io/worker: "true"
|
||||
longhorn-host: "true"
|
||||
titan-07:
|
||||
node-role.kubernetes.io/worker: "true"
|
||||
longhorn-host: "true"
|
||||
titan-08:
|
||||
node-role.kubernetes.io/worker: "true"
|
||||
longhorn-host: "true"
|
||||
titan-11:
|
||||
node-role.kubernetes.io/worker: "true"
|
||||
longhorn-host: "true"
|
||||
titan-12:
|
||||
node-role.kubernetes.io/worker: "true"
|
||||
longhorn-host: "true"
|
||||
titan-13:
|
||||
node-role.kubernetes.io/worker: "true"
|
||||
longhorn-host: "true"
|
||||
titan-14:
|
||||
node-role.kubernetes.io/worker: "true"
|
||||
longhorn-host: "true"
|
||||
titan-15:
|
||||
node-role.kubernetes.io/worker: "true"
|
||||
longhorn-host: "true"
|
||||
titan-17:
|
||||
node-role.kubernetes.io/worker: "true"
|
||||
longhorn-host: "true"
|
||||
titan-18:
|
||||
node-role.kubernetes.io/worker: "true"
|
||||
longhorn-host: "true"
|
||||
titan-19:
|
||||
node-role.kubernetes.io/worker: "true"
|
||||
longhorn-host: "true"
|
||||
titan-09:
|
||||
node-role.kubernetes.io/worker: "true"
|
||||
ananke.bstein.dev/harbor-bootstrap: "true"
|
||||
require_time_sync: true
|
||||
time_sync_wait_seconds: 240
|
||||
@ -266,6 +306,7 @@ ups:
|
||||
target: pyrphoros@localhost
|
||||
poll_seconds: 5
|
||||
runtime_safety_factor: 1.25
|
||||
on_battery_grace_seconds: 90
|
||||
debounce_count: 3
|
||||
telemetry_timeout_seconds: 90
|
||||
coordination:
|
||||
|
||||
@ -141,6 +141,7 @@ type UPS struct {
|
||||
Targets []UPSTarget `yaml:"targets"`
|
||||
PollSeconds int `yaml:"poll_seconds"`
|
||||
RuntimeSafetyFactor float64 `yaml:"runtime_safety_factor"`
|
||||
OnBatteryGraceSeconds int `yaml:"on_battery_grace_seconds"`
|
||||
DebounceCount int `yaml:"debounce_count"`
|
||||
TelemetryTimeoutSeconds int `yaml:"telemetry_timeout_seconds"`
|
||||
}
|
||||
|
||||
@ -326,6 +326,9 @@ func (c Config) Validate() error {
|
||||
if c.UPS.Provider == "" {
|
||||
return fmt.Errorf("config.ups.provider must not be empty when ups is enabled")
|
||||
}
|
||||
if c.UPS.OnBatteryGraceSeconds < 0 {
|
||||
return fmt.Errorf("config.ups.on_battery_grace_seconds must be >= 0")
|
||||
}
|
||||
if c.UPS.Target == "" && len(c.UPS.Targets) == 0 {
|
||||
return fmt.Errorf("config.ups.target or config.ups.targets must be set when ups is enabled")
|
||||
}
|
||||
|
||||
@ -94,6 +94,7 @@ func TestValidateRejectsInvalidFieldsMatrix(t *testing.T) {
|
||||
{"bad_empty_vault_key_file", func(c *Config) { c.Startup.VaultUnsealKeyFile = "" }},
|
||||
{"bad_ssh_port_low", func(c *Config) { c.SSHPort = 0 }},
|
||||
{"bad_ups_provider", func(c *Config) { c.UPS.Enabled = true; c.UPS.Provider = "" }},
|
||||
{"bad_ups_on_battery_grace_negative", func(c *Config) { c.UPS.Enabled = true; c.UPS.OnBatteryGraceSeconds = -1 }},
|
||||
{"bad_ups_target_empty", func(c *Config) { c.UPS.Enabled = true; c.UPS.Provider = "nut"; c.UPS.Target = ""; c.UPS.Targets = nil }},
|
||||
{"bad_ups_targets_item_empty", func(c *Config) {
|
||||
c.UPS.Enabled = true
|
||||
|
||||
@ -92,6 +92,7 @@ func (d *Daemon) Run(ctx context.Context) error {
|
||||
|
||||
lastGood := map[string]time.Time{}
|
||||
lastOnBattery := map[string]bool{}
|
||||
onBatterySince := map[string]time.Time{}
|
||||
breachCount := map[string]int{}
|
||||
for _, t := range d.targets {
|
||||
lastGood[t.Name] = time.Now()
|
||||
@ -131,17 +132,42 @@ func (d *Daemon) Run(ctx context.Context) error {
|
||||
}
|
||||
|
||||
lastGood[target.Name] = time.Now()
|
||||
wasOnBattery := lastOnBattery[target.Name]
|
||||
if sample.OnBattery {
|
||||
if !wasOnBattery || onBatterySince[target.Name].IsZero() {
|
||||
onBatterySince[target.Name] = time.Now()
|
||||
}
|
||||
} else {
|
||||
onBatterySince[target.Name] = time.Time{}
|
||||
}
|
||||
lastOnBattery[target.Name] = sample.OnBattery
|
||||
|
||||
trigger := sample.LowBattery || (sample.OnBattery && sample.RuntimeSeconds > 0 && sample.RuntimeSeconds <= threshold)
|
||||
onBatteryElapsed := 0
|
||||
if sample.OnBattery && !onBatterySince[target.Name].IsZero() {
|
||||
onBatteryElapsed = int(time.Since(onBatterySince[target.Name]).Seconds())
|
||||
}
|
||||
|
||||
trigger := false
|
||||
triggerReason := ""
|
||||
switch {
|
||||
case sample.LowBattery:
|
||||
trigger = true
|
||||
triggerReason = fmt.Sprintf("ups-low-battery target=%s status=%s", target.Name, sample.RawStatus)
|
||||
case sample.OnBattery && sample.RuntimeSeconds > 0 && sample.RuntimeSeconds <= threshold:
|
||||
trigger = true
|
||||
triggerReason = fmt.Sprintf("ups-threshold target=%s runtime=%ds threshold=%ds status=%s", target.Name, sample.RuntimeSeconds, threshold, sample.RawStatus)
|
||||
case sample.OnBattery && d.cfg.UPS.OnBatteryGraceSeconds > 0 && onBatteryElapsed >= d.cfg.UPS.OnBatteryGraceSeconds:
|
||||
trigger = true
|
||||
triggerReason = fmt.Sprintf("ups-on-battery target=%s elapsed=%ds grace=%ds status=%s", target.Name, onBatteryElapsed, d.cfg.UPS.OnBatteryGraceSeconds, sample.RawStatus)
|
||||
}
|
||||
if trigger {
|
||||
breachCount[target.Name]++
|
||||
} else {
|
||||
breachCount[target.Name] = 0
|
||||
}
|
||||
|
||||
d.log.Printf("ups target=%s status=%s on_battery=%t runtime_s=%d threshold_s=%d budget_s=%d trigger=%t breach=%d",
|
||||
target.Name, sample.RawStatus, sample.OnBattery, sample.RuntimeSeconds, threshold, budget, trigger, breachCount[target.Name])
|
||||
d.log.Printf("ups target=%s status=%s on_battery=%t on_battery_s=%d grace_s=%d runtime_s=%d threshold_s=%d budget_s=%d trigger=%t breach=%d",
|
||||
target.Name, sample.RawStatus, sample.OnBattery, onBatteryElapsed, d.cfg.UPS.OnBatteryGraceSeconds, sample.RuntimeSeconds, threshold, budget, trigger, breachCount[target.Name])
|
||||
|
||||
d.exporter.UpdateSample(metrics.Sample{
|
||||
Name: target.Name,
|
||||
@ -160,8 +186,7 @@ func (d *Daemon) Run(ctx context.Context) error {
|
||||
})
|
||||
|
||||
if breachCount[target.Name] >= debounce {
|
||||
reason := fmt.Sprintf("ups-threshold target=%s runtime=%ds threshold=%ds status=%s", target.Name, sample.RuntimeSeconds, threshold, sample.RawStatus)
|
||||
return d.triggerShutdown(ctx, reason)
|
||||
return d.triggerShutdown(ctx, triggerReason)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@ -165,6 +165,50 @@ func TestDaemonRunTriggersShutdownOnTelemetryTimeout(t *testing.T) {
|
||||
}
|
||||
}
|
||||
|
||||
// TestDaemonRunTriggersShutdownAfterOnBatteryGrace runs Daemon.Run against a fake
// UPS provider that reports on-battery status and fails if Run returns an error.
|
||||
// Signature: TestDaemonRunTriggersShutdownAfterOnBatteryGrace(t *testing.T).
|
||||
// Why: covers the sustained-on-battery trigger so short runtime estimates are not
|
||||
// the only path to a graceful shutdown during abrupt power loss.
|
||||
func TestDaemonRunTriggersShutdownAfterOnBatteryGrace(t *testing.T) {
|
||||
// Per-test temp directory; it holds the intent file path configured below.
stateDir := t.TempDir()
|
||||
orch := newDaemonTestOrchestrator(t, stateDir)
|
||||
d := &Daemon{
|
||||
cfg: config.Config{
|
||||
UPS: config.UPS{
|
||||
Enabled: true,
|
||||
PollSeconds: 1,
|
||||
DebounceCount: 1,
|
||||
RuntimeSafetyFactor: 1.0,
|
||||
// One-second grace so the on-battery trigger can fire well inside the
// 5s context timeout used below.
OnBatteryGraceSeconds: 1,
|
||||
},
|
||||
State: config.State{
|
||||
IntentPath: filepath.Join(stateDir, "intent.json"),
|
||||
},
|
||||
Shutdown: config.Shutdown{
|
||||
EmergencySkipDrain: true,
|
||||
EmergencySkipEtcd: true,
|
||||
},
|
||||
},
|
||||
orch: orch,
|
||||
targets: []Target{
|
||||
{
|
||||
Name: "Pyrphoros",
|
||||
Target: "pyrphoros@localhost",
|
||||
Provider: &daemonFakeProvider{
|
||||
// On battery, not low, with a large runtime estimate: intended to keep the
// low-battery and runtime-threshold triggers out of play so only the grace
// timer can fire (assumes the computed threshold stays below 9999s — TODO confirm).
// NOTE(review): presumably the fake provider keeps returning this sample on
// every poll — verify against daemonFakeProvider.
samples: []ups.Sample{{OnBattery: true, LowBattery: false, RuntimeSeconds: 9999, RawStatus: "OB"}},
|
||||
},
|
||||
},
|
||||
},
|
||||
// Discard daemon log output during the test.
log: log.New(io.Discard, "", 0),
|
||||
exporter: metrics.New(),
|
||||
}
|
||||
// Cap the run at 5s. NOTE(review): presumably Run returns a non-nil error if the
// context expires before a shutdown is triggered — confirm.
ctx, cancel := context.WithTimeout(context.Background(), 5*time.Second)
|
||||
defer cancel()
|
||||
if err := d.Run(ctx); err != nil {
|
||||
t.Fatalf("expected sustained-on-battery shutdown path to complete, got %v", err)
|
||||
}
|
||||
}
|
||||
|
||||
// TestForwardShutdownSucceedsWithSSHShim runs one orchestration or CLI step.
|
||||
// Signature: TestForwardShutdownSucceedsWithSSHShim(t *testing.T).
|
||||
// Why: covers forward-shutdown SSH execution path.
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user