recovery(ananke): trigger earlier on battery outages
This commit is contained in:
parent
1f656de5df
commit
b7f7486350
@@ -175,6 +175,7 @@ ups:
|
|||||||
target: pyrphoros@localhost
|
target: pyrphoros@localhost
|
||||||
poll_seconds: 5
|
poll_seconds: 5
|
||||||
runtime_safety_factor: 1.25
|
runtime_safety_factor: 1.25
|
||||||
|
on_battery_grace_seconds: 90
|
||||||
debounce_count: 3
|
debounce_count: 3
|
||||||
telemetry_timeout_seconds: 90
|
telemetry_timeout_seconds: 90
|
||||||
coordination:
|
coordination:
|
||||||
|
|||||||
@@ -122,7 +122,47 @@ startup:
|
|||||||
- titan-0b
|
- titan-0b
|
||||||
- titan-0c
|
- titan-0c
|
||||||
required_node_labels:
|
required_node_labels:
|
||||||
|
titan-04:
|
||||||
|
node-role.kubernetes.io/worker: "true"
|
||||||
|
longhorn-host: "true"
|
||||||
|
titan-05:
|
||||||
|
node-role.kubernetes.io/worker: "true"
|
||||||
|
longhorn-host: "true"
|
||||||
|
titan-06:
|
||||||
|
node-role.kubernetes.io/worker: "true"
|
||||||
|
longhorn-host: "true"
|
||||||
|
titan-07:
|
||||||
|
node-role.kubernetes.io/worker: "true"
|
||||||
|
longhorn-host: "true"
|
||||||
|
titan-08:
|
||||||
|
node-role.kubernetes.io/worker: "true"
|
||||||
|
longhorn-host: "true"
|
||||||
|
titan-11:
|
||||||
|
node-role.kubernetes.io/worker: "true"
|
||||||
|
longhorn-host: "true"
|
||||||
|
titan-12:
|
||||||
|
node-role.kubernetes.io/worker: "true"
|
||||||
|
longhorn-host: "true"
|
||||||
|
titan-13:
|
||||||
|
node-role.kubernetes.io/worker: "true"
|
||||||
|
longhorn-host: "true"
|
||||||
|
titan-14:
|
||||||
|
node-role.kubernetes.io/worker: "true"
|
||||||
|
longhorn-host: "true"
|
||||||
|
titan-15:
|
||||||
|
node-role.kubernetes.io/worker: "true"
|
||||||
|
longhorn-host: "true"
|
||||||
|
titan-17:
|
||||||
|
node-role.kubernetes.io/worker: "true"
|
||||||
|
longhorn-host: "true"
|
||||||
|
titan-18:
|
||||||
|
node-role.kubernetes.io/worker: "true"
|
||||||
|
longhorn-host: "true"
|
||||||
|
titan-19:
|
||||||
|
node-role.kubernetes.io/worker: "true"
|
||||||
|
longhorn-host: "true"
|
||||||
titan-09:
|
titan-09:
|
||||||
|
node-role.kubernetes.io/worker: "true"
|
||||||
ananke.bstein.dev/harbor-bootstrap: "true"
|
ananke.bstein.dev/harbor-bootstrap: "true"
|
||||||
require_time_sync: true
|
require_time_sync: true
|
||||||
time_sync_wait_seconds: 240
|
time_sync_wait_seconds: 240
|
||||||
@@ -266,6 +306,7 @@ ups:
|
|||||||
target: statera@localhost
|
target: statera@localhost
|
||||||
poll_seconds: 5
|
poll_seconds: 5
|
||||||
runtime_safety_factor: 1.25
|
runtime_safety_factor: 1.25
|
||||||
|
on_battery_grace_seconds: 90
|
||||||
debounce_count: 3
|
debounce_count: 3
|
||||||
telemetry_timeout_seconds: 90
|
telemetry_timeout_seconds: 90
|
||||||
coordination:
|
coordination:
|
||||||
|
|||||||
@@ -122,7 +122,47 @@ startup:
|
|||||||
- titan-0b
|
- titan-0b
|
||||||
- titan-0c
|
- titan-0c
|
||||||
required_node_labels:
|
required_node_labels:
|
||||||
|
titan-04:
|
||||||
|
node-role.kubernetes.io/worker: "true"
|
||||||
|
longhorn-host: "true"
|
||||||
|
titan-05:
|
||||||
|
node-role.kubernetes.io/worker: "true"
|
||||||
|
longhorn-host: "true"
|
||||||
|
titan-06:
|
||||||
|
node-role.kubernetes.io/worker: "true"
|
||||||
|
longhorn-host: "true"
|
||||||
|
titan-07:
|
||||||
|
node-role.kubernetes.io/worker: "true"
|
||||||
|
longhorn-host: "true"
|
||||||
|
titan-08:
|
||||||
|
node-role.kubernetes.io/worker: "true"
|
||||||
|
longhorn-host: "true"
|
||||||
|
titan-11:
|
||||||
|
node-role.kubernetes.io/worker: "true"
|
||||||
|
longhorn-host: "true"
|
||||||
|
titan-12:
|
||||||
|
node-role.kubernetes.io/worker: "true"
|
||||||
|
longhorn-host: "true"
|
||||||
|
titan-13:
|
||||||
|
node-role.kubernetes.io/worker: "true"
|
||||||
|
longhorn-host: "true"
|
||||||
|
titan-14:
|
||||||
|
node-role.kubernetes.io/worker: "true"
|
||||||
|
longhorn-host: "true"
|
||||||
|
titan-15:
|
||||||
|
node-role.kubernetes.io/worker: "true"
|
||||||
|
longhorn-host: "true"
|
||||||
|
titan-17:
|
||||||
|
node-role.kubernetes.io/worker: "true"
|
||||||
|
longhorn-host: "true"
|
||||||
|
titan-18:
|
||||||
|
node-role.kubernetes.io/worker: "true"
|
||||||
|
longhorn-host: "true"
|
||||||
|
titan-19:
|
||||||
|
node-role.kubernetes.io/worker: "true"
|
||||||
|
longhorn-host: "true"
|
||||||
titan-09:
|
titan-09:
|
||||||
|
node-role.kubernetes.io/worker: "true"
|
||||||
ananke.bstein.dev/harbor-bootstrap: "true"
|
ananke.bstein.dev/harbor-bootstrap: "true"
|
||||||
require_time_sync: true
|
require_time_sync: true
|
||||||
time_sync_wait_seconds: 240
|
time_sync_wait_seconds: 240
|
||||||
@@ -266,6 +306,7 @@ ups:
|
|||||||
target: pyrphoros@localhost
|
target: pyrphoros@localhost
|
||||||
poll_seconds: 5
|
poll_seconds: 5
|
||||||
runtime_safety_factor: 1.25
|
runtime_safety_factor: 1.25
|
||||||
|
on_battery_grace_seconds: 90
|
||||||
debounce_count: 3
|
debounce_count: 3
|
||||||
telemetry_timeout_seconds: 90
|
telemetry_timeout_seconds: 90
|
||||||
coordination:
|
coordination:
|
||||||
|
|||||||
@@ -141,6 +141,7 @@ type UPS struct {
|
|||||||
Targets []UPSTarget `yaml:"targets"`
|
Targets []UPSTarget `yaml:"targets"`
|
||||||
PollSeconds int `yaml:"poll_seconds"`
|
PollSeconds int `yaml:"poll_seconds"`
|
||||||
RuntimeSafetyFactor float64 `yaml:"runtime_safety_factor"`
|
RuntimeSafetyFactor float64 `yaml:"runtime_safety_factor"`
|
||||||
|
OnBatteryGraceSeconds int `yaml:"on_battery_grace_seconds"`
|
||||||
DebounceCount int `yaml:"debounce_count"`
|
DebounceCount int `yaml:"debounce_count"`
|
||||||
TelemetryTimeoutSeconds int `yaml:"telemetry_timeout_seconds"`
|
TelemetryTimeoutSeconds int `yaml:"telemetry_timeout_seconds"`
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -326,6 +326,9 @@ func (c Config) Validate() error {
|
|||||||
if c.UPS.Provider == "" {
|
if c.UPS.Provider == "" {
|
||||||
return fmt.Errorf("config.ups.provider must not be empty when ups is enabled")
|
return fmt.Errorf("config.ups.provider must not be empty when ups is enabled")
|
||||||
}
|
}
|
||||||
|
if c.UPS.OnBatteryGraceSeconds < 0 {
|
||||||
|
return fmt.Errorf("config.ups.on_battery_grace_seconds must be >= 0")
|
||||||
|
}
|
||||||
if c.UPS.Target == "" && len(c.UPS.Targets) == 0 {
|
if c.UPS.Target == "" && len(c.UPS.Targets) == 0 {
|
||||||
return fmt.Errorf("config.ups.target or config.ups.targets must be set when ups is enabled")
|
return fmt.Errorf("config.ups.target or config.ups.targets must be set when ups is enabled")
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -94,6 +94,7 @@ func TestValidateRejectsInvalidFieldsMatrix(t *testing.T) {
|
|||||||
{"bad_empty_vault_key_file", func(c *Config) { c.Startup.VaultUnsealKeyFile = "" }},
|
{"bad_empty_vault_key_file", func(c *Config) { c.Startup.VaultUnsealKeyFile = "" }},
|
||||||
{"bad_ssh_port_low", func(c *Config) { c.SSHPort = 0 }},
|
{"bad_ssh_port_low", func(c *Config) { c.SSHPort = 0 }},
|
||||||
{"bad_ups_provider", func(c *Config) { c.UPS.Enabled = true; c.UPS.Provider = "" }},
|
{"bad_ups_provider", func(c *Config) { c.UPS.Enabled = true; c.UPS.Provider = "" }},
|
||||||
|
{"bad_ups_on_battery_grace_negative", func(c *Config) { c.UPS.Enabled = true; c.UPS.OnBatteryGraceSeconds = -1 }},
|
||||||
{"bad_ups_target_empty", func(c *Config) { c.UPS.Enabled = true; c.UPS.Provider = "nut"; c.UPS.Target = ""; c.UPS.Targets = nil }},
|
{"bad_ups_target_empty", func(c *Config) { c.UPS.Enabled = true; c.UPS.Provider = "nut"; c.UPS.Target = ""; c.UPS.Targets = nil }},
|
||||||
{"bad_ups_targets_item_empty", func(c *Config) {
|
{"bad_ups_targets_item_empty", func(c *Config) {
|
||||||
c.UPS.Enabled = true
|
c.UPS.Enabled = true
|
||||||
|
|||||||
@@ -92,6 +92,7 @@ func (d *Daemon) Run(ctx context.Context) error {
|
|||||||
|
|
||||||
lastGood := map[string]time.Time{}
|
lastGood := map[string]time.Time{}
|
||||||
lastOnBattery := map[string]bool{}
|
lastOnBattery := map[string]bool{}
|
||||||
|
onBatterySince := map[string]time.Time{}
|
||||||
breachCount := map[string]int{}
|
breachCount := map[string]int{}
|
||||||
for _, t := range d.targets {
|
for _, t := range d.targets {
|
||||||
lastGood[t.Name] = time.Now()
|
lastGood[t.Name] = time.Now()
|
||||||
@@ -131,17 +132,42 @@ func (d *Daemon) Run(ctx context.Context) error {
|
|||||||
}
|
}
|
||||||
|
|
||||||
lastGood[target.Name] = time.Now()
|
lastGood[target.Name] = time.Now()
|
||||||
|
wasOnBattery := lastOnBattery[target.Name]
|
||||||
|
if sample.OnBattery {
|
||||||
|
if !wasOnBattery || onBatterySince[target.Name].IsZero() {
|
||||||
|
onBatterySince[target.Name] = time.Now()
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
onBatterySince[target.Name] = time.Time{}
|
||||||
|
}
|
||||||
lastOnBattery[target.Name] = sample.OnBattery
|
lastOnBattery[target.Name] = sample.OnBattery
|
||||||
|
|
||||||
trigger := sample.LowBattery || (sample.OnBattery && sample.RuntimeSeconds > 0 && sample.RuntimeSeconds <= threshold)
|
onBatteryElapsed := 0
|
||||||
|
if sample.OnBattery && !onBatterySince[target.Name].IsZero() {
|
||||||
|
onBatteryElapsed = int(time.Since(onBatterySince[target.Name]).Seconds())
|
||||||
|
}
|
||||||
|
|
||||||
|
trigger := false
|
||||||
|
triggerReason := ""
|
||||||
|
switch {
|
||||||
|
case sample.LowBattery:
|
||||||
|
trigger = true
|
||||||
|
triggerReason = fmt.Sprintf("ups-low-battery target=%s status=%s", target.Name, sample.RawStatus)
|
||||||
|
case sample.OnBattery && sample.RuntimeSeconds > 0 && sample.RuntimeSeconds <= threshold:
|
||||||
|
trigger = true
|
||||||
|
triggerReason = fmt.Sprintf("ups-threshold target=%s runtime=%ds threshold=%ds status=%s", target.Name, sample.RuntimeSeconds, threshold, sample.RawStatus)
|
||||||
|
case sample.OnBattery && d.cfg.UPS.OnBatteryGraceSeconds > 0 && onBatteryElapsed >= d.cfg.UPS.OnBatteryGraceSeconds:
|
||||||
|
trigger = true
|
||||||
|
triggerReason = fmt.Sprintf("ups-on-battery target=%s elapsed=%ds grace=%ds status=%s", target.Name, onBatteryElapsed, d.cfg.UPS.OnBatteryGraceSeconds, sample.RawStatus)
|
||||||
|
}
|
||||||
if trigger {
|
if trigger {
|
||||||
breachCount[target.Name]++
|
breachCount[target.Name]++
|
||||||
} else {
|
} else {
|
||||||
breachCount[target.Name] = 0
|
breachCount[target.Name] = 0
|
||||||
}
|
}
|
||||||
|
|
||||||
d.log.Printf("ups target=%s status=%s on_battery=%t runtime_s=%d threshold_s=%d budget_s=%d trigger=%t breach=%d",
|
d.log.Printf("ups target=%s status=%s on_battery=%t on_battery_s=%d grace_s=%d runtime_s=%d threshold_s=%d budget_s=%d trigger=%t breach=%d",
|
||||||
target.Name, sample.RawStatus, sample.OnBattery, sample.RuntimeSeconds, threshold, budget, trigger, breachCount[target.Name])
|
target.Name, sample.RawStatus, sample.OnBattery, onBatteryElapsed, d.cfg.UPS.OnBatteryGraceSeconds, sample.RuntimeSeconds, threshold, budget, trigger, breachCount[target.Name])
|
||||||
|
|
||||||
d.exporter.UpdateSample(metrics.Sample{
|
d.exporter.UpdateSample(metrics.Sample{
|
||||||
Name: target.Name,
|
Name: target.Name,
|
||||||
@@ -160,8 +186,7 @@ func (d *Daemon) Run(ctx context.Context) error {
|
|||||||
})
|
})
|
||||||
|
|
||||||
if breachCount[target.Name] >= debounce {
|
if breachCount[target.Name] >= debounce {
|
||||||
reason := fmt.Sprintf("ups-threshold target=%s runtime=%ds threshold=%ds status=%s", target.Name, sample.RuntimeSeconds, threshold, sample.RawStatus)
|
return d.triggerShutdown(ctx, triggerReason)
|
||||||
return d.triggerShutdown(ctx, reason)
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -165,6 +165,50 @@ func TestDaemonRunTriggersShutdownOnTelemetryTimeout(t *testing.T) {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
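// TestDaemonRunTriggersShutdownAfterOnBatteryGrace verifies that Run completes the shutdown path once a UPS target stays on battery past on_battery_grace_seconds, even when reported runtime is high.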
|
||||||
|
// Signature: TestDaemonRunTriggersShutdownAfterOnBatteryGrace(t *testing.T).
|
||||||
|
// Why: covers the sustained-on-battery trigger so short runtime estimates are not
|
||||||
|
// the only path to a graceful shutdown during abrupt power loss.
|
||||||
|
func TestDaemonRunTriggersShutdownAfterOnBatteryGrace(t *testing.T) {
|
||||||
|
stateDir := t.TempDir()
|
||||||
|
orch := newDaemonTestOrchestrator(t, stateDir)
|
||||||
|
d := &Daemon{
|
||||||
|
cfg: config.Config{
|
||||||
|
UPS: config.UPS{
|
||||||
|
Enabled: true,
|
||||||
|
PollSeconds: 1,
|
||||||
|
DebounceCount: 1,
|
||||||
|
RuntimeSafetyFactor: 1.0,
|
||||||
|
OnBatteryGraceSeconds: 1,
|
||||||
|
},
|
||||||
|
State: config.State{
|
||||||
|
IntentPath: filepath.Join(stateDir, "intent.json"),
|
||||||
|
},
|
||||||
|
Shutdown: config.Shutdown{
|
||||||
|
EmergencySkipDrain: true,
|
||||||
|
EmergencySkipEtcd: true,
|
||||||
|
},
|
||||||
|
},
|
||||||
|
orch: orch,
|
||||||
|
targets: []Target{
|
||||||
|
{
|
||||||
|
Name: "Pyrphoros",
|
||||||
|
Target: "pyrphoros@localhost",
|
||||||
|
Provider: &daemonFakeProvider{
|
||||||
|
samples: []ups.Sample{{OnBattery: true, LowBattery: false, RuntimeSeconds: 9999, RawStatus: "OB"}},
|
||||||
|
},
|
||||||
|
},
|
||||||
|
},
|
||||||
|
log: log.New(io.Discard, "", 0),
|
||||||
|
exporter: metrics.New(),
|
||||||
|
}
|
||||||
|
ctx, cancel := context.WithTimeout(context.Background(), 5*time.Second)
|
||||||
|
defer cancel()
|
||||||
|
if err := d.Run(ctx); err != nil {
|
||||||
|
t.Fatalf("expected sustained-on-battery shutdown path to complete, got %v", err)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
// TestForwardShutdownSucceedsWithSSHShim runs one orchestration or CLI step.
|
// TestForwardShutdownSucceedsWithSSHShim runs one orchestration or CLI step.
|
||||||
// Signature: TestForwardShutdownSucceedsWithSSHShim(t *testing.T).
|
// Signature: TestForwardShutdownSucceedsWithSSHShim(t *testing.T).
|
||||||
// Why: covers forward-shutdown SSH execution path.
|
// Why: covers forward-shutdown SSH execution path.
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user