fix: make blind healing opt-in
This commit is contained in:
parent
f0f204b777
commit
060e09336e
18
AGENTS.md
18
AGENTS.md
@ -594,5 +594,21 @@ camera playout loop woke up after waiting for the audio master.
|
||||
- [x] Keep polling and timing inbound camera packets while video waits for its due time.
|
||||
- [x] Coalesce pending video to the freshest packet during those waits so the server does not build a stale video backlog.
|
||||
- [x] Add regression coverage that video timing is recorded at enqueue/drain time before scheduler waits.
|
||||
- [ ] Re-run the probe-calibrate-confirm flow; `planner_server_receive_abs_skew_p95_ms` should fall if this was the receive-side scheduling leak.
|
||||
- [x] Re-run the probe-calibrate-confirm flow; `planner_server_receive_abs_skew_p95_ms` should fall if this was the receive-side scheduling leak.
|
||||
- [ ] If receive p95 remains high after this, inspect actual gRPC/HTTP2 stream delivery and OS/network scheduling rather than static calibration.
|
||||
|
||||
## 0.17.32 Blind Heal Opt-In And Stability Checklist
|
||||
|
||||
Context: the 0.17.31 mirrored run confirmed the receive-side drain worked: client send and server
|
||||
receive p95 both stayed near 50ms. The run still failed at probe pairing, and the server-side blind
|
||||
healer silently changed calibration during the probe run because it was enabled by default and allowed
|
||||
sink handoff p95 near 240ms.
|
||||
|
||||
- [x] Treat the 0.17.31 run as confirmation that the server receive scheduling leak is fixed.
|
||||
- [x] Default runtime blind healing to disabled so probe-calibration runs cannot be contaminated by hidden server nudges.
|
||||
- [x] Require explicit server-side `LESAVKA_UPSTREAM_BLIND_HEAL=1` before blind healing mutates transient calibration.
|
||||
- [x] Tighten the blind-heal sink handoff p95 gate from 250ms to 120ms before applying runtime nudges.
|
||||
- [x] Align mirrored-run root-cause summaries with the stricter sink handoff stability threshold.
|
||||
- [x] Add regression coverage for default-disabled blind healing and noisy sink-handoff refusal.
|
||||
- [ ] Re-run the normal probe-calibrate-confirm flow; `calibration_source` should remain non-blind unless the server was explicitly started with blind healing.
|
||||
- [ ] If the probe still produces only one or two visual events while blind metrics stay stable, move the next fix to stimulus/browser/probe detection instead of transport timing.
|
||||
|
||||
6
Cargo.lock
generated
6
Cargo.lock
generated
@ -1652,7 +1652,7 @@ checksum = "09edd9e8b54e49e587e4f6295a7d29c3ea94d469cb40ab8ca70b288248a81db2"
|
||||
|
||||
[[package]]
|
||||
name = "lesavka_client"
|
||||
version = "0.17.31"
|
||||
version = "0.17.32"
|
||||
dependencies = [
|
||||
"anyhow",
|
||||
"async-stream",
|
||||
@ -1686,7 +1686,7 @@ dependencies = [
|
||||
|
||||
[[package]]
|
||||
name = "lesavka_common"
|
||||
version = "0.17.31"
|
||||
version = "0.17.32"
|
||||
dependencies = [
|
||||
"anyhow",
|
||||
"base64",
|
||||
@ -1698,7 +1698,7 @@ dependencies = [
|
||||
|
||||
[[package]]
|
||||
name = "lesavka_server"
|
||||
version = "0.17.31"
|
||||
version = "0.17.32"
|
||||
dependencies = [
|
||||
"anyhow",
|
||||
"base64",
|
||||
|
||||
@ -4,7 +4,7 @@ path = "src/main.rs"
|
||||
|
||||
[package]
|
||||
name = "lesavka_client"
|
||||
version = "0.17.31"
|
||||
version = "0.17.32"
|
||||
edition = "2024"
|
||||
|
||||
[dependencies]
|
||||
|
||||
@ -1,6 +1,6 @@
|
||||
[package]
|
||||
name = "lesavka_common"
|
||||
version = "0.17.31"
|
||||
version = "0.17.32"
|
||||
edition = "2024"
|
||||
build = "build.rs"
|
||||
|
||||
|
||||
@ -95,6 +95,10 @@ export LESAVKA_SYNC_CONFIRM_AFTER_CALIBRATION
|
||||
export LESAVKA_SYNC_CONFIRMATION_SEGMENTS
|
||||
export LESAVKA_SYNC_REQUIRE_CONFIRMATION_PASS
|
||||
|
||||
if [[ "${LESAVKA_UPSTREAM_BLIND_HEAL:-0}" == "1" && "${LESAVKA_SERVER_ADDR}" == "auto" ]]; then
|
||||
echo "==> note: LESAVKA_UPSTREAM_BLIND_HEAL is server-side; set it where lesavka-server runs to enable blind healing"
|
||||
fi
|
||||
|
||||
cleanup() {
|
||||
set +e
|
||||
[[ -n "${CLIENT_PID}" ]] && kill "${CLIENT_PID}" >/dev/null 2>&1
|
||||
@ -926,7 +930,7 @@ def diagnose_segment(row):
|
||||
"Packets are reaching the server but one sink is missing its due time.",
|
||||
"Tune server scheduler/sink handoff and avoid trusting offset-only fixes until lateness falls.",
|
||||
)
|
||||
if over(row, "planner_sink_handoff_abs_skew_p95_ms_after", 250):
|
||||
if over(row, "planner_sink_handoff_abs_skew_p95_ms_after", 120):
|
||||
add_finding(
|
||||
findings,
|
||||
"server_sink_handoff",
|
||||
@ -957,7 +961,7 @@ def diagnose_segment(row):
|
||||
and not over(row, "planner_microphone_client_queue_age_p95_ms_after", 150)
|
||||
and not over(row, "planner_camera_sink_late_p95_ms_after", 120)
|
||||
and not over(row, "planner_microphone_sink_late_p95_ms_after", 120)
|
||||
and not over(row, "planner_sink_handoff_abs_skew_p95_ms_after", 250)
|
||||
and not over(row, "planner_sink_handoff_abs_skew_p95_ms_after", 120)
|
||||
)
|
||||
if stable_blind_metrics and abs_exceeds(row, "planner_sink_handoff_skew_ms_after", 35):
|
||||
add_finding(
|
||||
@ -1302,7 +1306,7 @@ root_cause = {
|
||||
"max_server_receive_abs_skew_p95_ms": 250,
|
||||
"max_client_queue_age_p95_ms": 150,
|
||||
"max_sink_late_p95_ms": 120,
|
||||
"max_sink_handoff_abs_skew_p95_ms": 250,
|
||||
"max_sink_handoff_abs_skew_p95_ms": 120,
|
||||
"stable_sink_handoff_deadband_ms": 35,
|
||||
"acceptable_probe_p95_abs_skew_ms": 80,
|
||||
},
|
||||
|
||||
@ -10,7 +10,7 @@ bench = false
|
||||
|
||||
[package]
|
||||
name = "lesavka_server"
|
||||
version = "0.17.31"
|
||||
version = "0.17.32"
|
||||
edition = "2024"
|
||||
autobins = false
|
||||
|
||||
|
||||
@ -186,7 +186,7 @@ impl BlindHealConfig {
|
||||
#[cfg(not(coverage))]
|
||||
fn from_env() -> Self {
|
||||
Self {
|
||||
enabled: env_bool("LESAVKA_UPSTREAM_BLIND_HEAL", true),
|
||||
enabled: env_bool("LESAVKA_UPSTREAM_BLIND_HEAL", false),
|
||||
target: match std::env::var("LESAVKA_UPSTREAM_BLIND_HEAL_TARGET")
|
||||
.unwrap_or_else(|_| "video".to_string())
|
||||
.trim()
|
||||
@ -200,7 +200,7 @@ impl BlindHealConfig {
|
||||
deadband_ms: env_f64("LESAVKA_UPSTREAM_BLIND_HEAL_DEADBAND_MS", 35.0),
|
||||
max_handoff_abs_p95_ms: env_f64(
|
||||
"LESAVKA_UPSTREAM_BLIND_HEAL_MAX_HANDOFF_P95_MS",
|
||||
250.0,
|
||||
120.0,
|
||||
),
|
||||
max_client_send_abs_p95_ms: env_f64(
|
||||
"LESAVKA_UPSTREAM_BLIND_HEAL_MAX_CLIENT_SEND_P95_MS",
|
||||
@ -278,6 +278,7 @@ fn env_f64(name: &str, default: f64) -> f64 {
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use super::*;
|
||||
use serial_test::serial;
|
||||
|
||||
fn config() -> BlindHealConfig {
|
||||
BlindHealConfig {
|
||||
@ -285,7 +286,7 @@ mod tests {
|
||||
target: BlindHealTarget::Video,
|
||||
min_samples: 30,
|
||||
deadband_ms: 35.0,
|
||||
max_handoff_abs_p95_ms: 250.0,
|
||||
max_handoff_abs_p95_ms: 120.0,
|
||||
max_client_send_abs_p95_ms: 250.0,
|
||||
max_server_receive_abs_p95_ms: 250.0,
|
||||
max_queue_age_p95_ms: 150.0,
|
||||
@ -337,6 +338,22 @@ mod tests {
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(not(coverage))]
|
||||
#[test]
|
||||
#[serial]
|
||||
fn blind_healer_env_defaults_disabled_and_accepts_opt_in() {
|
||||
temp_env::with_var_unset("LESAVKA_UPSTREAM_BLIND_HEAL", || {
|
||||
assert!(
|
||||
!BlindHealConfig::from_env().enabled,
|
||||
"blind healing must not mutate calibration unless the server opted in"
|
||||
);
|
||||
});
|
||||
|
||||
temp_env::with_var("LESAVKA_UPSTREAM_BLIND_HEAL", Some("1"), || {
|
||||
assert!(BlindHealConfig::from_env().enabled);
|
||||
});
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn blind_healer_nudges_video_opposite_sink_handoff_skew() {
|
||||
let decision = evaluate_blind_heal_snapshot(&snapshot(), config());
|
||||
@ -367,6 +384,13 @@ mod tests {
|
||||
evaluate_blind_heal_snapshot(&noisy_network, config()),
|
||||
BlindHealDecision::Wait("server-receive-p95-unstable")
|
||||
);
|
||||
|
||||
let mut noisy_handoff = snapshot();
|
||||
noisy_handoff.sink_handoff_abs_skew_p95_ms = Some(241.0);
|
||||
assert_eq!(
|
||||
evaluate_blind_heal_snapshot(&noisy_handoff, config()),
|
||||
BlindHealDecision::Wait("sink-handoff-p95-unstable")
|
||||
);
|
||||
}
|
||||
|
||||
#[test]
|
||||
|
||||
@ -200,6 +200,7 @@ fn mirrored_sync_script_uses_real_client_capture_path() {
|
||||
"provisional calibration not saved",
|
||||
"calibration apply refused: ${calibration_decision_note}",
|
||||
"calibrate \"${calibration_apply_audio_delta_us}\" \"${calibration_apply_video_delta_us}\"",
|
||||
"LESAVKA_UPSTREAM_BLIND_HEAL is server-side",
|
||||
"calibration-save-default",
|
||||
"print_upstream_sync_state \"after mirrored run\"",
|
||||
"print_upstream_calibration_state \"after mirrored run\"",
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user