diff --git a/AGENTS.md b/AGENTS.md index 7c42ab6..aeff035 100644 --- a/AGENTS.md +++ b/AGENTS.md @@ -594,5 +594,21 @@ camera playout loop woke up after waiting for the audio master. - [x] Keep polling and timing inbound camera packets while video waits for its due time. - [x] Coalesce pending video to the freshest packet during those waits so the server does not build a stale video backlog. - [x] Add regression coverage that video timing is recorded at enqueue/drain time before scheduler waits. -- [ ] Re-run the probe-calibrate-confirm flow; `planner_server_receive_abs_skew_p95_ms` should fall if this was the receive-side scheduling leak. +- [x] Re-run the probe-calibrate-confirm flow; `planner_server_receive_abs_skew_p95_ms` should fall if this was the receive-side scheduling leak. - [ ] If receive p95 remains high after this, inspect actual gRPC/HTTP2 stream delivery and OS/network scheduling rather than static calibration. + +## 0.17.32 Blind Heal Opt-In And Stability Checklist + +Context: the 0.17.31 mirrored run confirmed the receive-side drain worked: client send and server +receive p95 both stayed near 50ms. The run still failed as probe pairing, and the server-side blind +healer silently changed calibration during the probe run because it was enabled by default and allowed +sink handoff p95 near 240ms. + +- [x] Treat the 0.17.31 run as confirmation that the server receive scheduling leak is fixed. +- [x] Default runtime blind healing to disabled so probe-calibration runs cannot be contaminated by hidden server nudges. +- [x] Require explicit server-side `LESAVKA_UPSTREAM_BLIND_HEAL=1` before blind healing mutates transient calibration. +- [x] Tighten the blind-heal sink handoff p95 gate from 250ms to 120ms before applying runtime nudges. +- [x] Align mirrored-run root-cause summaries with the stricter sink handoff stability threshold. +- [x] Add regression coverage for default-disabled blind healing and noisy sink-handoff refusal. +- [ ] Re-run the normal probe-calibrate-confirm flow; `calibration_source` should remain non-blind unless the server was explicitly started with blind healing. +- [ ] If the probe still produces only one or two visual events while blind metrics stay stable, move the next fix to stimulus/browser/probe detection instead of transport timing. diff --git a/Cargo.lock b/Cargo.lock index cdc6e55..f67777f 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1652,7 +1652,7 @@ checksum = "09edd9e8b54e49e587e4f6295a7d29c3ea94d469cb40ab8ca70b288248a81db2" [[package]] name = "lesavka_client" -version = "0.17.31" +version = "0.17.32" dependencies = [ "anyhow", "async-stream", @@ -1686,7 +1686,7 @@ dependencies = [ [[package]] name = "lesavka_common" -version = "0.17.31" +version = "0.17.32" dependencies = [ "anyhow", "base64", @@ -1698,7 +1698,7 @@ dependencies = [ [[package]] name = "lesavka_server" -version = "0.17.31" +version = "0.17.32" dependencies = [ "anyhow", "base64", diff --git a/client/Cargo.toml b/client/Cargo.toml index a108dbf..c3f4abb 100644 --- a/client/Cargo.toml +++ b/client/Cargo.toml @@ -4,7 +4,7 @@ path = "src/main.rs" [package] name = "lesavka_client" -version = "0.17.31" +version = "0.17.32" edition = "2024" [dependencies] diff --git a/common/Cargo.toml b/common/Cargo.toml index f844420..6e16ec8 100644 --- a/common/Cargo.toml +++ b/common/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "lesavka_common" -version = "0.17.31" +version = "0.17.32" edition = "2024" build = "build.rs" diff --git a/scripts/manual/run_upstream_mirrored_av_sync.sh b/scripts/manual/run_upstream_mirrored_av_sync.sh index 4d06513..9ffc976 100755 --- a/scripts/manual/run_upstream_mirrored_av_sync.sh +++ b/scripts/manual/run_upstream_mirrored_av_sync.sh @@ -95,6 +95,10 @@ export LESAVKA_SYNC_CONFIRM_AFTER_CALIBRATION export LESAVKA_SYNC_CONFIRMATION_SEGMENTS export LESAVKA_SYNC_REQUIRE_CONFIRMATION_PASS +if [[ "${LESAVKA_UPSTREAM_BLIND_HEAL:-0}" == "1" && "${LESAVKA_SERVER_ADDR}" == "auto" ]]; then + echo "==> note: LESAVKA_UPSTREAM_BLIND_HEAL is server-side; set it where lesavka-server runs to enable blind healing" +fi + cleanup() { set +e [[ -n "${CLIENT_PID}" ]] && kill "${CLIENT_PID}" >/dev/null 2>&1 @@ -926,7 +930,7 @@ def diagnose_segment(row): "Packets are reaching the server but one sink is missing its due time.", "Tune server scheduler/sink handoff and avoid trusting offset-only fixes until lateness falls.", ) - if over(row, "planner_sink_handoff_abs_skew_p95_ms_after", 250): + if over(row, "planner_sink_handoff_abs_skew_p95_ms_after", 120): add_finding( findings, "server_sink_handoff", @@ -957,7 +961,7 @@ def diagnose_segment(row): and not over(row, "planner_microphone_client_queue_age_p95_ms_after", 150) and not over(row, "planner_camera_sink_late_p95_ms_after", 120) and not over(row, "planner_microphone_sink_late_p95_ms_after", 120) - and not over(row, "planner_sink_handoff_abs_skew_p95_ms_after", 250) + and not over(row, "planner_sink_handoff_abs_skew_p95_ms_after", 120) ) if stable_blind_metrics and abs_exceeds(row, "planner_sink_handoff_skew_ms_after", 35): add_finding( @@ -1302,7 +1306,7 @@ root_cause = { "max_server_receive_abs_skew_p95_ms": 250, "max_client_queue_age_p95_ms": 150, "max_sink_late_p95_ms": 120, - "max_sink_handoff_abs_skew_p95_ms": 250, + "max_sink_handoff_abs_skew_p95_ms": 120, "stable_sink_handoff_deadband_ms": 35, "acceptable_probe_p95_abs_skew_ms": 80, }, diff --git a/server/Cargo.toml b/server/Cargo.toml index 2b9b70f..68340c6 100644 --- a/server/Cargo.toml +++ b/server/Cargo.toml @@ -10,7 +10,7 @@ bench = false [package] name = "lesavka_server" -version = "0.17.31" +version = "0.17.32" edition = "2024" autobins = false diff --git a/server/src/blind_healer.rs b/server/src/blind_healer.rs index 2f31b81..70c3966 100644 --- a/server/src/blind_healer.rs +++ b/server/src/blind_healer.rs @@ -186,7 +186,7 @@ impl BlindHealConfig { #[cfg(not(coverage))] fn from_env() -> Self { Self { - enabled: env_bool("LESAVKA_UPSTREAM_BLIND_HEAL", true), + enabled: env_bool("LESAVKA_UPSTREAM_BLIND_HEAL", false), target: match std::env::var("LESAVKA_UPSTREAM_BLIND_HEAL_TARGET") .unwrap_or_else(|_| "video".to_string()) .trim() @@ -200,7 +200,7 @@ impl BlindHealConfig { deadband_ms: env_f64("LESAVKA_UPSTREAM_BLIND_HEAL_DEADBAND_MS", 35.0), max_handoff_abs_p95_ms: env_f64( "LESAVKA_UPSTREAM_BLIND_HEAL_MAX_HANDOFF_P95_MS", - 250.0, + 120.0, ), max_client_send_abs_p95_ms: env_f64( "LESAVKA_UPSTREAM_BLIND_HEAL_MAX_CLIENT_SEND_P95_MS", @@ -278,6 +278,7 @@ fn env_f64(name: &str, default: f64) -> f64 { #[cfg(test)] mod tests { use super::*; + use serial_test::serial; fn config() -> BlindHealConfig { BlindHealConfig { @@ -285,7 +286,7 @@ mod tests { target: BlindHealTarget::Video, min_samples: 30, deadband_ms: 35.0, - max_handoff_abs_p95_ms: 250.0, + max_handoff_abs_p95_ms: 120.0, max_client_send_abs_p95_ms: 250.0, max_server_receive_abs_p95_ms: 250.0, max_queue_age_p95_ms: 150.0, @@ -337,6 +338,22 @@ mod tests { } } + #[cfg(not(coverage))] + #[test] + #[serial] + fn blind_healer_env_defaults_disabled_and_accepts_opt_in() { + temp_env::with_var_unset("LESAVKA_UPSTREAM_BLIND_HEAL", || { + assert!( + !BlindHealConfig::from_env().enabled, + "blind healing must not mutate calibration unless the server opted in" + ); + }); + + temp_env::with_var("LESAVKA_UPSTREAM_BLIND_HEAL", Some("1"), || { + assert!(BlindHealConfig::from_env().enabled); + }); + } + #[test] fn blind_healer_nudges_video_opposite_sink_handoff_skew() { let decision = evaluate_blind_heal_snapshot(&snapshot(), config()); @@ -367,6 +384,13 @@ mod tests { evaluate_blind_heal_snapshot(&noisy_network, config()), BlindHealDecision::Wait("server-receive-p95-unstable") ); + + let mut noisy_handoff = snapshot(); + noisy_handoff.sink_handoff_abs_skew_p95_ms = Some(241.0); + assert_eq!( + evaluate_blind_heal_snapshot(&noisy_handoff, config()), + BlindHealDecision::Wait("sink-handoff-p95-unstable") + ); } #[test] diff --git a/testing/tests/client_manual_sync_script_contract.rs b/testing/tests/client_manual_sync_script_contract.rs index f1d102d..13b1c40 100644 --- a/testing/tests/client_manual_sync_script_contract.rs +++ b/testing/tests/client_manual_sync_script_contract.rs @@ -200,6 +200,7 @@ fn mirrored_sync_script_uses_real_client_capture_path() { "provisional calibration not saved", "calibration apply refused: ${calibration_decision_note}", "calibrate \"${calibration_apply_audio_delta_us}\" \"${calibration_apply_video_delta_us}\"", + "LESAVKA_UPSTREAM_BLIND_HEAL is server-side", "calibration-save-default", "print_upstream_sync_state \"after mirrored run\"", "print_upstream_calibration_state \"after mirrored run\"",