fix: make blind healing opt in

This commit is contained in:
Brad Stein 2026-05-02 20:42:47 -03:00
parent f0f204b777
commit 060e09336e
8 changed files with 58 additions and 13 deletions

View File

@ -594,5 +594,21 @@ camera playout loop woke up after waiting for the audio master.
- [x] Keep polling and timing inbound camera packets while video waits for its due time.
- [x] Coalesce pending video to the freshest packet during those waits so the server does not build a stale video backlog.
- [x] Add regression coverage that video timing is recorded at enqueue/drain time before scheduler waits.
- [ ] Re-run the probe-calibrate-confirm flow; `planner_server_receive_abs_skew_p95_ms` should fall if this was the receive-side scheduling leak.
- [x] Re-run the probe-calibrate-confirm flow; `planner_server_receive_abs_skew_p95_ms` should fall if this was the receive-side scheduling leak.
- [ ] If receive p95 remains high after this, inspect actual gRPC/HTTP2 stream delivery and OS/network scheduling rather than static calibration.
## 0.17.32 Blind Heal Opt-In And Stability Checklist
Context: the 0.17.31 mirrored run confirmed that the receive-side drain worked: client send and server
receive p95 both stayed near 50ms. The run still failed at probe pairing, and the server-side blind
healer silently changed calibration during the probe run because it was enabled by default and its
250ms gate tolerated a sink handoff p95 near 240ms.
- [x] Treat the 0.17.31 run as confirmation that the server receive scheduling leak is fixed.
- [x] Default runtime blind healing to disabled so probe-calibration runs cannot be contaminated by hidden server nudges.
- [x] Require explicit server-side `LESAVKA_UPSTREAM_BLIND_HEAL=1` before blind healing mutates transient calibration.
- [x] Tighten the blind-heal sink handoff p95 gate from 250ms to 120ms before applying runtime nudges.
- [x] Align mirrored-run root-cause summaries with the stricter sink handoff stability threshold.
- [x] Add regression coverage for default-disabled blind healing and noisy sink-handoff refusal.
- [ ] Re-run the normal probe-calibrate-confirm flow; `calibration_source` should remain non-blind unless the server was explicitly started with blind healing.
- [ ] If the probe still produces only one or two visual events while blind metrics stay stable, move the next fix to stimulus/browser/probe detection instead of transport timing.

6
Cargo.lock generated
View File

@ -1652,7 +1652,7 @@ checksum = "09edd9e8b54e49e587e4f6295a7d29c3ea94d469cb40ab8ca70b288248a81db2"
[[package]]
name = "lesavka_client"
version = "0.17.31"
version = "0.17.32"
dependencies = [
"anyhow",
"async-stream",
@ -1686,7 +1686,7 @@ dependencies = [
[[package]]
name = "lesavka_common"
version = "0.17.31"
version = "0.17.32"
dependencies = [
"anyhow",
"base64",
@ -1698,7 +1698,7 @@ dependencies = [
[[package]]
name = "lesavka_server"
version = "0.17.31"
version = "0.17.32"
dependencies = [
"anyhow",
"base64",

View File

@ -4,7 +4,7 @@ path = "src/main.rs"
[package]
name = "lesavka_client"
version = "0.17.31"
version = "0.17.32"
edition = "2024"
[dependencies]

View File

@ -1,6 +1,6 @@
[package]
name = "lesavka_common"
version = "0.17.31"
version = "0.17.32"
edition = "2024"
build = "build.rs"

View File

@ -95,6 +95,10 @@ export LESAVKA_SYNC_CONFIRM_AFTER_CALIBRATION
export LESAVKA_SYNC_CONFIRMATION_SEGMENTS
export LESAVKA_SYNC_REQUIRE_CONFIRMATION_PASS
# Print a reminder when LESAVKA_UPSTREAM_BLIND_HEAL is set to "1" in this
# script's environment (unset is treated as "0") and LESAVKA_SERVER_ADDR is
# "auto". The message states that the variable is read on the server side, so
# it has to be set where lesavka-server runs.
# NOTE(review): presumably "auto" means the server is not launched from this
# shell, so the local setting would not reach it -- confirm against the rest
# of the script.
if [[ "${LESAVKA_UPSTREAM_BLIND_HEAL:-0}" == "1" && "${LESAVKA_SERVER_ADDR}" == "auto" ]]; then
echo "==> note: LESAVKA_UPSTREAM_BLIND_HEAL is server-side; set it where lesavka-server runs to enable blind healing"
fi
cleanup() {
set +e
[[ -n "${CLIENT_PID}" ]] && kill "${CLIENT_PID}" >/dev/null 2>&1
@ -926,7 +930,7 @@ def diagnose_segment(row):
"Packets are reaching the server but one sink is missing its due time.",
"Tune server scheduler/sink handoff and avoid trusting offset-only fixes until lateness falls.",
)
if over(row, "planner_sink_handoff_abs_skew_p95_ms_after", 250):
if over(row, "planner_sink_handoff_abs_skew_p95_ms_after", 120):
add_finding(
findings,
"server_sink_handoff",
@ -957,7 +961,7 @@ def diagnose_segment(row):
and not over(row, "planner_microphone_client_queue_age_p95_ms_after", 150)
and not over(row, "planner_camera_sink_late_p95_ms_after", 120)
and not over(row, "planner_microphone_sink_late_p95_ms_after", 120)
and not over(row, "planner_sink_handoff_abs_skew_p95_ms_after", 250)
and not over(row, "planner_sink_handoff_abs_skew_p95_ms_after", 120)
)
if stable_blind_metrics and abs_exceeds(row, "planner_sink_handoff_skew_ms_after", 35):
add_finding(
@ -1302,7 +1306,7 @@ root_cause = {
"max_server_receive_abs_skew_p95_ms": 250,
"max_client_queue_age_p95_ms": 150,
"max_sink_late_p95_ms": 120,
"max_sink_handoff_abs_skew_p95_ms": 250,
"max_sink_handoff_abs_skew_p95_ms": 120,
"stable_sink_handoff_deadband_ms": 35,
"acceptable_probe_p95_abs_skew_ms": 80,
},

View File

@ -10,7 +10,7 @@ bench = false
[package]
name = "lesavka_server"
version = "0.17.31"
version = "0.17.32"
edition = "2024"
autobins = false

View File

@ -186,7 +186,7 @@ impl BlindHealConfig {
#[cfg(not(coverage))]
fn from_env() -> Self {
Self {
enabled: env_bool("LESAVKA_UPSTREAM_BLIND_HEAL", true),
enabled: env_bool("LESAVKA_UPSTREAM_BLIND_HEAL", false),
target: match std::env::var("LESAVKA_UPSTREAM_BLIND_HEAL_TARGET")
.unwrap_or_else(|_| "video".to_string())
.trim()
@ -200,7 +200,7 @@ impl BlindHealConfig {
deadband_ms: env_f64("LESAVKA_UPSTREAM_BLIND_HEAL_DEADBAND_MS", 35.0),
max_handoff_abs_p95_ms: env_f64(
"LESAVKA_UPSTREAM_BLIND_HEAL_MAX_HANDOFF_P95_MS",
250.0,
120.0,
),
max_client_send_abs_p95_ms: env_f64(
"LESAVKA_UPSTREAM_BLIND_HEAL_MAX_CLIENT_SEND_P95_MS",
@ -278,6 +278,7 @@ fn env_f64(name: &str, default: f64) -> f64 {
#[cfg(test)]
mod tests {
use super::*;
use serial_test::serial;
fn config() -> BlindHealConfig {
BlindHealConfig {
@ -285,7 +286,7 @@ mod tests {
target: BlindHealTarget::Video,
min_samples: 30,
deadband_ms: 35.0,
max_handoff_abs_p95_ms: 250.0,
max_handoff_abs_p95_ms: 120.0,
max_client_send_abs_p95_ms: 250.0,
max_server_receive_abs_p95_ms: 250.0,
max_queue_age_p95_ms: 150.0,
@ -337,6 +338,22 @@ mod tests {
}
}
/// Regression test for the opt-in default of blind healing.
///
/// With `LESAVKA_UPSTREAM_BLIND_HEAL` unset, `BlindHealConfig::from_env()`
/// must report `enabled == false`; with the variable set to `"1"` it must
/// report `enabled == true`.
///
/// The test carries the same `#[cfg(not(coverage))]` gate as the `from_env`
/// constructor it exercises.
// NOTE(review): `#[serial]` is presumably here because the test mutates
// process-wide environment variables and must not overlap with other
// env-dependent tests -- confirm the other env tests are also `#[serial]`.
#[cfg(not(coverage))]
#[test]
#[serial]
fn blind_healer_env_defaults_disabled_and_accepts_opt_in() {
// Default path: no variable present, so the healer must stay disabled.
temp_env::with_var_unset("LESAVKA_UPSTREAM_BLIND_HEAL", || {
assert!(
!BlindHealConfig::from_env().enabled,
"blind healing must not mutate calibration unless the server opted in"
);
});
// Explicit opt-in: the value "1" must enable the healer.
temp_env::with_var("LESAVKA_UPSTREAM_BLIND_HEAL", Some("1"), || {
assert!(BlindHealConfig::from_env().enabled);
});
}
#[test]
fn blind_healer_nudges_video_opposite_sink_handoff_skew() {
let decision = evaluate_blind_heal_snapshot(&snapshot(), config());
@ -367,6 +384,13 @@ mod tests {
evaluate_blind_heal_snapshot(&noisy_network, config()),
BlindHealDecision::Wait("server-receive-p95-unstable")
);
let mut noisy_handoff = snapshot();
noisy_handoff.sink_handoff_abs_skew_p95_ms = Some(241.0);
assert_eq!(
evaluate_blind_heal_snapshot(&noisy_handoff, config()),
BlindHealDecision::Wait("sink-handoff-p95-unstable")
);
}
#[test]

View File

@ -200,6 +200,7 @@ fn mirrored_sync_script_uses_real_client_capture_path() {
"provisional calibration not saved",
"calibration apply refused: ${calibration_decision_note}",
"calibrate \"${calibration_apply_audio_delta_us}\" \"${calibration_apply_video_delta_us}\"",
"LESAVKA_UPSTREAM_BLIND_HEAL is server-side",
"calibration-save-default",
"print_upstream_sync_state \"after mirrored run\"",
"print_upstream_calibration_state \"after mirrored run\"",