fix: make blind healing opt in
This commit is contained in:
parent
f0f204b777
commit
060e09336e
18
AGENTS.md
18
AGENTS.md
@ -594,5 +594,21 @@ camera playout loop woke up after waiting for the audio master.
|
|||||||
- [x] Keep polling and timing inbound camera packets while video waits for its due time.
|
- [x] Keep polling and timing inbound camera packets while video waits for its due time.
|
||||||
- [x] Coalesce pending video to the freshest packet during those waits so the server does not build a stale video backlog.
|
- [x] Coalesce pending video to the freshest packet during those waits so the server does not build a stale video backlog.
|
||||||
- [x] Add regression coverage that video timing is recorded at enqueue/drain time before scheduler waits.
|
- [x] Add regression coverage that video timing is recorded at enqueue/drain time before scheduler waits.
|
||||||
- [ ] Re-run the probe-calibrate-confirm flow; `planner_server_receive_abs_skew_p95_ms` should fall if this was the receive-side scheduling leak.
|
- [x] Re-run the probe-calibrate-confirm flow; `planner_server_receive_abs_skew_p95_ms` should fall if this was the receive-side scheduling leak.
|
||||||
- [ ] If receive p95 remains high after this, inspect actual gRPC/HTTP2 stream delivery and OS/network scheduling rather than static calibration.
|
- [ ] If receive p95 remains high after this, inspect actual gRPC/HTTP2 stream delivery and OS/network scheduling rather than static calibration.
|
||||||
|
|
||||||
|
## 0.17.32 Blind Heal Opt-In And Stability Checklist
|
||||||
|
|
||||||
|
Context: the 0.17.31 mirrored run confirmed the receive-side drain worked: client send and server
|
||||||
|
receive p95 both stayed near 50ms. The run still failed, classified as a probe-pairing failure, and the server-side blind
|
||||||
|
healer silently changed calibration during the probe run because it was enabled by default and allowed
|
||||||
|
sink handoff p95 near 240ms.
|
||||||
|
|
||||||
|
- [x] Treat the 0.17.31 run as confirmation that the server receive scheduling leak is fixed.
|
||||||
|
- [x] Default runtime blind healing to disabled so probe-calibration runs cannot be contaminated by hidden server nudges.
|
||||||
|
- [x] Require explicit server-side `LESAVKA_UPSTREAM_BLIND_HEAL=1` before blind healing mutates transient calibration.
|
||||||
|
- [x] Tighten the blind-heal sink handoff p95 gate from 250ms to 120ms before applying runtime nudges.
|
||||||
|
- [x] Align mirrored-run root-cause summaries with the stricter sink handoff stability threshold.
|
||||||
|
- [x] Add regression coverage for default-disabled blind healing and noisy sink-handoff refusal.
|
||||||
|
- [ ] Re-run the normal probe-calibrate-confirm flow; `calibration_source` should remain non-blind unless the server was explicitly started with blind healing.
|
||||||
|
- [ ] If the probe still produces only one or two visual events while blind metrics stay stable, move the next fix to stimulus/browser/probe detection instead of transport timing.
|
||||||
|
|||||||
6
Cargo.lock
generated
6
Cargo.lock
generated
@ -1652,7 +1652,7 @@ checksum = "09edd9e8b54e49e587e4f6295a7d29c3ea94d469cb40ab8ca70b288248a81db2"
|
|||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "lesavka_client"
|
name = "lesavka_client"
|
||||||
version = "0.17.31"
|
version = "0.17.32"
|
||||||
dependencies = [
|
dependencies = [
|
||||||
"anyhow",
|
"anyhow",
|
||||||
"async-stream",
|
"async-stream",
|
||||||
@ -1686,7 +1686,7 @@ dependencies = [
|
|||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "lesavka_common"
|
name = "lesavka_common"
|
||||||
version = "0.17.31"
|
version = "0.17.32"
|
||||||
dependencies = [
|
dependencies = [
|
||||||
"anyhow",
|
"anyhow",
|
||||||
"base64",
|
"base64",
|
||||||
@ -1698,7 +1698,7 @@ dependencies = [
|
|||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "lesavka_server"
|
name = "lesavka_server"
|
||||||
version = "0.17.31"
|
version = "0.17.32"
|
||||||
dependencies = [
|
dependencies = [
|
||||||
"anyhow",
|
"anyhow",
|
||||||
"base64",
|
"base64",
|
||||||
|
|||||||
@ -4,7 +4,7 @@ path = "src/main.rs"
|
|||||||
|
|
||||||
[package]
|
[package]
|
||||||
name = "lesavka_client"
|
name = "lesavka_client"
|
||||||
version = "0.17.31"
|
version = "0.17.32"
|
||||||
edition = "2024"
|
edition = "2024"
|
||||||
|
|
||||||
[dependencies]
|
[dependencies]
|
||||||
|
|||||||
@ -1,6 +1,6 @@
|
|||||||
[package]
|
[package]
|
||||||
name = "lesavka_common"
|
name = "lesavka_common"
|
||||||
version = "0.17.31"
|
version = "0.17.32"
|
||||||
edition = "2024"
|
edition = "2024"
|
||||||
build = "build.rs"
|
build = "build.rs"
|
||||||
|
|
||||||
|
|||||||
@ -95,6 +95,10 @@ export LESAVKA_SYNC_CONFIRM_AFTER_CALIBRATION
|
|||||||
export LESAVKA_SYNC_CONFIRMATION_SEGMENTS
|
export LESAVKA_SYNC_CONFIRMATION_SEGMENTS
|
||||||
export LESAVKA_SYNC_REQUIRE_CONFIRMATION_PASS
|
export LESAVKA_SYNC_REQUIRE_CONFIRMATION_PASS
|
||||||
|
|
||||||
|
if [[ "${LESAVKA_UPSTREAM_BLIND_HEAL:-0}" == "1" && "${LESAVKA_SERVER_ADDR}" == "auto" ]]; then
|
||||||
|
echo "==> note: LESAVKA_UPSTREAM_BLIND_HEAL is server-side; set it where lesavka-server runs to enable blind healing"
|
||||||
|
fi
|
||||||
|
|
||||||
cleanup() {
|
cleanup() {
|
||||||
set +e
|
set +e
|
||||||
[[ -n "${CLIENT_PID}" ]] && kill "${CLIENT_PID}" >/dev/null 2>&1
|
[[ -n "${CLIENT_PID}" ]] && kill "${CLIENT_PID}" >/dev/null 2>&1
|
||||||
@ -926,7 +930,7 @@ def diagnose_segment(row):
|
|||||||
"Packets are reaching the server but one sink is missing its due time.",
|
"Packets are reaching the server but one sink is missing its due time.",
|
||||||
"Tune server scheduler/sink handoff and avoid trusting offset-only fixes until lateness falls.",
|
"Tune server scheduler/sink handoff and avoid trusting offset-only fixes until lateness falls.",
|
||||||
)
|
)
|
||||||
if over(row, "planner_sink_handoff_abs_skew_p95_ms_after", 250):
|
if over(row, "planner_sink_handoff_abs_skew_p95_ms_after", 120):
|
||||||
add_finding(
|
add_finding(
|
||||||
findings,
|
findings,
|
||||||
"server_sink_handoff",
|
"server_sink_handoff",
|
||||||
@ -957,7 +961,7 @@ def diagnose_segment(row):
|
|||||||
and not over(row, "planner_microphone_client_queue_age_p95_ms_after", 150)
|
and not over(row, "planner_microphone_client_queue_age_p95_ms_after", 150)
|
||||||
and not over(row, "planner_camera_sink_late_p95_ms_after", 120)
|
and not over(row, "planner_camera_sink_late_p95_ms_after", 120)
|
||||||
and not over(row, "planner_microphone_sink_late_p95_ms_after", 120)
|
and not over(row, "planner_microphone_sink_late_p95_ms_after", 120)
|
||||||
and not over(row, "planner_sink_handoff_abs_skew_p95_ms_after", 250)
|
and not over(row, "planner_sink_handoff_abs_skew_p95_ms_after", 120)
|
||||||
)
|
)
|
||||||
if stable_blind_metrics and abs_exceeds(row, "planner_sink_handoff_skew_ms_after", 35):
|
if stable_blind_metrics and abs_exceeds(row, "planner_sink_handoff_skew_ms_after", 35):
|
||||||
add_finding(
|
add_finding(
|
||||||
@ -1302,7 +1306,7 @@ root_cause = {
|
|||||||
"max_server_receive_abs_skew_p95_ms": 250,
|
"max_server_receive_abs_skew_p95_ms": 250,
|
||||||
"max_client_queue_age_p95_ms": 150,
|
"max_client_queue_age_p95_ms": 150,
|
||||||
"max_sink_late_p95_ms": 120,
|
"max_sink_late_p95_ms": 120,
|
||||||
"max_sink_handoff_abs_skew_p95_ms": 250,
|
"max_sink_handoff_abs_skew_p95_ms": 120,
|
||||||
"stable_sink_handoff_deadband_ms": 35,
|
"stable_sink_handoff_deadband_ms": 35,
|
||||||
"acceptable_probe_p95_abs_skew_ms": 80,
|
"acceptable_probe_p95_abs_skew_ms": 80,
|
||||||
},
|
},
|
||||||
|
|||||||
@ -10,7 +10,7 @@ bench = false
|
|||||||
|
|
||||||
[package]
|
[package]
|
||||||
name = "lesavka_server"
|
name = "lesavka_server"
|
||||||
version = "0.17.31"
|
version = "0.17.32"
|
||||||
edition = "2024"
|
edition = "2024"
|
||||||
autobins = false
|
autobins = false
|
||||||
|
|
||||||
|
|||||||
@ -186,7 +186,7 @@ impl BlindHealConfig {
|
|||||||
#[cfg(not(coverage))]
|
#[cfg(not(coverage))]
|
||||||
fn from_env() -> Self {
|
fn from_env() -> Self {
|
||||||
Self {
|
Self {
|
||||||
enabled: env_bool("LESAVKA_UPSTREAM_BLIND_HEAL", true),
|
enabled: env_bool("LESAVKA_UPSTREAM_BLIND_HEAL", false),
|
||||||
target: match std::env::var("LESAVKA_UPSTREAM_BLIND_HEAL_TARGET")
|
target: match std::env::var("LESAVKA_UPSTREAM_BLIND_HEAL_TARGET")
|
||||||
.unwrap_or_else(|_| "video".to_string())
|
.unwrap_or_else(|_| "video".to_string())
|
||||||
.trim()
|
.trim()
|
||||||
@ -200,7 +200,7 @@ impl BlindHealConfig {
|
|||||||
deadband_ms: env_f64("LESAVKA_UPSTREAM_BLIND_HEAL_DEADBAND_MS", 35.0),
|
deadband_ms: env_f64("LESAVKA_UPSTREAM_BLIND_HEAL_DEADBAND_MS", 35.0),
|
||||||
max_handoff_abs_p95_ms: env_f64(
|
max_handoff_abs_p95_ms: env_f64(
|
||||||
"LESAVKA_UPSTREAM_BLIND_HEAL_MAX_HANDOFF_P95_MS",
|
"LESAVKA_UPSTREAM_BLIND_HEAL_MAX_HANDOFF_P95_MS",
|
||||||
250.0,
|
120.0,
|
||||||
),
|
),
|
||||||
max_client_send_abs_p95_ms: env_f64(
|
max_client_send_abs_p95_ms: env_f64(
|
||||||
"LESAVKA_UPSTREAM_BLIND_HEAL_MAX_CLIENT_SEND_P95_MS",
|
"LESAVKA_UPSTREAM_BLIND_HEAL_MAX_CLIENT_SEND_P95_MS",
|
||||||
@ -278,6 +278,7 @@ fn env_f64(name: &str, default: f64) -> f64 {
|
|||||||
#[cfg(test)]
|
#[cfg(test)]
|
||||||
mod tests {
|
mod tests {
|
||||||
use super::*;
|
use super::*;
|
||||||
|
use serial_test::serial;
|
||||||
|
|
||||||
fn config() -> BlindHealConfig {
|
fn config() -> BlindHealConfig {
|
||||||
BlindHealConfig {
|
BlindHealConfig {
|
||||||
@ -285,7 +286,7 @@ mod tests {
|
|||||||
target: BlindHealTarget::Video,
|
target: BlindHealTarget::Video,
|
||||||
min_samples: 30,
|
min_samples: 30,
|
||||||
deadband_ms: 35.0,
|
deadband_ms: 35.0,
|
||||||
max_handoff_abs_p95_ms: 250.0,
|
max_handoff_abs_p95_ms: 120.0,
|
||||||
max_client_send_abs_p95_ms: 250.0,
|
max_client_send_abs_p95_ms: 250.0,
|
||||||
max_server_receive_abs_p95_ms: 250.0,
|
max_server_receive_abs_p95_ms: 250.0,
|
||||||
max_queue_age_p95_ms: 150.0,
|
max_queue_age_p95_ms: 150.0,
|
||||||
@ -337,6 +338,22 @@ mod tests {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
#[cfg(not(coverage))]
|
||||||
|
#[test]
|
||||||
|
#[serial]
|
||||||
|
fn blind_healer_env_defaults_disabled_and_accepts_opt_in() {
|
||||||
|
temp_env::with_var_unset("LESAVKA_UPSTREAM_BLIND_HEAL", || {
|
||||||
|
assert!(
|
||||||
|
!BlindHealConfig::from_env().enabled,
|
||||||
|
"blind healing must not mutate calibration unless the server opted in"
|
||||||
|
);
|
||||||
|
});
|
||||||
|
|
||||||
|
temp_env::with_var("LESAVKA_UPSTREAM_BLIND_HEAL", Some("1"), || {
|
||||||
|
assert!(BlindHealConfig::from_env().enabled);
|
||||||
|
});
|
||||||
|
}
|
||||||
|
|
||||||
#[test]
|
#[test]
|
||||||
fn blind_healer_nudges_video_opposite_sink_handoff_skew() {
|
fn blind_healer_nudges_video_opposite_sink_handoff_skew() {
|
||||||
let decision = evaluate_blind_heal_snapshot(&snapshot(), config());
|
let decision = evaluate_blind_heal_snapshot(&snapshot(), config());
|
||||||
@ -367,6 +384,13 @@ mod tests {
|
|||||||
evaluate_blind_heal_snapshot(&noisy_network, config()),
|
evaluate_blind_heal_snapshot(&noisy_network, config()),
|
||||||
BlindHealDecision::Wait("server-receive-p95-unstable")
|
BlindHealDecision::Wait("server-receive-p95-unstable")
|
||||||
);
|
);
|
||||||
|
|
||||||
|
let mut noisy_handoff = snapshot();
|
||||||
|
noisy_handoff.sink_handoff_abs_skew_p95_ms = Some(241.0);
|
||||||
|
assert_eq!(
|
||||||
|
evaluate_blind_heal_snapshot(&noisy_handoff, config()),
|
||||||
|
BlindHealDecision::Wait("sink-handoff-p95-unstable")
|
||||||
|
);
|
||||||
}
|
}
|
||||||
|
|
||||||
#[test]
|
#[test]
|
||||||
|
|||||||
@ -200,6 +200,7 @@ fn mirrored_sync_script_uses_real_client_capture_path() {
|
|||||||
"provisional calibration not saved",
|
"provisional calibration not saved",
|
||||||
"calibration apply refused: ${calibration_decision_note}",
|
"calibration apply refused: ${calibration_decision_note}",
|
||||||
"calibrate \"${calibration_apply_audio_delta_us}\" \"${calibration_apply_video_delta_us}\"",
|
"calibrate \"${calibration_apply_audio_delta_us}\" \"${calibration_apply_video_delta_us}\"",
|
||||||
|
"LESAVKA_UPSTREAM_BLIND_HEAL is server-side",
|
||||||
"calibration-save-default",
|
"calibration-save-default",
|
||||||
"print_upstream_sync_state \"after mirrored run\"",
|
"print_upstream_sync_state \"after mirrored run\"",
|
||||||
"print_upstream_calibration_state \"after mirrored run\"",
|
"print_upstream_calibration_state \"after mirrored run\"",
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user