feat: add runtime blind sync healer

This commit is contained in:
Brad Stein 2026-05-02 17:48:45 -03:00
parent e1fb31235f
commit 826bada865
16 changed files with 762 additions and 6 deletions

View File

@ -525,3 +525,17 @@ measurement gaps before tuning any new healing controller.
- [x] Include rolling blind metrics in mirrored-probe CSV/JSONL summaries and blind targets.
- [x] Add planner tests for rolling timing windows and sink handoff timing.
- [ ] Use the next mirrored run only for correlation/tuning: decide whether the controller should adjust playout delay, offset, or drop/freeze policy from these blind metrics.
## 0.17.27 Runtime Blind Healing Checklist
Context: if client/server-only timing is stable enough, Lesavka should make
small runtime corrections without waiting for the external probe. The probe
remains the authoritative judge and root-cause localizer, not a production dependency.
- [x] Add a server-owned blind healer loop enabled by default with `LESAVKA_UPSTREAM_BLIND_HEAL=0` escape hatch.
- [x] Gate blind healing on rolling sample counts plus client-send, server-receive, queue-age, sink-late, and sink-handoff p95 limits.
- [x] Apply bounded transient offset nudges from sink handoff skew without saving them as site defaults.
- [x] Expose sample counts in `upstream-sync` and mirrored probe artifacts so failed runs can separate "insufficient evidence" from real timing failure.
- [x] Emit `root-cause-summary.json` from mirrored probe runs to classify failing layers instead of eyeballing raw metrics.
- [x] Add unit tests for apply/refuse/target behavior in the blind healer.
- [ ] Next run should identify the failing layer if confirmation still fails: client capture/uplink, network/server receive, server planner, server sink handoff, or external USB/browser/probe boundary.

6
Cargo.lock generated
View File

@ -1652,7 +1652,7 @@ checksum = "09edd9e8b54e49e587e4f6295a7d29c3ea94d469cb40ab8ca70b288248a81db2"
[[package]]
name = "lesavka_client"
version = "0.17.26"
version = "0.17.27"
dependencies = [
"anyhow",
"async-stream",
@ -1686,7 +1686,7 @@ dependencies = [
[[package]]
name = "lesavka_common"
version = "0.17.26"
version = "0.17.27"
dependencies = [
"anyhow",
"base64",
@ -1698,7 +1698,7 @@ dependencies = [
[[package]]
name = "lesavka_server"
version = "0.17.26"
version = "0.17.27"
dependencies = [
"anyhow",
"base64",

View File

@ -4,7 +4,7 @@ path = "src/main.rs"
[package]
name = "lesavka_client"
version = "0.17.26"
version = "0.17.27"
edition = "2024"
[dependencies]

View File

@ -398,6 +398,14 @@ fn print_upstream_sync(state: lesavka_common::lesavka::UpstreamSyncState) {
.map(|value| format!("{value:.1}"))
.unwrap_or_else(|| "pending".to_string())
);
println!(
"planner_client_timing_window_samples={}",
state.client_timing_window_samples
);
println!(
"planner_sink_handoff_window_samples={}",
state.sink_handoff_window_samples
);
println!("planner_detail={}", state.last_reason);
}

View File

@ -1,6 +1,6 @@
[package]
name = "lesavka_common"
version = "0.17.26"
version = "0.17.27"
edition = "2024"
build = "build.rs"

View File

@ -136,6 +136,8 @@ message UpstreamSyncState {
optional float microphone_sink_late_ms = 31;
optional float camera_sink_late_p95_ms = 32;
optional float microphone_sink_late_p95_ms = 33;
uint64 client_timing_window_samples = 34;
uint64 sink_handoff_window_samples = 35;
}
message HandshakeSet {

View File

@ -733,6 +733,7 @@ summarize_adaptive_probe_metrics() {
python3 - "${ARTIFACT_DIR}" "${LESAVKA_SYNC_TOTAL_SEGMENTS}" "${LESAVKA_SYNC_CALIBRATION_SEGMENTS}" <<'PY'
import csv
import json
import math
import os
import sys
from pathlib import Path
@ -788,8 +789,204 @@ def range_for(rows, key):
}
def is_number(value):
    """Return True when *value* is a finite int/float (bools count as ints)."""
    if not isinstance(value, (int, float)):
        return False
    return math.isfinite(float(value))
def abs_exceeds(row, key, threshold):
    """True only when row[key] is a finite number whose magnitude is past threshold."""
    value = row.get(key)
    if not is_number(value):
        return False
    return abs(float(value)) > threshold
def over(row, key, threshold):
    """True only when row[key] exists, is finite, and is strictly above threshold."""
    value = row.get(key)
    if not is_number(value):
        return False
    return float(value) > threshold
def under(row, key, threshold):
    """True when row[key] is below threshold; missing/non-numeric counts as under."""
    value = row.get(key)
    if not is_number(value):
        return True
    return float(value) < threshold
def add_finding(findings, layer, severity, signal, detail, next_step):
    """Append one structured diagnostic finding to *findings* (mutated in place)."""
    entry = {
        "layer": layer,
        "severity": severity,
        "signal": signal,
        "detail": detail,
        "next_step": next_step,
    }
    findings.append(entry)
def diagnose_segment(row):
    """Classify one segment's probe result into layer-tagged findings.

    Returns a non-empty list of finding dicts (see add_finding). A passing
    segment short-circuits with a single info finding; a failing segment may
    accumulate several findings, one per threshold it crossed.
    """
    findings = []
    # NOTE(review): the isinstance(paired, int) checks below are skipped if the
    # report ever stores this as a float (e.g. 2.0) — confirm the report schema
    # always emits an int for probe_paired_pulses.
    paired = row.get("probe_paired_pulses")
    status = row.get("probe_status", "missing")
    # First non-empty human-readable reason wins, in source-priority order.
    reason = row.get("analysis_failure_reason") or row.get("calibration_note") or row.get("decision_note") or ""
    if row.get("probe_passed"):
        add_finding(
            findings,
            "none",
            "info",
            "probe_passed",
            "The external probe judged this segment inside the acceptable sync band.",
            "Use this segment as a candidate blind target only if confirmation also passes.",
        )
        return findings
    # --- Probe-evidence quality (mutually exclusive if/elif chain) ---
    if "video did not contain any recognizable color-coded sync pulses" in reason:
        add_finding(
            findings,
            "probe_video",
            "blocker",
            "no_video_sync_pulses",
            "The analyzer could not see the visual sync code in the browser capture.",
            "Fix camera framing/focus/exposure or stimulus visibility before treating sync numbers as real.",
        )
    elif isinstance(paired, int) and paired < 3:
        add_finding(
            findings,
            "probe_pairing",
            "blocker",
            "too_few_matching_pairs",
            f"The analyzer saw only {paired} matching pulse pairs, below the minimum verdict floor.",
            "Treat this as insufficient probe evidence unless raw activity and server blind metrics agree.",
        )
    elif isinstance(paired, int) and paired < 8:
        add_finding(
            findings,
            "probe_pairing",
            "warning",
            "low_pair_count",
            f"The analyzer saw {paired} paired pulses, enough for a verdict but below calibration-ready quality.",
            "Use the result for direction, not persistence; improve audio/visual pulse detection if this repeats.",
        )
    # --- Server-side evidence sufficiency (missing metrics count as "under") ---
    if under(row, "planner_client_timing_window_samples_after", 30):
        add_finding(
            findings,
            "server_evidence",
            "blocker",
            "client_timing_window_underfilled",
            "The server did not collect enough client timing sidecar samples for blind diagnosis.",
            "Keep the sender connected longer or investigate missing timing metadata before calibration.",
        )
    if under(row, "planner_sink_handoff_window_samples_after", 30):
        add_finding(
            findings,
            "server_evidence",
            "blocker",
            "sink_handoff_window_underfilled",
            "The server did not collect enough audio/video sink handoff samples for blind diagnosis.",
            "Keep the mirrored run alive longer or check whether one sink is not presenting steadily.",
        )
    # --- Per-layer p95 threshold checks (independent; several may stack) ---
    if over(row, "planner_client_send_abs_skew_p95_ms_after", 250):
        add_finding(
            findings,
            "client_uplink",
            "blocker",
            "client_send_skew_p95_high",
            "Client-side audio/video send timing is already unstable before the server receives it.",
            "Look at capture queues, encode pressure, USB device behavior, and client CPU scheduling.",
        )
    if over(row, "planner_camera_client_queue_age_p95_ms_after", 150) or over(row, "planner_microphone_client_queue_age_p95_ms_after", 150):
        add_finding(
            findings,
            "client_uplink",
            "blocker",
            "client_queue_age_p95_high",
            "One or both client capture queues are aging packets before send.",
            "Prefer dropping stale packets or lowering capture/encode pressure instead of adding offset.",
        )
    if over(row, "planner_server_receive_abs_skew_p95_ms_after", 250):
        add_finding(
            findings,
            "network_receive",
            "blocker",
            "server_receive_skew_p95_high",
            "Audio/video timing becomes unstable between client send and server receive.",
            "Treat this as network/gRPC receive jitter; heal freshness with drop/reanchor policy, not static calibration.",
        )
    if over(row, "planner_camera_sink_late_p95_ms_after", 120) or over(row, "planner_microphone_sink_late_p95_ms_after", 120):
        add_finding(
            findings,
            "server_sink_scheduler",
            "blocker",
            "sink_late_p95_high",
            "Packets are reaching the server but one sink is missing its due time.",
            "Tune server scheduler/sink handoff and avoid trusting offset-only fixes until lateness falls.",
        )
    if over(row, "planner_sink_handoff_abs_skew_p95_ms_after", 250):
        add_finding(
            findings,
            "server_sink_handoff",
            "blocker",
            "sink_handoff_skew_p95_high",
            "Server sink handoff timing is too jittery for a stable blind correction.",
            "First reduce handoff jitter; the blind healer should refuse large p95 instability.",
        )
    # Metrics the blind healer requires before trusting a server-side signal.
    required_blind_keys = [
        "planner_client_timing_window_samples_after",
        "planner_sink_handoff_window_samples_after",
        "planner_client_send_abs_skew_p95_ms_after",
        "planner_server_receive_abs_skew_p95_ms_after",
        "planner_camera_client_queue_age_p95_ms_after",
        "planner_microphone_client_queue_age_p95_ms_after",
        "planner_camera_sink_late_p95_ms_after",
        "planner_microphone_sink_late_p95_ms_after",
        "planner_sink_handoff_abs_skew_p95_ms_after",
    ]
    # "Stable" means every required metric is present AND inside the same
    # limits the individual checks above use (keep these thresholds in sync).
    stable_blind_metrics = (
        all(is_number(row.get(key)) for key in required_blind_keys)
        and not under(row, "planner_client_timing_window_samples_after", 30)
        and not under(row, "planner_sink_handoff_window_samples_after", 30)
        and not over(row, "planner_client_send_abs_skew_p95_ms_after", 250)
        and not over(row, "planner_server_receive_abs_skew_p95_ms_after", 250)
        and not over(row, "planner_camera_client_queue_age_p95_ms_after", 150)
        and not over(row, "planner_microphone_client_queue_age_p95_ms_after", 150)
        and not over(row, "planner_camera_sink_late_p95_ms_after", 120)
        and not over(row, "planner_microphone_sink_late_p95_ms_after", 120)
        and not over(row, "planner_sink_handoff_abs_skew_p95_ms_after", 250)
    )
    if stable_blind_metrics and abs_exceeds(row, "planner_sink_handoff_skew_ms_after", 35):
        add_finding(
            findings,
            "server_calibration",
            "warning",
            "stable_sink_handoff_offset",
            "Blind metrics are stable enough and show a consistent server-side handoff offset.",
            "The runtime blind healer should make bounded transient nudges from this signal.",
        )
    if stable_blind_metrics and over(row, "probe_p95_abs_skew_ms", 80):
        add_finding(
            findings,
            "external_boundary",
            "warning",
            "probe_fails_while_blind_metrics_stable",
            "Client/server timing looks stable but the browser probe still sees skew.",
            "Investigate USB gadget output, browser capture, physical mic/camera setup, or probe detector limits.",
        )
    if not findings:
        # Fallback so callers can always rely on a non-empty findings list.
        add_finding(
            findings,
            "unknown",
            "warning",
            status,
            "The segment failed, but no single client/server metric crossed the current diagnostic thresholds.",
            "Compare per-pulse events and raise timing trace if this pattern repeats.",
        )
    return findings
def primary_finding(findings):
    """Return the most severe finding: blocker < warning < info < unknown.

    Ties keep the first occurrence (matches the previous stable-sort behavior).
    Uses min() — O(n) — instead of sorting the whole list. Raises IndexError on
    an empty list, preserving the original contract.
    """
    if not findings:
        raise IndexError("primary_finding requires at least one finding")
    severity_rank = {"blocker": 0, "warning": 1, "info": 2}
    return min(findings, key=lambda item: severity_rank.get(item.get("severity"), 9))
# Accumulators filled by the per-segment loop below: one metrics row per
# segment, per-pulse event rows, and the layered diagnosis for each segment.
rows = []
event_rows = []
diagnoses = []
for segment in range(1, segment_count + 1):
segment_dir = root / f"segment-{segment}"
report_path = latest_report(segment_dir)
@ -872,11 +1069,27 @@ for segment in range(1, segment_count + 1):
"planner_microphone_sink_late_ms_after": as_float(planner_after.get("planner_microphone_sink_late_ms")),
"planner_camera_sink_late_p95_ms_after": as_float(planner_after.get("planner_camera_sink_late_p95_ms")),
"planner_microphone_sink_late_p95_ms_after": as_float(planner_after.get("planner_microphone_sink_late_p95_ms")),
"planner_client_timing_window_samples_after": as_float(planner_after.get("planner_client_timing_window_samples")),
"planner_sink_handoff_window_samples_after": as_float(planner_after.get("planner_sink_handoff_window_samples")),
"active_audio_offset_us_before": as_float(calibration_before.get("calibration_active_audio_offset_us")),
"active_audio_offset_us_after": as_float(calibration_after.get("calibration_active_audio_offset_us")),
"active_video_offset_us_before": as_float(calibration_before.get("calibration_active_video_offset_us")),
"active_video_offset_us_after": as_float(calibration_after.get("calibration_active_video_offset_us")),
}
findings = diagnose_segment(row)
primary = primary_finding(findings)
row["diagnostic_layer"] = primary.get("layer", "")
row["diagnostic_severity"] = primary.get("severity", "")
row["diagnostic_signal"] = primary.get("signal", "")
row["diagnostic_detail"] = primary.get("detail", "")
diagnoses.append({
"segment": segment,
"segment_phase": phase,
"probe_status": row["probe_status"],
"probe_passed": row["probe_passed"],
"primary": primary,
"findings": findings,
})
rows.append(row)
for event in report.get("paired_events", []):
@ -961,6 +1174,8 @@ if target_source_rows:
"planner_microphone_sink_late_ms_after": range_for(target_source_rows, "planner_microphone_sink_late_ms_after"),
"planner_camera_sink_late_p95_ms_after": range_for(target_source_rows, "planner_camera_sink_late_p95_ms_after"),
"planner_microphone_sink_late_p95_ms_after": range_for(target_source_rows, "planner_microphone_sink_late_p95_ms_after"),
"planner_client_timing_window_samples_after": range_for(target_source_rows, "planner_client_timing_window_samples_after"),
"planner_sink_handoff_window_samples_after": range_for(target_source_rows, "planner_sink_handoff_window_samples_after"),
"active_audio_offset_us_after": range_for(target_source_rows, "active_audio_offset_us_after"),
"active_video_offset_us_after": range_for(target_source_rows, "active_video_offset_us_after"),
"probe_p95_abs_skew_ms": range_for(target_source_rows, "probe_p95_abs_skew_ms"),
@ -1017,6 +1232,55 @@ else:
}
confirmation_path.write_text(json.dumps(confirmation, indent=2, sort_keys=True) + "\n", encoding="utf-8")
# Emit root-cause-summary.json: a machine-readable classification of the most
# likely failing layer, so operators stop eyeballing raw metrics.
root_cause_path = root / "root-cause-summary.json"
severity_rank = {"blocker": 0, "warning": 1, "info": 2}
# Prefer confirmation segments as the diagnostic source; fall back to all rows.
if confirmation_rows:
    diagnostic_source_segments = {row["segment"] for row in confirmation_rows}
    diagnostic_source = "confirmation segments"
else:
    diagnostic_source_segments = {row["segment"] for row in rows}
    diagnostic_source = "all segments"
source_diagnoses = [
    diagnosis
    for diagnosis in diagnoses
    if diagnosis.get("segment") in diagnostic_source_segments
]
if not source_diagnoses:
    source_diagnoses = diagnoses
primary_diagnosis = None
if source_diagnoses:
    # Most severe primary finding across the chosen segments; the stable sort
    # means earlier segments win severity ties.
    primary_diagnosis = sorted(
        source_diagnoses,
        key=lambda diagnosis: severity_rank.get(
            diagnosis.get("primary", {}).get("severity"),
            9,
        ),
    )[0].get("primary")
root_cause = {
    "ready": bool(source_diagnoses),
    "source": diagnostic_source,
    "primary": primary_diagnosis or {
        "layer": "missing",
        "severity": "blocker",
        "signal": "no_segments",
        "detail": "No segment diagnostics were available.",
        "next_step": "Inspect run logs before trusting this artifact.",
    },
    # Thresholds mirrored for downstream tooling; keep in sync with the
    # limits hard-coded in diagnose_segment.
    "thresholds": {
        "min_client_timing_window_samples": 30,
        "min_sink_handoff_window_samples": 30,
        "max_client_send_abs_skew_p95_ms": 250,
        "max_server_receive_abs_skew_p95_ms": 250,
        "max_client_queue_age_p95_ms": 150,
        "max_sink_late_p95_ms": 120,
        "max_sink_handoff_abs_skew_p95_ms": 250,
        "stable_sink_handoff_deadband_ms": 35,
        "acceptable_probe_p95_abs_skew_ms": 80,
    },
    "segment_diagnoses": diagnoses,
}
root_cause_path.write_text(json.dumps(root_cause, indent=2, sort_keys=True) + "\n", encoding="utf-8")
print(f" ↪ segment_metrics_csv={csv_path}")
print(f" ↪ segment_metrics_jsonl={jsonl_path}")
print(f" ↪ segment_events_csv={events_csv_path}")
@ -1025,6 +1289,10 @@ print(f" ↪ blind_targets_json={target_path}")
print(f" ↪ blind_targets_ready={str(bool(target.get('ready'))).lower()}")
print(f" ↪ confirmation_summary_json={confirmation_path}")
print(f" ↪ confirmation_passed={str(bool(confirmation.get('passed'))).lower()}")
print(f" ↪ root_cause_summary_json={root_cause_path}")
print(f" ↪ root_cause_layer={root_cause.get('primary', {}).get('layer')}")
print(f" ↪ root_cause_signal={root_cause.get('primary', {}).get('signal')}")
print(f" ↪ root_cause_detail={root_cause.get('primary', {}).get('detail')}")
PY
}

View File

@ -10,7 +10,7 @@ bench = false
[package]
name = "lesavka_server"
version = "0.17.26"
version = "0.17.27"
edition = "2024"
autobins = false

386
server/src/blind_healer.rs Normal file
View File

@ -0,0 +1,386 @@
use std::sync::Arc;
use std::time::Duration;
use crate::calibration::CalibrationStore;
use crate::upstream_media_runtime::{UpstreamMediaRuntime, UpstreamPlannerSnapshot};
/// Which media stream the blind healer nudges when correcting sink-handoff skew.
#[derive(Clone, Copy, Debug, Eq, PartialEq)]
enum BlindHealTarget {
    /// Shift the video playout offset (default; applied opposite the skew sign).
    Video,
    /// Shift the audio playout offset.
    Audio,
}
/// Tunable parameters for the blind healer; populated from environment
/// variables by `from_env` (see that method for the variable names/defaults).
#[derive(Clone, Copy, Debug)]
struct BlindHealConfig {
    /// Master switch (`LESAVKA_UPSTREAM_BLIND_HEAL`, default on).
    enabled: bool,
    /// Stream that receives the corrective offset nudge.
    target: BlindHealTarget,
    /// Minimum rolling sample count required on both evidence windows.
    min_samples: u64,
    /// Ignore sink-handoff skew magnitudes below this value (ms).
    deadband_ms: f64,
    /// Refuse to heal when sink-handoff |skew| p95 exceeds this limit (ms).
    max_handoff_abs_p95_ms: f64,
    /// Refuse when client-send |skew| p95 exceeds this limit (ms).
    max_client_send_abs_p95_ms: f64,
    /// Refuse when server-receive |skew| p95 exceeds this limit (ms).
    max_server_receive_abs_p95_ms: f64,
    /// Refuse when either client queue-age p95 exceeds this limit (ms).
    max_queue_age_p95_ms: f64,
    /// Refuse when either sink-late p95 exceeds this limit (ms).
    max_sink_late_p95_ms: f64,
    /// Fraction of the observed skew converted into one correction step.
    gain: f64,
    /// Hard cap on the magnitude of a single correction step (µs).
    max_step_us: i64,
    /// Delay between evaluation passes of the healer loop.
    interval: Duration,
    /// Minimum time between two applied nudges within one session.
    cooldown: Duration,
}
/// A concrete bounded nudge the healer decided to apply, plus the observations
/// that motivated it (recorded with the calibration update and in logs).
#[derive(Clone, Debug, PartialEq)]
struct BlindHealAction {
    /// Delta added to the audio playout offset (µs); 0 when video is targeted.
    audio_delta_us: i64,
    /// Delta added to the video playout offset (µs); 0 when audio is targeted.
    video_delta_us: i64,
    /// Sink-handoff skew that triggered the nudge (ms).
    observed_sink_handoff_skew_ms: f64,
    /// Client-send |skew| p95 at decision time (ms); 0.0 when unavailable.
    observed_client_send_abs_p95_ms: f64,
    /// Human-readable explanation stored in the calibration detail field.
    note: String,
}
/// Spawn the background task that periodically evaluates planner telemetry and
/// applies bounded transient calibration nudges ("blind healing") when the
/// rolling metrics are stable enough. Disabled entirely when
/// `LESAVKA_UPSTREAM_BLIND_HEAL` is turned off.
#[cfg(not(coverage))]
pub fn spawn_blind_healer(runtime: Arc<UpstreamMediaRuntime>, calibration: Arc<CalibrationStore>) {
    let config = BlindHealConfig::from_env();
    if !config.enabled {
        tracing::info!("upstream blind healer disabled");
        return;
    }
    tokio::spawn(async move {
        let mut last_adjusted_session_id = 0;
        let mut last_adjusted_at: Option<tokio::time::Instant> = None;
        loop {
            tokio::time::sleep(config.interval).await;
            let snapshot = runtime.snapshot();
            // A new session resets the cooldown so healing can start fresh.
            if snapshot.session_id != last_adjusted_session_id {
                last_adjusted_session_id = snapshot.session_id;
                last_adjusted_at = None;
            }
            // Enforce the minimum spacing between applied nudges.
            if last_adjusted_at.is_some_and(|last| last.elapsed() < config.cooldown) {
                continue;
            }
            let action = match evaluate_blind_heal_snapshot(&snapshot, config) {
                BlindHealDecision::Apply(action) => action,
                BlindHealDecision::Wait(reason) => {
                    tracing::trace!(reason, "upstream blind healer waiting");
                    continue;
                }
            };
            // Apply a transient (non-persisted) offset adjustment.
            let state = calibration.apply_transient_blind_estimate(
                action.audio_delta_us,
                action.video_delta_us,
                action.observed_sink_handoff_skew_ms as f32,
                action.observed_client_send_abs_p95_ms as f32,
                action.note.clone(),
            );
            last_adjusted_at = Some(tokio::time::Instant::now());
            tracing::info!(
                session_id = snapshot.session_id,
                audio_delta_us = action.audio_delta_us,
                video_delta_us = action.video_delta_us,
                sink_handoff_skew_ms = action.observed_sink_handoff_skew_ms,
                client_send_abs_p95_ms = action.observed_client_send_abs_p95_ms,
                active_audio_offset_us = state.active_audio_offset_us,
                active_video_offset_us = state.active_video_offset_us,
                "upstream blind healer applied transient calibration nudge"
            );
        }
    });
}
/// Outcome of one evaluation pass over a planner snapshot.
#[derive(Clone, Debug, PartialEq)]
enum BlindHealDecision {
    /// Conditions are stable and a non-zero bounded correction should be applied.
    Apply(BlindHealAction),
    /// Skip this cycle; the static string names the gating condition (logged).
    Wait(&'static str),
}
fn evaluate_blind_heal_snapshot(
snapshot: &UpstreamPlannerSnapshot,
config: BlindHealConfig,
) -> BlindHealDecision {
if !config.enabled {
return BlindHealDecision::Wait("disabled");
}
if !matches!(snapshot.phase, "live" | "healing") {
return BlindHealDecision::Wait("not-live");
}
if snapshot.client_timing_window_samples < config.min_samples {
return BlindHealDecision::Wait("not-enough-client-samples");
}
if snapshot.sink_handoff_window_samples < config.min_samples {
return BlindHealDecision::Wait("not-enough-sink-samples");
}
let Some(skew_ms) = snapshot.sink_handoff_skew_ms else {
return BlindHealDecision::Wait("missing-sink-skew");
};
if skew_ms.abs() < config.deadband_ms {
return BlindHealDecision::Wait("inside-deadband");
}
if exceeds(
snapshot.sink_handoff_abs_skew_p95_ms,
config.max_handoff_abs_p95_ms,
) {
return BlindHealDecision::Wait("sink-handoff-p95-unstable");
}
if exceeds(
snapshot.client_send_abs_skew_p95_ms,
config.max_client_send_abs_p95_ms,
) {
return BlindHealDecision::Wait("client-send-p95-unstable");
}
if exceeds(
snapshot.server_receive_abs_skew_p95_ms,
config.max_server_receive_abs_p95_ms,
) {
return BlindHealDecision::Wait("server-receive-p95-unstable");
}
if exceeds(
snapshot.camera_client_queue_age_p95_ms,
config.max_queue_age_p95_ms,
) || exceeds(
snapshot.microphone_client_queue_age_p95_ms,
config.max_queue_age_p95_ms,
) {
return BlindHealDecision::Wait("client-queue-p95-unstable");
}
if exceeds(
snapshot.camera_sink_late_p95_ms,
config.max_sink_late_p95_ms,
) || exceeds(
snapshot.microphone_sink_late_p95_ms,
config.max_sink_late_p95_ms,
) {
return BlindHealDecision::Wait("sink-late-p95-unstable");
}
let correction_us = clamp_step(skew_ms * config.gain * 1000.0, config.max_step_us);
if correction_us == 0 {
return BlindHealDecision::Wait("rounded-zero");
}
let (audio_delta_us, video_delta_us) = match config.target {
BlindHealTarget::Audio => (correction_us, 0),
BlindHealTarget::Video => (0, -correction_us),
};
BlindHealDecision::Apply(BlindHealAction {
audio_delta_us,
video_delta_us,
observed_sink_handoff_skew_ms: skew_ms,
observed_client_send_abs_p95_ms: snapshot.client_send_abs_skew_p95_ms.unwrap_or(0.0),
note: format!(
"runtime blind healer: sink handoff skew {skew_ms:+.1}ms, target={:?}, applying audio {:+.1}ms/video {:+.1}ms",
config.target,
audio_delta_us as f64 / 1000.0,
video_delta_us as f64 / 1000.0
),
})
}
/// Conservative limit check: missing or non-finite observations count as
/// exceeding, so absent evidence always blocks healing.
fn exceeds(value: Option<f64>, limit: f64) -> bool {
    match value {
        Some(observed) if observed.is_finite() => observed.abs() > limit,
        _ => true,
    }
}
/// Round a correction (µs, as f64) to an integer step bounded by
/// `±max_step_us`; the bound is forced to at least 1 µs either way.
fn clamp_step(value: f64, max_step_us: i64) -> i64 {
    let bound = max_step_us.abs().max(1);
    let rounded = value.round() as i64;
    rounded.clamp(-bound, bound)
}
impl BlindHealConfig {
    /// Build the healer configuration from `LESAVKA_UPSTREAM_BLIND_HEAL_*`
    /// environment variables, falling back to conservative defaults.
    #[cfg(not(coverage))]
    fn from_env() -> Self {
        // Unrecognized target strings fall back to the video default.
        let target_raw = std::env::var("LESAVKA_UPSTREAM_BLIND_HEAL_TARGET")
            .unwrap_or_else(|_| "video".to_string());
        let target = match target_raw.trim().to_ascii_lowercase().as_str() {
            "audio" | "mic" | "microphone" => BlindHealTarget::Audio,
            _ => BlindHealTarget::Video,
        };
        let interval_ms = env_u64("LESAVKA_UPSTREAM_BLIND_HEAL_INTERVAL_MS", 2_000);
        let cooldown_ms = env_u64("LESAVKA_UPSTREAM_BLIND_HEAL_COOLDOWN_MS", 8_000);
        Self {
            enabled: env_bool("LESAVKA_UPSTREAM_BLIND_HEAL", true),
            target,
            min_samples: env_u64("LESAVKA_UPSTREAM_BLIND_HEAL_MIN_SAMPLES", 30),
            deadband_ms: env_f64("LESAVKA_UPSTREAM_BLIND_HEAL_DEADBAND_MS", 35.0),
            max_handoff_abs_p95_ms: env_f64("LESAVKA_UPSTREAM_BLIND_HEAL_MAX_HANDOFF_P95_MS", 250.0),
            max_client_send_abs_p95_ms: env_f64("LESAVKA_UPSTREAM_BLIND_HEAL_MAX_CLIENT_SEND_P95_MS", 250.0),
            max_server_receive_abs_p95_ms: env_f64("LESAVKA_UPSTREAM_BLIND_HEAL_MAX_SERVER_RECEIVE_P95_MS", 250.0),
            max_queue_age_p95_ms: env_f64("LESAVKA_UPSTREAM_BLIND_HEAL_MAX_QUEUE_AGE_P95_MS", 150.0),
            max_sink_late_p95_ms: env_f64("LESAVKA_UPSTREAM_BLIND_HEAL_MAX_SINK_LATE_P95_MS", 120.0),
            // Gain is clamped so the step is always a sane fraction of skew.
            gain: env_f64("LESAVKA_UPSTREAM_BLIND_HEAL_GAIN", 0.25).clamp(0.01, 1.0),
            max_step_us: env_i64("LESAVKA_UPSTREAM_BLIND_HEAL_MAX_STEP_US", 25_000)
                .abs()
                .max(1),
            interval: Duration::from_millis(interval_ms),
            cooldown: Duration::from_millis(cooldown_ms),
        }
    }
}
/// Read a boolean env var: unset → `default`; "0"/"false"/"no"/"off"
/// (case-insensitive, trimmed) → false; any other value (including empty) → true.
#[cfg(not(coverage))]
fn env_bool(name: &str, default: bool) -> bool {
    const FALSY: [&str; 4] = ["0", "false", "no", "off"];
    match std::env::var(name) {
        Ok(raw) => {
            let trimmed = raw.trim();
            !FALSY.iter().any(|falsy| trimmed.eq_ignore_ascii_case(falsy))
        }
        Err(_) => default,
    }
}
/// Read a positive u64 env var; unset, unparsable, or zero falls back to `default`.
#[cfg(not(coverage))]
fn env_u64(name: &str, default: u64) -> u64 {
    let parsed = std::env::var(name)
        .ok()
        .and_then(|raw| raw.trim().parse::<u64>().ok());
    match parsed {
        Some(value) if value > 0 => value,
        _ => default,
    }
}
/// Read a non-zero i64 env var; unset, unparsable, or zero falls back to `default`.
#[cfg(not(coverage))]
fn env_i64(name: &str, default: i64) -> i64 {
    let parsed = std::env::var(name)
        .ok()
        .and_then(|raw| raw.trim().parse::<i64>().ok());
    match parsed {
        Some(value) if value != 0 => value,
        _ => default,
    }
}
/// Read a strictly positive, finite f64 env var; anything else falls back to `default`.
#[cfg(not(coverage))]
fn env_f64(name: &str, default: f64) -> f64 {
    let parsed = std::env::var(name)
        .ok()
        .and_then(|raw| raw.trim().parse::<f64>().ok());
    match parsed {
        Some(value) if value.is_finite() && value > 0.0 => value,
        _ => default,
    }
}
#[cfg(test)]
mod tests {
    use super::*;

    /// Baseline config mirroring the documented environment-variable defaults.
    fn config() -> BlindHealConfig {
        BlindHealConfig {
            enabled: true,
            target: BlindHealTarget::Video,
            min_samples: 30,
            deadband_ms: 35.0,
            max_handoff_abs_p95_ms: 250.0,
            max_client_send_abs_p95_ms: 250.0,
            max_server_receive_abs_p95_ms: 250.0,
            max_queue_age_p95_ms: 150.0,
            max_sink_late_p95_ms: 120.0,
            gain: 0.25,
            max_step_us: 25_000,
            interval: Duration::from_millis(2_000),
            cooldown: Duration::from_millis(8_000),
        }
    }

    /// A live, well-sampled snapshot with a +100 ms sink-handoff skew that is
    /// otherwise inside every stability gate, so healing should trigger.
    fn snapshot() -> UpstreamPlannerSnapshot {
        UpstreamPlannerSnapshot {
            session_id: 7,
            phase: "live",
            latest_camera_remote_pts_us: Some(1),
            latest_microphone_remote_pts_us: Some(1),
            last_video_presented_pts_us: Some(1),
            last_audio_presented_pts_us: Some(1),
            live_lag_ms: Some(100.0),
            planner_skew_ms: Some(0.0),
            stale_audio_drops: 0,
            stale_video_drops: 0,
            skew_video_drops: 0,
            freshness_reanchors: 0,
            startup_timeouts: 0,
            video_freezes: 0,
            last_reason: "live".to_string(),
            client_capture_skew_ms: Some(0.0),
            client_send_skew_ms: Some(0.0),
            server_receive_skew_ms: Some(0.0),
            camera_client_queue_age_ms: Some(5.0),
            microphone_client_queue_age_ms: Some(5.0),
            camera_server_receive_age_ms: Some(5.0),
            microphone_server_receive_age_ms: Some(5.0),
            client_capture_abs_skew_p95_ms: Some(20.0),
            client_send_abs_skew_p95_ms: Some(20.0),
            server_receive_abs_skew_p95_ms: Some(20.0),
            camera_client_queue_age_p95_ms: Some(20.0),
            microphone_client_queue_age_p95_ms: Some(20.0),
            sink_handoff_skew_ms: Some(100.0),
            sink_handoff_abs_skew_p95_ms: Some(110.0),
            camera_sink_late_ms: Some(0.0),
            microphone_sink_late_ms: Some(0.0),
            camera_sink_late_p95_ms: Some(10.0),
            microphone_sink_late_p95_ms: Some(10.0),
            client_timing_window_samples: 60,
            sink_handoff_window_samples: 60,
        }
    }

    /// +100 ms skew * 0.25 gain = 25 ms, capped at max_step_us, applied as a
    /// negative video delta (video moves opposite the skew sign).
    #[test]
    fn blind_healer_nudges_video_opposite_sink_handoff_skew() {
        let decision = evaluate_blind_heal_snapshot(&snapshot(), config());
        assert_eq!(
            decision,
            BlindHealDecision::Apply(BlindHealAction {
                audio_delta_us: 0,
                video_delta_us: -25_000,
                observed_sink_handoff_skew_ms: 100.0,
                observed_client_send_abs_p95_ms: 20.0,
                note: "runtime blind healer: sink handoff skew +100.0ms, target=Video, applying audio +0.0ms/video -25.0ms".to_string(),
            })
        );
    }

    /// Undersampled windows and unstable p95 metrics must each refuse with
    /// their specific wait reason.
    #[test]
    fn blind_healer_refuses_when_under_sampled_or_unstable() {
        let mut low_samples = snapshot();
        low_samples.sink_handoff_window_samples = 1;
        assert_eq!(
            evaluate_blind_heal_snapshot(&low_samples, config()),
            BlindHealDecision::Wait("not-enough-sink-samples")
        );
        let mut noisy_network = snapshot();
        noisy_network.server_receive_abs_skew_p95_ms = Some(400.0);
        assert_eq!(
            evaluate_blind_heal_snapshot(&noisy_network, config()),
            BlindHealDecision::Wait("server-receive-p95-unstable")
        );
    }

    /// With the audio target, the same positive skew shifts audio forward
    /// instead of pulling video back.
    #[test]
    fn blind_healer_can_target_audio_when_requested() {
        let mut config = config();
        config.target = BlindHealTarget::Audio;
        let decision = evaluate_blind_heal_snapshot(&snapshot(), config);
        assert!(matches!(
            decision,
            BlindHealDecision::Apply(BlindHealAction {
                audio_delta_us: 25_000,
                video_delta_us: 0,
                ..
            })
        ));
    }
}

View File

@ -150,6 +150,36 @@ impl CalibrationStore {
persist_snapshot(&self.path, &state)?;
Ok(state.to_proto())
}
/// Apply a bounded, transient playout-offset nudge estimated from relay
/// telemetry. Updates the in-memory state and pushes the new offsets to the
/// runtime, but does not persist them as site defaults.
pub fn apply_transient_blind_estimate(
    &self,
    audio_delta_us: i64,
    video_delta_us: i64,
    observed_delivery_skew_ms: f32,
    observed_enqueue_skew_ms: f32,
    note: impl Into<String>,
) -> ProtoCalibrationState {
    let caller_note = note.into();
    let mut guard = self.state.lock().expect("calibration mutex poisoned");
    // Shift both offsets, saturating and clamping into the valid range.
    guard.active_audio_offset_us =
        clamp_offset(guard.active_audio_offset_us.saturating_add(audio_delta_us));
    guard.active_video_offset_us =
        clamp_offset(guard.active_video_offset_us.saturating_add(video_delta_us));
    // Mark the estimate's provenance so readers know it is runtime-derived.
    guard.source = "blind".to_string();
    guard.confidence = "runtime-estimated".to_string();
    guard.detail = if caller_note.trim().is_empty() {
        format!(
            "transient blind estimate from relay telemetry: delivery skew {:.1}ms, enqueue skew {:.1}ms",
            observed_delivery_skew_ms, observed_enqueue_skew_ms
        )
    } else {
        caller_note
    };
    touch(&mut guard);
    // Hand the new offsets to the media runtime immediately.
    self.runtime
        .set_playout_offsets(guard.active_video_offset_us, guard.active_audio_offset_us);
    guard.to_proto()
}
}
impl CalibrationSnapshot {
@ -736,4 +766,36 @@ mod tests {
},
);
}
/// The transient nudge must reach the runtime immediately while leaving the
/// persisted calibration file untouched (blind estimates are not site defaults).
#[test]
fn transient_blind_estimate_updates_runtime_without_persisting_active_file_state() {
    let dir = tempfile::tempdir().expect("calibration dir");
    let path = dir.path().join("calibration.toml");
    let path_string = path.to_string_lossy().to_string();
    temp_env::with_var(
        "LESAVKA_CALIBRATION_PATH",
        Some(path_string.as_str()),
        || {
            let runtime = Arc::new(UpstreamMediaRuntime::new());
            let store = CalibrationStore::load(runtime.clone());
            // Snapshot whatever is on disk (possibly nothing) before the nudge.
            let before_raw = std::fs::read_to_string(&path).ok();
            let state = store.apply_transient_blind_estimate(
                0,
                -12_000,
                48.0,
                7.0,
                "runtime blind healer nudge",
            );
            assert_eq!(state.source, "blind");
            assert_eq!(state.confidence, "runtime-estimated");
            // Runtime sees the shifted video offset right away.
            assert_eq!(
                runtime.playout_offsets(),
                (FACTORY_MJPEG_VIDEO_OFFSET_US - 12_000, 0)
            );
            // The on-disk calibration state is byte-identical to before.
            assert_eq!(std::fs::read_to_string(&path).ok(), before_raw);
        },
    );
}
}

View File

@ -5,6 +5,7 @@ pub const REVISION: &str = env!("LESAVKA_GIT_SHA");
pub const BUILD_ID: &str = REVISION;
pub mod audio;
pub mod blind_healer;
pub mod calibration;
pub mod camera;
pub mod camera_runtime;

View File

@ -47,6 +47,11 @@ impl Handler {
let upstream_media_rt = Arc::new(UpstreamMediaRuntime::new());
let calibration = Arc::new(CalibrationStore::load(upstream_media_rt.clone()));
#[cfg(not(coverage))]
lesavka_server::blind_healer::spawn_blind_healer(
upstream_media_rt.clone(),
calibration.clone(),
);
Ok(Self {
kb: Arc::new(Mutex::new(kb)),

View File

@ -227,6 +227,8 @@ impl Handler {
microphone_sink_late_p95_ms: snapshot
.microphone_sink_late_p95_ms
.map(|value| value as f32),
client_timing_window_samples: snapshot.client_timing_window_samples,
sink_handoff_window_samples: snapshot.sink_handoff_window_samples,
}))
}
}

View File

@ -256,6 +256,8 @@ impl UpstreamMediaRuntime {
.map(presentation_late_ms),
camera_sink_late_p95_ms: state.camera_sink_late_window_ms.p95(),
microphone_sink_late_p95_ms: state.microphone_sink_late_window_ms.p95(),
client_timing_window_samples: state.client_capture_skew_window_ms.len() as u64,
sink_handoff_window_samples: state.sink_handoff_skew_window_ms.len() as u64,
}
}
}

View File

@ -38,6 +38,10 @@ impl UpstreamScalarWindow {
/// 95th-percentile of the rolling window's values.
// NOTE(review): presumably returns None for an empty window — confirm
// against `percentile` below.
pub fn p95(&self) -> Option<f64> {
    percentile(self.values.iter().copied(), 0.95)
}
/// Number of samples currently held in the rolling window (reported in
/// planner snapshots as blind-healing evidence counts).
#[must_use]
pub fn len(&self) -> usize {
    self.values.len()
}
}
fn percentile(values: impl Iterator<Item = f64>, quantile: f64) -> Option<f64> {

View File

@ -100,4 +100,6 @@ pub struct UpstreamPlannerSnapshot {
pub microphone_sink_late_ms: Option<f64>,
pub camera_sink_late_p95_ms: Option<f64>,
pub microphone_sink_late_p95_ms: Option<f64>,
pub client_timing_window_samples: u64,
pub sink_handoff_window_samples: u64,
}