#![forbid(unsafe_code)]
use std::collections::VecDeque;
use std::sync::atomic::{AtomicI64, AtomicU64, Ordering};
use std::sync::{Arc, Mutex};
use std::time::Duration;

use tokio::sync::{OwnedSemaphorePermit, Semaphore};
use tokio::time::Instant;

use crate::calibration::{
    FACTORY_MJPEG_AUDIO_MODE_OFFSETS_US, FACTORY_MJPEG_AUDIO_OFFSET_US,
    FACTORY_MJPEG_VIDEO_MODE_OFFSETS_US, FACTORY_MJPEG_VIDEO_OFFSET_US,
};
/// Maximum number of samples retained in each rolling metrics window;
/// `ScalarWindow::push` evicts the oldest sample once this is reached.
const TIMING_WINDOW_CAPACITY: usize = 240;
/// Which upstream capture stream a packet or lease belongs to.
#[derive(Clone, Copy, Debug, Eq, PartialEq)]
pub enum UpstreamMediaKind {
    /// Video from the client camera.
    Camera,
    /// Audio from the client microphone.
    Microphone,
}
/// Timing metadata the client reports alongside an upstream packet.
#[derive(Clone, Copy, Debug, Default, Eq, PartialEq)]
pub struct UpstreamClientTiming {
    /// Capture timestamp in microseconds (client clock).
    pub capture_pts_us: u64,
    /// Send timestamp in microseconds (client clock).
    pub send_pts_us: u64,
    /// Client-side queue depth — NOTE(review): units/meaning set by the
    /// client; confirm against the sender implementation.
    pub queue_depth: u32,
    /// Client-side queue age in milliseconds — NOTE(review): presumably the
    /// age of this packet in the client queue; confirm against the sender.
    pub queue_age_ms: u32,
}
/// Identifies one granted upstream stream slot: the session it belongs to
/// plus the generation currently authorized to feed it.
#[derive(Clone, Copy, Debug, Eq, PartialEq)]
pub struct UpstreamStreamLease {
    /// Session this lease is bound to.
    pub session_id: u64,
    /// Stream generation counter; stale generations can be rejected.
    pub generation: u64,
}
/// Scheduling result for a packet the planner decided to play.
#[derive(Clone, Copy, Debug)]
pub struct PlannedUpstreamPacket {
    /// Packet PTS on the local playout timeline, in microseconds.
    pub local_pts_us: u64,
    /// Deadline at which the packet should be handed to its sink.
    pub due_at: Instant,
    /// How far past `due_at` the packet already was when planned.
    pub late_by: Duration,
    /// Lag attributed to the source — NOTE(review): computed in the included
    /// planning methods; confirm exact semantics there.
    pub source_lag: Duration,
}
/// Outcome of planning one upstream packet.
///
/// NOTE(review): the decision logic lives in the included planning methods;
/// variant descriptions below are inferred from the names.
#[derive(Clone, Copy, Debug)]
pub enum UpstreamPlanDecision {
    /// Hold: the opposite stream has not produced a packet to pair with yet.
    AwaitingPair,
    /// Drop: packet precedes the paired streams' overlap window.
    DropBeforeOverlap,
    /// Drop: packet is stale; payload is a static reason for diagnostics.
    DropStale(&'static str),
    /// Startup failed; payload is a static reason for diagnostics.
    StartupFailed(&'static str),
    /// Schedule the packet as described.
    Play(PlannedUpstreamPacket),
}
/// Read-only diagnostics snapshot of the upstream planner.
///
/// All `*_ms` values are milliseconds and `*_us` values are microseconds;
/// `Option` fields are `None` until the underlying data has been observed.
/// NOTE(review): assembled in `planner_snapshot_methods.rs`; field meanings
/// below are matched to the helpers visible in this file where possible.
#[derive(Clone, Debug)]
pub struct UpstreamPlannerSnapshot {
    pub session_id: u64,
    /// Phase label from `UpstreamSyncPhase::as_str`.
    pub phase: &'static str,
    // Latest remote PTS observed per stream.
    pub latest_camera_remote_pts_us: Option<u64>,
    pub latest_microphone_remote_pts_us: Option<u64>,
    // Last PTS presented to each sink.
    pub last_video_presented_pts_us: Option<u64>,
    pub last_audio_presented_pts_us: Option<u64>,
    /// Remote-PTS progress since the session anchor (see `live_lag_ms`).
    pub live_lag_ms: Option<f64>,
    /// Audio-minus-video presented-PTS skew (see `planner_skew_ms`).
    pub planner_skew_ms: Option<f64>,
    // Drop / fault counters accumulated over the session.
    pub stale_audio_drops: u64,
    pub stale_video_drops: u64,
    pub skew_video_drops: u64,
    pub freshness_reanchors: u64,
    pub startup_timeouts: u64,
    pub video_freezes: u64,
    /// Human-readable explanation of the most recent planner action.
    pub last_reason: String,
    // Microphone-minus-camera skews from the latest paired timing reports
    // (see `record_timing_pair`).
    pub client_capture_skew_ms: Option<f64>,
    pub client_send_skew_ms: Option<f64>,
    pub server_receive_skew_ms: Option<f64>,
    // Client-reported queue ages per stream.
    pub camera_client_queue_age_ms: Option<f64>,
    pub microphone_client_queue_age_ms: Option<f64>,
    // Age of the newest server-received packet per stream.
    pub camera_server_receive_age_ms: Option<f64>,
    pub microphone_server_receive_age_ms: Option<f64>,
    // p95 statistics over the rolling windows (absolute values for skews).
    pub client_capture_abs_skew_p95_ms: Option<f64>,
    pub client_send_abs_skew_p95_ms: Option<f64>,
    pub server_receive_abs_skew_p95_ms: Option<f64>,
    pub camera_client_queue_age_p95_ms: Option<f64>,
    pub microphone_client_queue_age_p95_ms: Option<f64>,
    // Sink-handoff metrics: mic-minus-camera skew and per-stream lateness
    // (see `record_presentation` / `latest_sink_handoff_skew_ms`).
    pub sink_handoff_skew_ms: Option<f64>,
    pub sink_handoff_abs_skew_p95_ms: Option<f64>,
    pub camera_sink_late_ms: Option<f64>,
    pub microphone_sink_late_ms: Option<f64>,
    pub camera_sink_late_p95_ms: Option<f64>,
    pub microphone_sink_late_p95_ms: Option<f64>,
    // Sample counts backing the windows above.
    pub client_timing_window_samples: u64,
    pub sink_handoff_window_samples: u64,
}
/// One client timing report paired with the server receive time of the
/// packet it arrived on.
#[derive(Clone, Copy, Debug)]
struct TimingSample {
    /// Client capture timestamp (µs, client clock).
    capture_pts_us: u64,
    /// Client send timestamp (µs, client clock).
    send_pts_us: u64,
    /// Client-reported queue age in milliseconds.
    queue_age_ms: u32,
    /// When the server received the packet.
    received_at: Instant,
}
/// Records when a packet was due at its sink versus when it was actually
/// handed off (`handed_at` is captured via `Instant::now()` at handoff).
#[derive(Clone, Copy, Debug)]
struct PresentationSample {
    /// Scheduled playout deadline.
    due_at: Instant,
    /// Actual handoff moment.
    handed_at: Instant,
}
#[derive(Debug, Default)]
|
|
struct ScalarWindow {
|
|
values: VecDeque<f64>,
|
|
}
|
|
|
|
impl ScalarWindow {
|
|
/// Keeps `push` explicit because it sits on server upstream media scheduling, where timing choices directly affect lip sync.
|
|
/// Inputs are the typed parameters; output is the return value or side effect.
|
|
fn push(&mut self, value: f64) {
|
|
if self.values.len() >= TIMING_WINDOW_CAPACITY {
|
|
self.values.pop_front();
|
|
}
|
|
self.values.push_back(value);
|
|
}
|
|
|
|
fn p95(&self) -> Option<f64> {
|
|
percentile(self.values.iter().copied(), 0.95)
|
|
}
|
|
|
|
fn p95_abs(&self) -> Option<f64> {
|
|
percentile(self.values.iter().map(|value| value.abs()), 0.95)
|
|
}
|
|
|
|
fn len(&self) -> usize {
|
|
self.values.len()
|
|
}
|
|
}
|
|
|
|
/// Planner lifecycle phase; a fresh state starts in `Acquiring`.
#[derive(Clone, Copy, Debug, Default, Eq, PartialEq)]
enum UpstreamSyncPhase {
    #[default]
    Acquiring,
    Syncing,
    Live,
    Healing,
}

impl UpstreamSyncPhase {
    /// Stable lowercase label for this phase, surfaced in snapshots.
    fn as_str(self) -> &'static str {
        match self {
            UpstreamSyncPhase::Acquiring => "acquiring",
            UpstreamSyncPhase::Syncing => "syncing",
            UpstreamSyncPhase::Live => "live",
            UpstreamSyncPhase::Healing => "healing",
        }
    }
}
/// Mutable planner state guarded by `UpstreamMediaRuntime::state`.
///
/// NOTE(review): most transitions happen in the `include!`d method files;
/// comments here describe only what this file's helpers demonstrate.
#[derive(Debug, Default)]
struct RuntimeState {
    // Session identity and the stream generations currently attached.
    session_id: u64,
    active_camera_generation: Option<u64>,
    active_microphone_generation: Option<u64>,
    /// Current lifecycle phase (starts at `Acquiring` via `Default`).
    phase: UpstreamSyncPhase,
    session_started_at: Option<Instant>,
    // Playout anchor: remote PTS base and, presumably, the local instant it
    // maps to — set in the included planning methods.
    base_remote_pts_us: Option<u64>,
    playout_epoch: Option<Instant>,
    // Latest remote PTS observed per stream (feeds `live_lag_ms`).
    latest_camera_remote_pts_us: Option<u64>,
    latest_microphone_remote_pts_us: Option<u64>,
    // Per-stream local and presented PTS trackers (feed `planner_skew_ms`).
    last_video_local_pts_us: Option<u64>,
    last_audio_local_pts_us: Option<u64>,
    last_video_presented_pts_us: Option<u64>,
    last_audio_presented_pts_us: Option<u64>,
    // Latest client timing report per stream (paired by `record_timing_pair`).
    latest_camera_timing: Option<TimingSample>,
    latest_microphone_timing: Option<TimingSample>,
    // Latest sink handoff record per stream (set by `record_presentation`).
    latest_camera_presentation: Option<PresentationSample>,
    latest_microphone_presentation: Option<PresentationSample>,
    // Cached mic-minus-camera skews from the last paired timing report.
    latest_paired_client_capture_skew_ms: Option<f64>,
    latest_paired_client_send_skew_ms: Option<f64>,
    latest_paired_server_receive_skew_ms: Option<f64>,
    // Rolling metric windows, `TIMING_WINDOW_CAPACITY` samples each.
    client_capture_skew_window_ms: ScalarWindow,
    client_send_skew_window_ms: ScalarWindow,
    server_receive_skew_window_ms: ScalarWindow,
    camera_client_queue_age_window_ms: ScalarWindow,
    microphone_client_queue_age_window_ms: ScalarWindow,
    sink_handoff_skew_window_ms: ScalarWindow,
    camera_sink_late_window_ms: ScalarWindow,
    microphone_sink_late_window_ms: ScalarWindow,
    // Fault/telemetry counters; zeroed by `reset_session_state`.
    stale_audio_drops: u64,
    stale_video_drops: u64,
    skew_video_drops: u64,
    freshness_reanchors: u64,
    startup_timeouts: u64,
    video_freezes: u64,
    /// Last planner decision reason, surfaced in snapshots.
    last_reason: String,
}
/// Shared, thread-safe runtime coordinating upstream camera/microphone
/// playout. Behavior methods are spliced in via the `include!` submodules;
/// this struct only declares the state they share.
#[derive(Debug)]
pub struct UpstreamMediaRuntime {
    /// Atomic counter handing out session IDs.
    next_session_id: AtomicU64,
    /// Atomic counters handing out per-stream lease generations.
    next_camera_generation: AtomicU64,
    next_microphone_generation: AtomicU64,
    /// Semaphore gating microphone sink access — NOTE(review): permit usage
    /// lives in the included method files; confirm the intended limit there.
    microphone_sink_gate: Arc<Semaphore>,
    /// Signed playout offsets in microseconds, one per stream kind
    /// (see `playout_offset_us` / `apply_offset`).
    camera_playout_offset_us: AtomicI64,
    microphone_playout_offset_us: AtomicI64,
    /// All mutable planner state, behind a blocking mutex.
    state: Mutex<RuntimeState>,
}
include!("upstream_media_runtime/stream_lifecycle_methods.rs");
|
|
include!("upstream_media_runtime/planner_snapshot_methods.rs");
|
|
include!("upstream_media_runtime/playout_planning_methods.rs");
|
|
|
|
impl Default for UpstreamMediaRuntime {
    /// Delegates to `Self::new()` (defined in an included submodule) so
    /// `Default` construction and explicit construction stay in sync.
    fn default() -> Self {
        Self::new()
    }
}
/// Clears per-session planner bookkeeping when a session (re)starts.
///
/// Resets playout anchors, per-stream PTS trackers, timing/presentation
/// samples, paired skew caches, and fault counters. Deliberately untouched:
/// `session_id`, the active generations, `phase`, `session_started_at`, the
/// rolling metric windows, and `last_reason` — NOTE(review): confirm the
/// windows and `last_reason` are meant to survive a session reset.
fn reset_session_state(state: &mut RuntimeState) {
    // Playout anchors.
    state.base_remote_pts_us = None;
    state.playout_epoch = None;
    // Latest observed remote PTS per stream.
    state.latest_camera_remote_pts_us = None;
    state.latest_microphone_remote_pts_us = None;
    // Local and presented PTS trackers.
    state.last_video_local_pts_us = None;
    state.last_audio_local_pts_us = None;
    state.last_video_presented_pts_us = None;
    state.last_audio_presented_pts_us = None;
    // Raw timing and presentation samples.
    state.latest_camera_timing = None;
    state.latest_microphone_timing = None;
    state.latest_camera_presentation = None;
    state.latest_microphone_presentation = None;
    // Cached paired skew readings.
    state.latest_paired_client_capture_skew_ms = None;
    state.latest_paired_client_send_skew_ms = None;
    state.latest_paired_server_receive_skew_ms = None;
    // Fault/telemetry counters.
    state.stale_audio_drops = 0;
    state.stale_video_drops = 0;
    state.skew_video_drops = 0;
    state.freshness_reanchors = 0;
    state.startup_timeouts = 0;
    state.video_freezes = 0;
}
/// Keeps `record_timing_pair` explicit because it sits on server upstream media scheduling, where timing choices directly affect lip sync.
|
|
/// Inputs are the typed parameters; output is the return value or side effect.
|
|
fn record_timing_pair(state: &mut RuntimeState) {
|
|
let (Some(camera), Some(microphone)) =
|
|
(state.latest_camera_timing, state.latest_microphone_timing)
|
|
else {
|
|
return;
|
|
};
|
|
let capture_skew_ms = delta_ms(microphone.capture_pts_us, camera.capture_pts_us);
|
|
let send_skew_ms = delta_ms(microphone.send_pts_us, camera.send_pts_us);
|
|
let receive_skew_ms = signed_duration_ms(microphone.received_at, camera.received_at);
|
|
state.latest_paired_client_capture_skew_ms = Some(capture_skew_ms);
|
|
state.latest_paired_client_send_skew_ms = Some(send_skew_ms);
|
|
state.latest_paired_server_receive_skew_ms = Some(receive_skew_ms);
|
|
state.client_capture_skew_window_ms.push(capture_skew_ms);
|
|
state.client_send_skew_window_ms.push(send_skew_ms);
|
|
state.server_receive_skew_window_ms.push(receive_skew_ms);
|
|
}
|
|
|
|
/// Keeps `record_presentation` explicit because it sits on server upstream media scheduling, where timing choices directly affect lip sync.
|
|
/// Inputs are the typed parameters; output is the return value or side effect.
|
|
fn record_presentation(state: &mut RuntimeState, kind: UpstreamMediaKind, due_at: Instant) {
|
|
let sample = PresentationSample {
|
|
due_at,
|
|
handed_at: Instant::now(),
|
|
};
|
|
match kind {
|
|
UpstreamMediaKind::Camera => {
|
|
state.latest_camera_presentation = Some(sample);
|
|
state
|
|
.camera_sink_late_window_ms
|
|
.push(presentation_late_ms(sample));
|
|
}
|
|
UpstreamMediaKind::Microphone => {
|
|
state.latest_microphone_presentation = Some(sample);
|
|
state
|
|
.microphone_sink_late_window_ms
|
|
.push(presentation_late_ms(sample));
|
|
}
|
|
}
|
|
if let Some(skew) = latest_sink_handoff_skew_ms(state) {
|
|
state.sink_handoff_skew_window_ms.push(skew);
|
|
}
|
|
}
|
|
|
|
fn live_lag_ms(state: &RuntimeState) -> Option<f64> {
|
|
let latest = state
|
|
.latest_camera_remote_pts_us
|
|
.into_iter()
|
|
.chain(state.latest_microphone_remote_pts_us)
|
|
.max()?;
|
|
let base = state.base_remote_pts_us.unwrap_or(latest);
|
|
Some(latest.saturating_sub(base) as f64 / 1000.0)
|
|
}
|
|
|
|
/// Keeps `planner_skew_ms` explicit because it sits on server upstream media scheduling, where timing choices directly affect lip sync.
|
|
/// Inputs are the typed parameters; output is the return value or side effect.
|
|
fn planner_skew_ms(state: &RuntimeState) -> Option<f64> {
|
|
match (
|
|
state.last_audio_presented_pts_us,
|
|
state.last_video_presented_pts_us,
|
|
) {
|
|
(Some(audio), Some(video)) => Some((audio as i128 - video as i128) as f64 / 1000.0),
|
|
_ => None,
|
|
}
|
|
}
|
|
|
|
fn latest_sink_handoff_skew_ms(state: &RuntimeState) -> Option<f64> {
|
|
let camera = state.latest_camera_presentation?;
|
|
let microphone = state.latest_microphone_presentation?;
|
|
Some(presentation_late_signed_ms(microphone) - presentation_late_signed_ms(camera))
|
|
}
|
|
|
|
fn presentation_late_ms(sample: PresentationSample) -> f64 {
|
|
presentation_late_signed_ms(sample).max(0.0)
|
|
}
|
|
|
|
fn presentation_late_signed_ms(sample: PresentationSample) -> f64 {
|
|
signed_duration_ms(sample.handed_at, sample.due_at)
|
|
}
|
|
|
|
/// Non-negative elapsed milliseconds from `then` to `now`; clamps to zero
/// when `then` lies in the future.
fn age_ms(now: Instant, then: Instant) -> f64 {
    let elapsed = now.saturating_duration_since(then);
    elapsed.as_secs_f64() * 1000.0
}
/// Signed difference `left - right` in milliseconds; negative when `left`
/// precedes `right`. Needed because plain `Instant` subtraction cannot
/// produce a negative duration.
fn signed_duration_ms(left: Instant, right: Instant) -> f64 {
    match left.checked_duration_since(right) {
        // `left` is at or after `right`: report the forward distance.
        Some(forward) => forward.as_secs_f64() * 1000.0,
        // `left` is earlier: report the backward distance, negated.
        None => -(right.duration_since(left).as_secs_f64() * 1000.0),
    }
}
/// `(left_us - right_us)` expressed in milliseconds, widened to `i128` so
/// the subtraction of two `u64` values can never overflow.
fn delta_ms(left_us: u64, right_us: u64) -> f64 {
    let diff_us = left_us as i128 - right_us as i128;
    diff_us as f64 / 1000.0
}
/// Nearest-rank percentile over the finite values of `values`.
///
/// Non-finite samples (NaN/±inf) are discarded first; returns `None` when
/// nothing finite remains. `quantile` is clamped to `[0.0, 1.0]` and the
/// selected rank is `ceil((n - 1) * quantile)`, so `1.0` picks the maximum.
fn percentile(values: impl Iterator<Item = f64>, quantile: f64) -> Option<f64> {
    let mut finite: Vec<f64> = values.filter(|v| v.is_finite()).collect();
    if finite.is_empty() {
        return None;
    }
    finite.sort_by(|a, b| a.total_cmp(b));
    let rank = ((finite.len() - 1) as f64 * quantile.clamp(0.0, 1.0)).ceil() as usize;
    finite.get(rank).copied()
}
/// Playout delay applied before upstream packets are released to sinks.
///
/// Read from `LESAVKA_UPSTREAM_PLAYOUT_DELAY_MS` (whole milliseconds,
/// trimmed); missing or unparsable values fall back to 80 ms.
fn upstream_playout_delay() -> Duration {
    let configured = std::env::var("LESAVKA_UPSTREAM_PLAYOUT_DELAY_MS")
        .ok()
        .and_then(|raw| raw.trim().parse::<u64>().ok());
    Duration::from_millis(configured.unwrap_or(80))
}
/// Keeps `playout_offset_us` explicit because it sits on server upstream media scheduling, where timing choices directly affect lip sync.
|
|
/// Inputs are the typed parameters; output is the return value or side effect.
|
|
fn playout_offset_us(kind: UpstreamMediaKind) -> i64 {
|
|
let (scalar_name, mode_map_name, factory_map, factory_offset_us) = match kind {
|
|
UpstreamMediaKind::Camera => (
|
|
"LESAVKA_UPSTREAM_VIDEO_PLAYOUT_OFFSET_US",
|
|
"LESAVKA_UPSTREAM_VIDEO_PLAYOUT_MODE_OFFSETS_US",
|
|
FACTORY_MJPEG_VIDEO_MODE_OFFSETS_US,
|
|
FACTORY_MJPEG_VIDEO_OFFSET_US,
|
|
),
|
|
UpstreamMediaKind::Microphone => (
|
|
"LESAVKA_UPSTREAM_AUDIO_PLAYOUT_OFFSET_US",
|
|
"LESAVKA_UPSTREAM_AUDIO_PLAYOUT_MODE_OFFSETS_US",
|
|
FACTORY_MJPEG_AUDIO_MODE_OFFSETS_US,
|
|
FACTORY_MJPEG_AUDIO_OFFSET_US,
|
|
),
|
|
};
|
|
let mode = current_uvc_mode();
|
|
mode.as_deref()
|
|
.and_then(|mode| env_mode_offset_us(mode_map_name, mode))
|
|
.or_else(|| env_i64(scalar_name))
|
|
.or_else(|| {
|
|
mode.as_deref()
|
|
.and_then(|mode| lookup_mode_offset_us(factory_map, mode))
|
|
})
|
|
.unwrap_or(factory_offset_us)
|
|
}
|
|
|
|
fn current_uvc_mode() -> Option<String> {
|
|
let width = env_u32("LESAVKA_UVC_WIDTH")?;
|
|
let height = env_u32("LESAVKA_UVC_HEIGHT")?;
|
|
let fps = env_u32("LESAVKA_UVC_FPS")
|
|
.or_else(|| {
|
|
env_u32("LESAVKA_UVC_INTERVAL")
|
|
.and_then(|interval| (interval > 0).then_some(10_000_000 / interval))
|
|
})?
|
|
.max(1);
|
|
Some(format!("{width}x{height}@{fps}"))
|
|
}
|
|
|
|
/// Looks up `mode` in the comma-separated `mode=offset` map stored in the
/// environment variable `name`; `None` when the variable is unset or the
/// mode is absent/unparsable.
fn env_mode_offset_us(name: &str, mode: &str) -> Option<i64> {
    let map = std::env::var(name).ok()?;
    lookup_mode_offset_us(&map, mode)
}

/// Scans a comma-separated `mode=offset_us` map for `mode` (keys and values
/// are trimmed). An entry whose key matches but whose value fails to parse
/// is skipped, so a later duplicate entry can still win.
fn lookup_mode_offset_us(map: &str, mode: &str) -> Option<i64> {
    for entry in map.split(',') {
        if let Some((key, value)) = entry.trim().split_once('=') {
            if key.trim() == mode {
                if let Ok(offset) = value.trim().parse::<i64>() {
                    return Some(offset);
                }
            }
        }
    }
    None
}
/// Parses environment variable `name` as an `i64` after trimming;
/// `None` when the variable is unset or does not parse.
fn env_i64(name: &str) -> Option<i64> {
    let raw = std::env::var(name).ok()?;
    raw.trim().parse().ok()
}

/// Parses environment variable `name` as a `u32` after trimming;
/// `None` when the variable is unset or does not parse.
fn env_u32(name: &str) -> Option<u32> {
    let raw = std::env::var(name).ok()?;
    raw.trim().parse().ok()
}
/// Shifts `instant` by `offset_us` microseconds in either direction.
///
/// Negative offsets saturate: if the subtraction would underflow the
/// `Instant` epoch, the original instant is returned unchanged.
fn apply_offset(instant: Instant, offset_us: i64) -> Instant {
    match u64::try_from(offset_us) {
        // Non-negative: shift forward.
        Ok(forward_us) => instant + Duration::from_micros(forward_us),
        // Negative: shift backward, clamping at the Instant epoch.
        Err(_) => {
            let backward = Duration::from_micros(offset_us.unsigned_abs());
            instant.checked_sub(backward).unwrap_or(instant)
        }
    }
}
#[cfg(test)]
|
|
#[path = "upstream_media_runtime/tests/mod.rs"]
|
|
mod tests;
|