lesavka/server/src/upstream_media_runtime.rs

444 lines
15 KiB
Rust
Raw Normal View History

#![forbid(unsafe_code)]
2026-05-03 12:22:33 -03:00
use std::collections::VecDeque;
2026-04-30 08:16:57 -03:00
use std::sync::atomic::{AtomicI64, AtomicU64, Ordering};
use std::sync::{Arc, Mutex};
use std::time::Duration;
2026-05-03 12:22:33 -03:00
use tokio::sync::{OwnedSemaphorePermit, Semaphore};
use tokio::time::Instant;
2026-05-03 12:22:33 -03:00
2026-05-06 03:59:20 -03:00
use crate::calibration::{
FACTORY_MJPEG_AUDIO_MODE_OFFSETS_US, FACTORY_MJPEG_AUDIO_OFFSET_US,
FACTORY_MJPEG_VIDEO_MODE_OFFSETS_US, FACTORY_MJPEG_VIDEO_OFFSET_US,
};
2026-05-03 12:22:33 -03:00
/// Capacity of each rolling metrics window; the oldest sample is evicted
/// once this many are held (see `ScalarWindow::push`).
const TIMING_WINDOW_CAPACITY: usize = 240;
/// Which upstream media stream a packet, offset, or metric refers to.
#[derive(Clone, Copy, Debug, Eq, PartialEq)]
pub enum UpstreamMediaKind {
    Camera,
    Microphone,
}
/// Client-reported timing attached to an upstream packet, used to measure
/// capture/send skew and client-side queue pressure.
#[derive(Clone, Copy, Debug, Default, Eq, PartialEq)]
pub struct UpstreamClientTiming {
    // Capture timestamp in microseconds (client-side timebase — confirm
    // against the sender).
    pub capture_pts_us: u64,
    // Send timestamp in microseconds, same timebase as `capture_pts_us`.
    pub send_pts_us: u64,
    // Packets queued on the client at send time.
    pub queue_depth: u32,
    // Age in milliseconds of the client-side queue contents.
    pub queue_age_ms: u32,
}
/// Identifies one granted stream slot: the session plus the per-kind
/// generation active when the lease was issued (lease issuance lives in the
/// included lifecycle methods).
#[derive(Clone, Copy, Debug, Eq, PartialEq)]
pub struct UpstreamStreamLease {
    pub session_id: u64,
    pub generation: u64,
}
/// A packet the planner has scheduled for playout.
#[derive(Clone, Copy, Debug)]
pub struct PlannedUpstreamPacket {
    // Local (session-relative) presentation timestamp in microseconds.
    pub local_pts_us: u64,
    // Instant at which the packet should be handed to its sink.
    pub due_at: Instant,
    // How far past `due_at` the plan already is.
    pub late_by: Duration,
    // Lag attributed to the source feed; exact semantics are defined in the
    // included planning methods — confirm there.
    pub source_lag: Duration,
}
/// Outcome of planning a single upstream packet. The `&'static str`
/// payloads name the reason for diagnostics.
#[derive(Clone, Copy, Debug)]
pub enum UpstreamPlanDecision {
    // Waiting for the counterpart stream before committing to a timeline.
    AwaitingPair,
    // Packet predates the overlap window of the two streams.
    DropBeforeOverlap,
    // Packet dropped as stale.
    DropStale(&'static str),
    // Startup gave up.
    StartupFailed(&'static str),
    // Play the packet as described.
    Play(PlannedUpstreamPacket),
}
/// Point-in-time diagnostic snapshot of the upstream planner, suitable for
/// metrics or debug export. `*_ms` fields are milliseconds; `*_p95_*`
/// fields are 95th percentiles over the rolling sample windows.
#[derive(Clone, Debug)]
pub struct UpstreamPlannerSnapshot {
    pub session_id: u64,
    // Planner phase label (see `UpstreamSyncPhase::as_str`).
    pub phase: &'static str,
    // Latest remote PTS observed and last PTS presented, per stream.
    pub latest_camera_remote_pts_us: Option<u64>,
    pub latest_microphone_remote_pts_us: Option<u64>,
    pub last_video_presented_pts_us: Option<u64>,
    pub last_audio_presented_pts_us: Option<u64>,
    // Remote-PTS progress since the session anchor (see `live_lag_ms`).
    pub live_lag_ms: Option<f64>,
    // Audio-minus-video presented-PTS skew (see `planner_skew_ms`).
    pub planner_skew_ms: Option<f64>,
    // Cumulative per-session event counters.
    pub stale_audio_drops: u64,
    pub stale_video_drops: u64,
    pub skew_video_drops: u64,
    pub freshness_reanchors: u64,
    pub startup_timeouts: u64,
    pub video_freezes: u64,
    // Free-form reason string for the planner's most recent notable action.
    pub last_reason: String,
    // Latest paired microphone-vs-camera skews and queue/receive ages.
    pub client_capture_skew_ms: Option<f64>,
    pub client_send_skew_ms: Option<f64>,
    pub server_receive_skew_ms: Option<f64>,
    pub camera_client_queue_age_ms: Option<f64>,
    pub microphone_client_queue_age_ms: Option<f64>,
    pub camera_server_receive_age_ms: Option<f64>,
    pub microphone_server_receive_age_ms: Option<f64>,
    // Windowed p95 statistics for the same quantities.
    pub client_capture_abs_skew_p95_ms: Option<f64>,
    pub client_send_abs_skew_p95_ms: Option<f64>,
    pub server_receive_abs_skew_p95_ms: Option<f64>,
    pub camera_client_queue_age_p95_ms: Option<f64>,
    pub microphone_client_queue_age_p95_ms: Option<f64>,
    // Sink handoff skew and lateness (fed by `record_presentation`).
    pub sink_handoff_skew_ms: Option<f64>,
    pub sink_handoff_abs_skew_p95_ms: Option<f64>,
    pub camera_sink_late_ms: Option<f64>,
    pub microphone_sink_late_ms: Option<f64>,
    pub camera_sink_late_p95_ms: Option<f64>,
    pub microphone_sink_late_p95_ms: Option<f64>,
    // Sample counts currently held in the corresponding windows.
    pub client_timing_window_samples: u64,
    pub sink_handoff_window_samples: u64,
}
/// Internal copy of a client timing report plus the server receive instant.
/// Note: the client's `queue_depth` is not retained here.
#[derive(Clone, Copy, Debug)]
struct TimingSample {
    capture_pts_us: u64,
    send_pts_us: u64,
    queue_age_ms: u32,
    // When the server received the packet carrying this report.
    received_at: Instant,
}
/// When a packet was due at its sink versus when it was actually handed over.
#[derive(Clone, Copy, Debug)]
struct PresentationSample {
    due_at: Instant,
    handed_at: Instant,
}
/// Fixed-capacity rolling window of `f64` samples backing the p95 metrics.
#[derive(Debug, Default)]
struct ScalarWindow {
    values: VecDeque<f64>,
}
impl ScalarWindow {
/// Keeps `push` explicit because it sits on server upstream media scheduling, where timing choices directly affect lip sync.
/// Inputs are the typed parameters; output is the return value or side effect.
2026-05-03 12:22:33 -03:00
fn push(&mut self, value: f64) {
if self.values.len() >= TIMING_WINDOW_CAPACITY {
self.values.pop_front();
}
self.values.push_back(value);
}
fn p95(&self) -> Option<f64> {
percentile(self.values.iter().copied(), 0.95)
}
fn p95_abs(&self) -> Option<f64> {
percentile(self.values.iter().map(|value| value.abs()), 0.95)
}
fn len(&self) -> usize {
self.values.len()
}
}
/// Coarse phase of the upstream sync planner, surfaced in snapshots via
/// [`UpstreamSyncPhase::as_str`]. Phase transitions happen in the planner
/// methods spliced in from the sibling files.
#[derive(Clone, Copy, Debug, Default, Eq, PartialEq)]
enum UpstreamSyncPhase {
    #[default]
    Acquiring,
    Syncing,
    Live,
    Healing,
}

impl UpstreamSyncPhase {
    /// Stable lowercase label for diagnostics and snapshot export.
    fn as_str(self) -> &'static str {
        match self {
            Self::Acquiring => "acquiring",
            Self::Syncing => "syncing",
            Self::Live => "live",
            Self::Healing => "healing",
        }
    }
}
/// Mutable planner bookkeeping, guarded by `UpstreamMediaRuntime::state`.
#[derive(Debug, Default)]
struct RuntimeState {
    session_id: u64,
    // Generations currently allowed to feed each stream; enforcement lives
    // in the included lifecycle methods.
    active_camera_generation: Option<u64>,
    active_microphone_generation: Option<u64>,
    phase: UpstreamSyncPhase,
    session_started_at: Option<Instant>,
    // Playout anchor: remote PTS base and the local instant it maps to.
    base_remote_pts_us: Option<u64>,
    playout_epoch: Option<Instant>,
    // Latest observed remote PTS and last planned/presented local PTS.
    latest_camera_remote_pts_us: Option<u64>,
    latest_microphone_remote_pts_us: Option<u64>,
    last_video_local_pts_us: Option<u64>,
    last_audio_local_pts_us: Option<u64>,
    last_video_presented_pts_us: Option<u64>,
    last_audio_presented_pts_us: Option<u64>,
    // Most recent client timing reports and sink handoff samples.
    latest_camera_timing: Option<TimingSample>,
    latest_microphone_timing: Option<TimingSample>,
    latest_camera_presentation: Option<PresentationSample>,
    latest_microphone_presentation: Option<PresentationSample>,
    // Latest paired skews, computed in `record_timing_pair`.
    latest_paired_client_capture_skew_ms: Option<f64>,
    latest_paired_client_send_skew_ms: Option<f64>,
    latest_paired_server_receive_skew_ms: Option<f64>,
    // Rolling windows backing the snapshot's p95 metrics.
    client_capture_skew_window_ms: ScalarWindow,
    client_send_skew_window_ms: ScalarWindow,
    server_receive_skew_window_ms: ScalarWindow,
    camera_client_queue_age_window_ms: ScalarWindow,
    microphone_client_queue_age_window_ms: ScalarWindow,
    sink_handoff_skew_window_ms: ScalarWindow,
    camera_sink_late_window_ms: ScalarWindow,
    microphone_sink_late_window_ms: ScalarWindow,
    // Per-session event counters (cleared by `reset_session_state`).
    stale_audio_drops: u64,
    stale_video_drops: u64,
    skew_video_drops: u64,
    freshness_reanchors: u64,
    startup_timeouts: u64,
    video_freezes: u64,
    last_reason: String,
}
#[derive(Debug)]
pub struct UpstreamMediaRuntime {
next_session_id: AtomicU64,
next_camera_generation: AtomicU64,
next_microphone_generation: AtomicU64,
microphone_sink_gate: Arc<Semaphore>,
2026-04-30 08:16:57 -03:00
camera_playout_offset_us: AtomicI64,
microphone_playout_offset_us: AtomicI64,
2026-05-03 12:22:33 -03:00
state: Mutex<RuntimeState>,
}
// The runtime's methods are split across sibling files and spliced in with
// `include!` so they can use this module's private items directly.
include!("upstream_media_runtime/stream_lifecycle_methods.rs");
include!("upstream_media_runtime/planner_snapshot_methods.rs");
include!("upstream_media_runtime/playout_planning_methods.rs");
2026-05-03 12:22:33 -03:00
// `new` is defined in an included sibling file; delegating keeps `Default`
// and `new` from drifting apart.
impl Default for UpstreamMediaRuntime {
    fn default() -> Self {
        Self::new()
    }
}
/// Keeps `reset_session_state` explicit because it sits on server upstream media scheduling, where timing choices directly affect lip sync.
/// Inputs are the typed parameters; output is the return value or side effect.
2026-05-03 12:22:33 -03:00
fn reset_session_state(state: &mut RuntimeState) {
state.base_remote_pts_us = None;
state.playout_epoch = None;
state.latest_camera_remote_pts_us = None;
state.latest_microphone_remote_pts_us = None;
state.last_video_local_pts_us = None;
state.last_audio_local_pts_us = None;
state.last_video_presented_pts_us = None;
state.last_audio_presented_pts_us = None;
state.latest_camera_timing = None;
state.latest_microphone_timing = None;
state.latest_camera_presentation = None;
state.latest_microphone_presentation = None;
state.latest_paired_client_capture_skew_ms = None;
state.latest_paired_client_send_skew_ms = None;
state.latest_paired_server_receive_skew_ms = None;
state.stale_audio_drops = 0;
state.stale_video_drops = 0;
state.skew_video_drops = 0;
state.freshness_reanchors = 0;
state.startup_timeouts = 0;
state.video_freezes = 0;
}
/// Keeps `record_timing_pair` explicit because it sits on server upstream media scheduling, where timing choices directly affect lip sync.
/// Inputs are the typed parameters; output is the return value or side effect.
2026-05-03 12:22:33 -03:00
fn record_timing_pair(state: &mut RuntimeState) {
let (Some(camera), Some(microphone)) =
(state.latest_camera_timing, state.latest_microphone_timing)
else {
return;
};
2026-05-03 12:22:33 -03:00
let capture_skew_ms = delta_ms(microphone.capture_pts_us, camera.capture_pts_us);
let send_skew_ms = delta_ms(microphone.send_pts_us, camera.send_pts_us);
let receive_skew_ms = signed_duration_ms(microphone.received_at, camera.received_at);
state.latest_paired_client_capture_skew_ms = Some(capture_skew_ms);
state.latest_paired_client_send_skew_ms = Some(send_skew_ms);
state.latest_paired_server_receive_skew_ms = Some(receive_skew_ms);
state.client_capture_skew_window_ms.push(capture_skew_ms);
state.client_send_skew_window_ms.push(send_skew_ms);
state.server_receive_skew_window_ms.push(receive_skew_ms);
}
/// Keeps `record_presentation` explicit because it sits on server upstream media scheduling, where timing choices directly affect lip sync.
/// Inputs are the typed parameters; output is the return value or side effect.
2026-05-03 12:22:33 -03:00
fn record_presentation(state: &mut RuntimeState, kind: UpstreamMediaKind, due_at: Instant) {
let sample = PresentationSample {
due_at,
handed_at: Instant::now(),
};
match kind {
UpstreamMediaKind::Camera => {
state.latest_camera_presentation = Some(sample);
2026-05-03 12:22:33 -03:00
state
.camera_sink_late_window_ms
.push(presentation_late_ms(sample));
}
UpstreamMediaKind::Microphone => {
state.latest_microphone_presentation = Some(sample);
2026-05-03 12:22:33 -03:00
state
.microphone_sink_late_window_ms
.push(presentation_late_ms(sample));
}
}
2026-05-03 12:22:33 -03:00
if let Some(skew) = latest_sink_handoff_skew_ms(state) {
state.sink_handoff_skew_window_ms.push(skew);
}
}
2026-05-03 12:22:33 -03:00
/// Milliseconds of remote-PTS progress since the session's base anchor,
/// taking whichever stream has advanced furthest; `None` until any remote
/// PTS has been observed.
fn live_lag_ms(state: &RuntimeState) -> Option<f64> {
    let newest = match (
        state.latest_camera_remote_pts_us,
        state.latest_microphone_remote_pts_us,
    ) {
        (None, None) => return None,
        (Some(pts), None) | (None, Some(pts)) => pts,
        (Some(camera), Some(microphone)) => camera.max(microphone),
    };
    // Before a base anchor exists, the lag is defined to be zero.
    let anchor = state.base_remote_pts_us.unwrap_or(newest);
    Some(newest.saturating_sub(anchor) as f64 / 1000.0)
}
/// Keeps `planner_skew_ms` explicit because it sits on server upstream media scheduling, where timing choices directly affect lip sync.
/// Inputs are the typed parameters; output is the return value or side effect.
2026-05-03 12:22:33 -03:00
fn planner_skew_ms(state: &RuntimeState) -> Option<f64> {
match (
state.last_audio_presented_pts_us,
state.last_video_presented_pts_us,
) {
(Some(audio), Some(video)) => Some((audio as i128 - video as i128) as f64 / 1000.0),
_ => None,
}
}
2026-05-03 12:22:33 -03:00
/// Microphone-minus-camera difference in signed sink lateness for the most
/// recent handoffs; `None` until both kinds have a presentation sample.
fn latest_sink_handoff_skew_ms(state: &RuntimeState) -> Option<f64> {
    match (
        state.latest_camera_presentation,
        state.latest_microphone_presentation,
    ) {
        (Some(camera), Some(microphone)) => {
            Some(presentation_late_signed_ms(microphone) - presentation_late_signed_ms(camera))
        }
        _ => None,
    }
}
2026-05-03 12:22:33 -03:00
/// Sink handoff lateness clamped to zero, so early handoffs do not offset
/// late ones inside the rolling windows.
fn presentation_late_ms(sample: PresentationSample) -> f64 {
    f64::max(presentation_late_signed_ms(sample), 0.0)
}
fn presentation_late_signed_ms(sample: PresentationSample) -> f64 {
signed_duration_ms(sample.handed_at, sample.due_at)
}
/// Non-negative elapsed milliseconds from `then` to `now`; zero when `then`
/// is not in the past relative to `now`.
fn age_ms(now: Instant, then: Instant) -> f64 {
    let elapsed = now.saturating_duration_since(then);
    elapsed.as_secs_f64() * 1000.0
}
/// Signed difference `left - right` in milliseconds for monotonic instants,
/// which natively only support non-negative `duration_since`.
fn signed_duration_ms(left: Instant, right: Instant) -> f64 {
    if left >= right {
        left.duration_since(right).as_secs_f64() * 1000.0
    } else {
        -(right.duration_since(left).as_secs_f64() * 1000.0)
    }
}
2026-05-03 12:22:33 -03:00
/// Signed `left - right` in milliseconds for microsecond PTS values;
/// widened to i128 so the u64 subtraction cannot overflow.
fn delta_ms(left_us: u64, right_us: u64) -> f64 {
    let diff_us = left_us as i128 - right_us as i128;
    diff_us as f64 / 1000.0
}
/// Nearest-rank style percentile over `values`, ignoring non-finite samples.
///
/// `quantile` is clamped to `[0, 1]`; returns `None` when no finite samples
/// remain. The rank is `ceil((n - 1) * q)`, which biases toward the upper
/// neighbor rather than interpolating.
fn percentile(values: impl Iterator<Item = f64>, quantile: f64) -> Option<f64> {
    let mut sorted = values.filter(|value| value.is_finite()).collect::<Vec<_>>();
    if sorted.is_empty() {
        return None;
    }
    // `total_cmp` yields a total order; a partial-ord sort could not handle
    // all f64 values (NaN is already filtered, but this stays panic-free).
    sorted.sort_by(|left, right| left.total_cmp(right));
    let index = ((sorted.len() - 1) as f64 * quantile.clamp(0.0, 1.0)).ceil() as usize;
    sorted.get(index).copied()
}
/// Base playout delay applied to upstream media. Overridable via the
/// `LESAVKA_UPSTREAM_PLAYOUT_DELAY_MS` environment variable; defaults to
/// 80 ms when unset or unparsable.
fn upstream_playout_delay() -> Duration {
    let configured = std::env::var("LESAVKA_UPSTREAM_PLAYOUT_DELAY_MS")
        .ok()
        .and_then(|raw| raw.trim().parse::<u64>().ok());
    Duration::from_millis(configured.unwrap_or(80))
}
/// Keeps `playout_offset_us` explicit because it sits on server upstream media scheduling, where timing choices directly affect lip sync.
/// Inputs are the typed parameters; output is the return value or side effect.
2026-05-03 12:22:33 -03:00
fn playout_offset_us(kind: UpstreamMediaKind) -> i64 {
2026-05-06 03:59:20 -03:00
let (scalar_name, mode_map_name, factory_map, factory_offset_us) = match kind {
UpstreamMediaKind::Camera => (
"LESAVKA_UPSTREAM_VIDEO_PLAYOUT_OFFSET_US",
"LESAVKA_UPSTREAM_VIDEO_PLAYOUT_MODE_OFFSETS_US",
FACTORY_MJPEG_VIDEO_MODE_OFFSETS_US,
FACTORY_MJPEG_VIDEO_OFFSET_US,
),
UpstreamMediaKind::Microphone => (
"LESAVKA_UPSTREAM_AUDIO_PLAYOUT_OFFSET_US",
"LESAVKA_UPSTREAM_AUDIO_PLAYOUT_MODE_OFFSETS_US",
FACTORY_MJPEG_AUDIO_MODE_OFFSETS_US,
FACTORY_MJPEG_AUDIO_OFFSET_US,
),
2026-05-03 12:22:33 -03:00
};
2026-05-06 03:59:20 -03:00
let mode = current_uvc_mode();
mode.as_deref()
.and_then(|mode| env_mode_offset_us(mode_map_name, mode))
.or_else(|| env_i64(scalar_name))
.or_else(|| {
mode.as_deref()
.and_then(|mode| lookup_mode_offset_us(factory_map, mode))
})
.unwrap_or(factory_offset_us)
}
/// Reconstructs the configured UVC mode string (`"WxH@fps"`) from the
/// environment, or `None` when width/height are not set.
fn current_uvc_mode() -> Option<String> {
    let width = env_u32("LESAVKA_UVC_WIDTH")?;
    let height = env_u32("LESAVKA_UVC_HEIGHT")?;
    // Fall back to deriving fps from the UVC frame interval (assumed to be
    // in 100 ns units, so fps = 10^7 / interval — TODO confirm against the
    // UVC config producer); clamp to at least 1 to avoid "…@0".
    let fps = env_u32("LESAVKA_UVC_FPS")
        .or_else(|| {
            env_u32("LESAVKA_UVC_INTERVAL")
                .and_then(|interval| (interval > 0).then_some(10_000_000 / interval))
        })?
        .max(1);
    Some(format!("{width}x{height}@{fps}"))
}

/// Looks up a per-mode offset from the environment variable `name`, whose
/// value is a comma-separated `mode=offset_us` map.
fn env_mode_offset_us(name: &str, mode: &str) -> Option<i64> {
    std::env::var(name)
        .ok()
        .and_then(|map| lookup_mode_offset_us(&map, mode))
}

/// Parses a comma-separated `mode=offset_us` map and returns the offset for
/// `mode`; malformed entries are skipped.
fn lookup_mode_offset_us(map: &str, mode: &str) -> Option<i64> {
    map.split(',').find_map(|entry| {
        let (key, value) = entry.trim().split_once('=')?;
        (key.trim() == mode)
            .then(|| value.trim().parse::<i64>().ok())
            .flatten()
    })
}

/// Reads and parses an environment variable; `None` when unset, non-UTF-8,
/// or unparsable. Shared backend for `env_i64`/`env_u32`.
fn env_parse<T: std::str::FromStr>(name: &str) -> Option<T> {
    std::env::var(name)
        .ok()
        .and_then(|value| value.trim().parse::<T>().ok())
}

fn env_i64(name: &str) -> Option<i64> {
    env_parse(name)
}

fn env_u32(name: &str) -> Option<u32> {
    env_parse(name)
}
/// Shifts `instant` by a signed microsecond offset, saturating at the
/// representable bounds in both directions. The original only saturated the
/// negative branch; a large positive offset could panic via `Add<Duration>`,
/// so both directions now use the checked arithmetic and fall back to the
/// unshifted instant.
fn apply_offset(instant: Instant, offset_us: i64) -> Instant {
    let magnitude = Duration::from_micros(offset_us.unsigned_abs());
    if offset_us >= 0 {
        instant.checked_add(magnitude).unwrap_or(instant)
    } else {
        instant.checked_sub(magnitude).unwrap_or(instant)
    }
}
// Unit tests live in a subdirectory; `#[path]` points the module there.
#[cfg(test)]
#[path = "upstream_media_runtime/tests/mod.rs"]
mod tests;