media: packetize live microphone uplink

2026-05-02 03:15:19 -03:00 · 2026-05-02 03:15:19 -03:00 · db83f24dde
commit db83f24dde
parent d628c1a634
11 changed files with 141 additions and 41 deletions
--- a/AGENTS.md
+++ b/AGENTS.md
@ -228,3 +228,4 @@ Context: 0.16.x proved that queue tweaks and static calibration cannot guarantee
 - 2026-05-02: 0.17.2 mirrored probe and Google Meet test showed major improvement but persistent sub-second late video. Root cause follow-up: the temporary `+350ms` factory MJPEG video playout offset matched the observed browser skew and also made the server skew guard freeze video against its own offset. Patch 0.17.3 restores factory video offset to `0ms`, migrates untouched `+350ms` install/calibration defaults back to `0ms`, and makes the skew guard offset-aware for intentional site calibration.
 - 2026-05-02: 0.17.3 Google Meet manual test improved to roughly sub-second/near-quarter-second lip sync, but the mirrored analyzer could not pair pulses and the user still heard choppy background audio. Client logs showed Pulse microphone packets arriving unevenly with ages around `90-240ms`; patch 0.17.4 lowers Pulse mic `buffer-time`/`latency-time`, bounds the mic queue/appsink, and keeps mirrored-probe after-run planner diagnostics even when analysis fails.
 - 2026-05-02: 0.17.4 mirrored run was salvageable after an SCP banner timeout, but analysis still failed with no close pulse pairs. The client log still showed `180-240ms` microphone delivery ages, pointing at server playout sleeps backpressuring the gRPC microphone stream. Patch 0.17.5 drains inbound microphone packets while waiting for scheduled UAC playout and retries browser-capture SCP fetches.
+- 2026-05-02: 0.17.5 mirrored run still failed with insufficient paired evidence, and the client log still showed recurring `180-240ms` microphone packet age while camera age stayed near zero. Patch 0.17.6 splits oversized mic samples into `20ms` timestamped packets and keeps a short fresh server-side audio window instead of collapsing every pending burst to one newest chunk, aiming to preserve lip sync without making background audio choppy.
--- a/Cargo.lock
+++ b/Cargo.lock
@ -1652,7 +1652,7 @@ checksum = "09edd9e8b54e49e587e4f6295a7d29c3ea94d469cb40ab8ca70b288248a81db2"

 [[package]]
 name = "lesavka_client"
-version = "0.17.5"
+version = "0.17.6"
 dependencies = [
 "anyhow",
 "async-stream",
@ -1686,7 +1686,7 @@ dependencies = [

 [[package]]
 name = "lesavka_common"
-version = "0.17.5"
+version = "0.17.6"
 dependencies = [
 "anyhow",
 "base64",
@ -1698,7 +1698,7 @@ dependencies = [

 [[package]]
 name = "lesavka_server"
-version = "0.17.5"
+version = "0.17.6"
 dependencies = [
 "anyhow",
 "base64",
--- a/client/Cargo.toml
+++ b/client/Cargo.toml
@ -4,7 +4,7 @@ path                    = "src/main.rs"

 [package]
 name                    = "lesavka_client"
-version                 = "0.17.5"
+version                 = "0.17.6"
 edition                 = "2024"

 [dependencies]
--- a/client/src/input/microphone.rs
+++ b/client/src/input/microphone.rs
@ -8,9 +8,10 @@ use shell_escape::unix::escape;
 #[cfg(not(coverage))]
 use std::sync::atomic::{AtomicU64, Ordering};
 use std::{
+    collections::VecDeque,
    path::{Path as StdPath, PathBuf},
    sync::{
-        Arc,
+        Arc, Mutex,
        atomic::{AtomicBool, Ordering as AtomicOrdering},
    },
    thread,
@ -25,11 +26,13 @@ const MIC_GAIN_CONTROL_ENV: &str = "LESAVKA_MIC_GAIN_CONTROL";
 const MIC_LEVEL_TAP_ENV: &str = "LESAVKA_UPLINK_MIC_LEVEL";
 const MIC_PULSE_BUFFER_TIME_ENV: &str = "LESAVKA_MIC_PULSE_BUFFER_TIME_US";
 const MIC_PULSE_LATENCY_TIME_ENV: &str = "LESAVKA_MIC_PULSE_LATENCY_TIME_US";
+const MIC_PACKET_TARGET_DURATION_ENV: &str = "LESAVKA_MIC_PACKET_TARGET_US";
 const MIC_SAMPLE_RATE: u64 = 48_000;
 const MIC_CHANNELS: usize = 2;
 const MIC_SAMPLE_BYTES: usize = std::mem::size_of::<i16>();
 const DEFAULT_MIC_PULSE_BUFFER_TIME_US: u64 = 40_000;
 const DEFAULT_MIC_PULSE_LATENCY_TIME_US: u64 = 10_000;
+const DEFAULT_MIC_PACKET_TARGET_DURATION_US: u64 = 20_000;
 const MIC_MAIN_QUEUE_MAX_BUFFERS: u32 = 8;
 const MIC_MAIN_QUEUE_MAX_TIME_NS: u64 = 80_000_000;
 const MIC_APPSINK_MAX_BUFFERS: u32 = 8;
@ -40,6 +43,7 @@ pub struct MicrophoneCapture {
    sink: gst_app::AppSink,
    level_tap_running: Option<Arc<AtomicBool>>,
    pts_rebaser: crate::live_capture_clock::DurationPacedSourcePtsRebaser,
+    pending_packets: Mutex<VecDeque<AudioPacket>>,
 }

 impl MicrophoneCapture {
@ -119,11 +123,15 @@ impl MicrophoneCapture {
            sink,
            level_tap_running,
            pts_rebaser: crate::live_capture_clock::DurationPacedSourcePtsRebaser::default(),
+            pending_packets: Mutex::default(),
        })
    }

    /// Blocking pull; call from an async wrapper
    pub fn pull(&self) -> Option<AudioPacket> {
+        if let Some(packet) = self.pending_packets.lock().ok()?.pop_front() {
+            return Some(packet);
+        }
        match self.sink.pull_sample() {
            Ok(sample) => {
                let buf = sample.buffer().unwrap();
@ -136,6 +144,10 @@ impl MicrophoneCapture {
                    crate::live_capture_clock::upstream_source_lag_cap(),
                );
                let pts = timing.packet_pts_us;
+                let target_bytes = mic_packet_target_bytes();
+                let mut packets = split_audio_sample(pts, map.as_slice(), target_bytes);
+                let packet_count = packets.len();
+                let first_packet = packets.pop_front();
                #[cfg(not(coverage))]
                {
                    static CNT: AtomicU64 = AtomicU64::new(0);
@ -155,18 +167,26 @@ impl MicrophoneCapture {
                            used_source_pts = timing.used_source_pts,
                            lag_clamped = timing.lag_clamped,
                            bytes = map.len(),
+                            packet_duration_us,
+                            split_packets = packet_count,
+                            target_packet_bytes = target_bytes,
                            "🎤 upstream microphone timing sample"
                        );
                    }
                    if n < 10 || n.is_multiple_of(300) {
-                        trace!("🎤⇧ cli  pkt#{n}  {} bytes", map.len());
+                        trace!(
+                            "🎤⇧ cli sample#{n} {} bytes -> {} packet(s)",
+                            map.len(),
+                            packet_count
+                        );
                    }
                }
-                Some(AudioPacket {
-                    id: 0,
-                    pts,
-                    data: map.as_slice().to_vec(),
-                })
+                if !packets.is_empty()
+                    && let Ok(mut pending) = self.pending_packets.lock()
+                {
+                    pending.extend(packets);
+                }
+                first_packet
            }
            Err(_) => None,
        }
@ -330,6 +350,48 @@ fn pcm_payload_duration_us(bytes: usize) -> u64 {
    ((frames as u128 * 1_000_000u128) / MIC_SAMPLE_RATE as u128).min(u64::MAX as u128) as u64
 }

+fn split_audio_sample(base_pts_us: u64, data: &[u8], target_bytes: usize) -> VecDeque<AudioPacket> {
+    let frame_bytes = (MIC_CHANNELS * MIC_SAMPLE_BYTES).max(1);
+    let target_bytes = frame_aligned_packet_bytes(target_bytes.max(frame_bytes));
+    let mut packets = VecDeque::new();
+    let mut offset = 0usize;
+    while offset < data.len() {
+        let remaining = data.len() - offset;
+        let mut take = remaining.min(target_bytes);
+        if remaining > take {
+            take -= take % frame_bytes;
+            if take == 0 {
+                take = frame_bytes.min(remaining);
+            }
+        }
+        let end = offset.saturating_add(take).min(data.len());
+        if end == offset {
+            break;
+        }
+        packets.push_back(AudioPacket {
+            id: 0,
+            pts: base_pts_us.saturating_add(pcm_payload_duration_us(offset)),
+            data: data[offset..end].to_vec(),
+        });
+        offset = end;
+    }
+    packets
+}
+
+fn mic_packet_target_bytes() -> usize {
+    let frame_bytes = MIC_CHANNELS * MIC_SAMPLE_BYTES;
+    let target_us = mic_packet_target_duration_us().clamp(1_000, 100_000);
+    let frames = ((MIC_SAMPLE_RATE as u128 * target_us as u128) / 1_000_000u128)
+        .max(1)
+        .min(usize::MAX as u128) as usize;
+    frame_aligned_packet_bytes(frames.saturating_mul(frame_bytes))
+}
+
+fn frame_aligned_packet_bytes(bytes: usize) -> usize {
+    let frame_bytes = (MIC_CHANNELS * MIC_SAMPLE_BYTES).max(1);
+    ((bytes / frame_bytes).max(1)).saturating_mul(frame_bytes)
+}
+
 /// Rejects bogus capture timestamps before they can poison mic PTS rebasing.
 fn duration_matches_pcm_payload(reported_us: u64, payload_us: u64) -> bool {
    if reported_us == 0 {
@ -354,6 +416,13 @@ fn mic_pulse_latency_time_us() -> u64 {
    )
 }

+fn mic_packet_target_duration_us() -> u64 {
+    positive_u64_env(
+        MIC_PACKET_TARGET_DURATION_ENV,
+        DEFAULT_MIC_PACKET_TARGET_DURATION_US,
+    )
+}
+
 fn positive_u64_env(name: &str, default_value: u64) -> u64 {
    std::env::var(name)
        .ok()
@ -464,7 +533,7 @@ impl Drop for MicrophoneCapture {
 mod tests {
    use super::{
        MIC_CHANNELS, MIC_SAMPLE_BYTES, MIC_SAMPLE_RATE, buffer_duration_us,
-        pcm_payload_duration_us,
+        mic_packet_target_bytes, pcm_payload_duration_us, split_audio_sample,
    };
    use gstreamer as gst;

@ -508,4 +577,35 @@ mod tests {

        assert_eq!(buffer_duration_us(buffer.as_ref(), bytes), 20_000);
    }
+
+    #[test]
+    fn oversized_microphone_samples_split_into_live_sized_packets() {
+        let bytes_per_frame = MIC_CHANNELS * MIC_SAMPLE_BYTES;
+        let hundred_ms_bytes = (MIC_SAMPLE_RATE as usize / 10) * bytes_per_frame;
+        let data = vec![7_u8; hundred_ms_bytes];
+
+        let packets = split_audio_sample(1_000_000, &data, mic_packet_target_bytes());
+
+        assert_eq!(packets.len(), 5);
+        assert!(packets.iter().all(|packet| packet.id == 0));
+        assert!(packets.iter().all(|packet| packet.data.len() == 3_840));
+        assert_eq!(packets.front().map(|packet| packet.pts), Some(1_000_000));
+        assert_eq!(packets.get(1).map(|packet| packet.pts), Some(1_020_000));
+        assert_eq!(packets.back().map(|packet| packet.pts), Some(1_080_000));
+    }
+
+    #[test]
+    fn trailing_microphone_packet_keeps_remaining_bytes() {
+        let bytes_per_frame = MIC_CHANNELS * MIC_SAMPLE_BYTES;
+        let forty_five_ms_bytes = ((MIC_SAMPLE_RATE as usize * 45) / 1_000) * bytes_per_frame;
+        let data = vec![9_u8; forty_five_ms_bytes];
+
+        let packets = split_audio_sample(5_000, &data, mic_packet_target_bytes());
+
+        assert_eq!(packets.len(), 3);
+        assert_eq!(packets[0].data.len(), 3_840);
+        assert_eq!(packets[1].data.len(), 3_840);
+        assert_eq!(packets[2].data.len(), 960);
+        assert_eq!(packets[2].pts, 45_000);
+    }
 }
--- a/common/Cargo.toml
+++ b/common/Cargo.toml
@ -1,6 +1,6 @@
 [package]
 name                    = "lesavka_common"
-version                 = "0.17.5"
+version                 = "0.17.6"
 edition                 = "2024"
 build                   = "build.rs"

--- a/scripts/manual/run_upstream_mirrored_av_sync.sh
+++ b/scripts/manual/run_upstream_mirrored_av_sync.sh
@ -269,6 +269,7 @@ start_real_lesavka_client() {
    LESAVKA_SERVER_ADDR="${RESOLVED_LESAVKA_SERVER_ADDR}" \
    LESAVKA_TLS_DOMAIN="${LESAVKA_TLS_DOMAIN}" \
    LESAVKA_MEDIA_CONTROL="${MEDIA_CONTROL}" \
+    LESAVKA_UPSTREAM_TIMING_TRACE="${LESAVKA_UPSTREAM_TIMING_TRACE:-1}" \
    RUST_LOG="${RUST_LOG:-warn,lesavka_client::app=info,lesavka_client::input::camera=info,lesavka_client::input::microphone=info}" \
    "${REPO_ROOT}/target/debug/lesavka-client" --no-launcher --server "${RESOLVED_LESAVKA_SERVER_ADDR}"
  ) >"${CLIENT_LOG}" 2>&1 &
--- a/server/Cargo.toml
+++ b/server/Cargo.toml
@ -10,7 +10,7 @@ bench                   = false

 [package]
 name                    = "lesavka_server"
-version                 = "0.17.5"
+version                 = "0.17.6"
 edition                 = "2024"
 autobins                = false

--- a/server/src/main/relay_service_coverage.rs
+++ b/server/src/main/relay_service_coverage.rs
@ -21,17 +21,18 @@ fn retain_freshest_video_packet(
    dropped
 }

+#[cfg(coverage)]
+const AUDIO_PENDING_LIVE_WINDOW_PACKETS: usize = 8;
+
 #[cfg(coverage)]
 fn retain_freshest_audio_packet(
    pending: &mut std::collections::VecDeque<AudioPacket>,
 ) -> usize {
-    if pending.len() <= 1 {
+    if pending.len() <= AUDIO_PENDING_LIVE_WINDOW_PACKETS {
        return 0;
    }
-    let newest = pending.pop_back().expect("non-empty pending audio queue");
-    let dropped = pending.len();
-    pending.clear();
-    pending.push_back(newest);
+    let dropped = pending.len() - AUDIO_PENDING_LIVE_WINDOW_PACKETS;
+    pending.drain(..dropped);
    dropped
 }

--- a/server/src/main/relay_service_tests.rs
+++ b/server/src/main/relay_service_tests.rs
@ -31,27 +31,20 @@ mod tests {
    }

    #[test]
-    fn retain_freshest_audio_packet_keeps_only_the_latest_chunk() {
-        let mut pending = std::collections::VecDeque::from(vec![
-            AudioPacket {
-                pts: 100,
+    fn retain_freshest_audio_packet_keeps_a_short_live_window() {
+        let mut pending = (0..10)
+            .map(|idx| AudioPacket {
+                pts: idx * 100,
                ..Default::default()
-            },
-            AudioPacket {
-                pts: 200,
-                ..Default::default()
-            },
-            AudioPacket {
-                pts: 300,
-                ..Default::default()
-            },
-        ]);
+            })
+            .collect::<std::collections::VecDeque<_>>();

        let dropped = retain_freshest_audio_packet(&mut pending);

        assert_eq!(dropped, 2);
-        assert_eq!(pending.len(), 1);
-        assert_eq!(pending.front().map(|pkt| pkt.pts), Some(300));
+        assert_eq!(pending.len(), 8);
+        assert_eq!(pending.front().map(|pkt| pkt.pts), Some(200));
+        assert_eq!(pending.back().map(|pkt| pkt.pts), Some(900));
    }

    #[test]
--- a/server/src/main/relay_stream_lifecycle.rs
+++ b/server/src/main/relay_stream_lifecycle.rs
@ -23,17 +23,19 @@ fn retain_freshest_video_packet(
 }

 #[cfg(not(coverage))]
-/// Keeps only the newest microphone packet while startup pairing is healing.
+const AUDIO_PENDING_LIVE_WINDOW_PACKETS: usize = 8;
+
+#[cfg(not(coverage))]
+/// Keeps a tiny newest microphone window so playout can stay smooth without
+/// draining old audio.
 fn retain_freshest_audio_packet(
    pending: &mut std::collections::VecDeque<AudioPacket>,
 ) -> usize {
-    if pending.len() <= 1 {
+    if pending.len() <= AUDIO_PENDING_LIVE_WINDOW_PACKETS {
        return 0;
    }
-    let newest = pending.pop_back().expect("non-empty pending audio queue");
-    let dropped = pending.len();
-    pending.clear();
-    pending.push_back(newest);
+    let dropped = pending.len() - AUDIO_PENDING_LIVE_WINDOW_PACKETS;
+    pending.drain(..dropped);
    dropped
 }

--- a/testing/tests/client_microphone_include_contract.rs
+++ b/testing/tests/client_microphone_include_contract.rs
@ -313,6 +313,7 @@ JSON
            sink,
            level_tap_running: Some(std::sync::Arc::clone(&running)),
            pts_rebaser: crate::live_capture_clock::DurationPacedSourcePtsRebaser::default(),
+            pending_packets: Default::default(),
        };
        assert!(
            cap.pull().is_none(),
@ -436,6 +437,7 @@ JSON
            sink,
            level_tap_running: None,
            pts_rebaser: crate::live_capture_clock::DurationPacedSourcePtsRebaser::default(),
+            pending_packets: Default::default(),
        };
        let first_pkt = cap.pull().expect("first audio packet");
        let second_pkt = cap.pull().expect("second audio packet");