// lesavka/client/src/input/microphone.rs
//
// 385 lines
// 14 KiB
// Rust

// client/src/input/microphone.rs
use anyhow::{Context, Result, bail};
use gst::prelude::*;
use gstreamer as gst;
use gstreamer_app as gst_app;
use lesavka_common::{
audio_transport::{self, UpstreamAudioCodec},
lesavka::AudioPacket,
};
use shell_escape::unix::escape;
#[cfg(not(coverage))]
use std::sync::atomic::{AtomicU64, Ordering};
use std::{
collections::VecDeque,
path::{Path as StdPath, PathBuf},
sync::{
Arc, Mutex,
atomic::{AtomicBool, Ordering as AtomicOrdering},
},
thread,
time::Duration,
};
use tracing::{debug, info, warn};
#[cfg(not(coverage))]
use tracing::{error, trace};
// --- Environment variable names consulted by the helpers in this file ---
/// Static input gain multiplier; parsed by `mic_gain_from_env`.
const MIC_GAIN_ENV: &str = "LESAVKA_MIC_GAIN";
/// Path of a file polled for live gain changes; read by `maybe_spawn_mic_gain_control`.
const MIC_GAIN_CONTROL_ENV: &str = "LESAVKA_MIC_GAIN_CONTROL";
/// Path the level tap publishes the peak level to; read by `mic_level_tap_path`.
const MIC_LEVEL_TAP_ENV: &str = "LESAVKA_UPLINK_MIC_LEVEL";
/// Override for the value returned by `mic_pulse_buffer_time_us` (microseconds, per the name).
const MIC_PULSE_BUFFER_TIME_ENV: &str = "LESAVKA_MIC_PULSE_BUFFER_TIME_US";
/// Override for the value returned by `mic_pulse_latency_time_us` (microseconds, per the name).
const MIC_PULSE_LATENCY_TIME_ENV: &str = "LESAVKA_MIC_PULSE_LATENCY_TIME_US";
/// Override for the packet duration target returned by `mic_packet_target_duration_us`.
const MIC_PACKET_TARGET_DURATION_ENV: &str = "LESAVKA_MIC_PACKET_TARGET_US";
/// Boolean flag read by `explicit_media_sources_required`.
const REQUIRE_EXPLICIT_MEDIA_SOURCES_ENV: &str = "LESAVKA_REQUIRE_EXPLICIT_MEDIA_SOURCES";
/// Boolean flag read by `mic_noise_suppression_from_env`.
const MIC_NOISE_SUPPRESSION_ENV: &str = "LESAVKA_MIC_NOISE_SUPPRESSION";
/// Suppression strength keyword read by `mic_noise_suppression_level`.
const MIC_NOISE_SUPPRESSION_LEVEL_ENV: &str = "LESAVKA_MIC_NOISE_SUPPRESSION_LEVEL";
// --- PCM format requested in the pipeline caps (S16LE) and used for duration math ---
/// Sample rate in Hz placed in the caps `rate=` field.
const MIC_SAMPLE_RATE: u64 = 48_000;
/// Channel count placed in the caps `channels=` field.
const MIC_CHANNELS: usize = 2;
/// Bytes per sample; `i16` matches the S16LE caps format.
const MIC_SAMPLE_BYTES: usize = std::mem::size_of::<i16>();
// --- Defaults used when the matching environment override is absent or invalid ---
const DEFAULT_MIC_PULSE_BUFFER_TIME_US: u64 = 40_000;
const DEFAULT_MIC_PULSE_LATENCY_TIME_US: u64 = 10_000;
const DEFAULT_MIC_PACKET_TARGET_DURATION_US: u64 = 20_000;
// --- Queue/appsink bounds interpolated into `microphone_pipeline_desc` ---
/// `max-size-buffers` of the queue feeding the main appsink.
const MIC_MAIN_QUEUE_MAX_BUFFERS: u32 = 8;
/// `max-size-time` of the queue feeding the main appsink (nanoseconds, per the name).
const MIC_MAIN_QUEUE_MAX_TIME_NS: u64 = 80_000_000;
/// `max-buffers` of the main appsink (`asink`).
const MIC_APPSINK_MAX_BUFFERS: u32 = 8;
/// Microphone capture handle owning the GStreamer pipeline and its packet state.
///
/// Dropping the value clears `level_tap_running` (when present) and sets the
/// pipeline to `Null`; see the `Drop` impl in this file. The methods live in
/// the included `microphone/capture_runtime.rs`.
pub struct MicrophoneCapture {
#[allow(dead_code)] // kept alive to hold PLAYING state
pipeline: gst::Pipeline,
// NOTE(review): presumably the `asink` appsink from `microphone_pipeline_desc` — confirm in capture_runtime.rs.
sink: gst_app::AppSink,
// Stop flag for the level-tap thread; `None` presumably means no tap was spawned — confirm.
level_tap_running: Option<Arc<AtomicBool>>,
// NOTE(review): PTS rebasing helper; its semantics are defined in `live_capture_clock`, not visible here.
pts_rebaser: crate::live_capture_clock::DurationPacedSourcePtsRebaser,
// NOTE(review): presumably packets from `split_audio_sample` awaiting delivery — confirm.
pending_packets: Mutex<VecDeque<AudioPacket>>,
// NOTE(review): optional Opus encoder; when it is populated is decided outside this view.
audio_encoder: Mutex<Option<crate::input::audio_codec::OpusPacketEncoder>>,
}
include!("microphone/capture_runtime.rs");
/// Returns the level-tap output path configured through `LESAVKA_UPLINK_MIC_LEVEL`.
///
/// The value is trimmed; an unset, non-UTF-8, or blank variable yields `None`.
fn mic_level_tap_path() -> Option<PathBuf> {
    let raw = std::env::var(MIC_LEVEL_TAP_ENV).ok()?;
    let trimmed = raw.trim();
    if trimmed.is_empty() {
        None
    } else {
        Some(PathBuf::from(trimmed))
    }
}
/// Builds the `gst-launch`-style description of the microphone capture pipeline.
///
/// * `source_desc` - source element description placed at the head of the pipeline.
/// * `gain` - input gain; clamped and formatted by `format_mic_gain_for_gst`.
/// * `level_tap_enabled` - when true, a `tee` feeds a second appsink named `level_sink`.
/// * `noise_suppression` - forwarded to `microphone_noise_suppression_stage`.
///
/// The main appsink is always named `asink` and the gain element `mic_input_gain`.
fn microphone_pipeline_desc(
    source_desc: &str,
    gain: f64,
    level_tap_enabled: bool,
    noise_suppression: bool,
) -> String {
    let gain = format_mic_gain_for_gst(gain);
    let noise_stage = microphone_noise_suppression_stage(noise_suppression);

    // Fragments shared by both pipeline shapes.
    let caps = format!("audio/x-raw,format=S16LE,channels={MIC_CHANNELS},rate={MIC_SAMPLE_RATE}");
    let main_queue = format!(
        "queue max-size-buffers={MIC_MAIN_QUEUE_MAX_BUFFERS} \
         max-size-time={MIC_MAIN_QUEUE_MAX_TIME_NS} leaky=downstream"
    );
    let main_sink = format!(
        "appsink name=asink emit-signals=true max-buffers={MIC_APPSINK_MAX_BUFFERS} drop=true"
    );

    // `noise_stage` is either empty or already ends in " ! ".
    let head = format!(
        "{source_desc} ! audioconvert ! audioresample ! {caps} ! \
         {noise_stage}volume name=mic_input_gain volume={gain} ! "
    );

    let tail = if level_tap_enabled {
        format!(
            "tee name=t \
             t. ! {main_queue} ! {caps} ! {main_sink} \
             t. ! queue max-size-buffers=8 leaky=downstream ! {caps} ! \
             appsink name=level_sink emit-signals=false sync=false max-buffers=8 drop=true"
        )
    } else {
        format!("{main_queue} ! {main_sink}")
    };

    head + &tail
}
/// Returns the `webrtcdsp ... ! ` pipeline fragment, or an empty string.
///
/// The fragment is produced only when `enabled` is true and the `webrtcdsp`
/// element factory can be found; the factory lookup is skipped when disabled.
fn microphone_noise_suppression_stage(enabled: bool) -> String {
    if !enabled || gst::ElementFactory::find("webrtcdsp").is_none() {
        return String::new();
    }
    let level = mic_noise_suppression_level();
    format!(
        "webrtcdsp echo-cancel=false noise-suppression=true noise-suppression-level={level} high-pass-filter=true gain-control=false limiter=true ! "
    )
}
/// Maps `LESAVKA_MIC_NOISE_SUPPRESSION_LEVEL` to a suppression level keyword.
///
/// Matching is case-insensitive on the trimmed value. `low`, `moderate`
/// (alias `medium`) and `high` map to themselves; every other value, including
/// the `very-high` spellings, `aggressive`, and an unset variable, yields
/// `"very-high"`.
fn mic_noise_suppression_level() -> &'static str {
    const FALLBACK: &str = "very-high";
    let Ok(raw) = std::env::var(MIC_NOISE_SUPPRESSION_LEVEL_ENV) else {
        return FALLBACK;
    };
    match raw.trim().to_ascii_lowercase().as_str() {
        "low" => "low",
        "moderate" | "medium" => "moderate",
        "high" => "high",
        // "very-high" | "very_high" | "veryhigh" | "aggressive" and unknown values alike.
        _ => FALLBACK,
    }
}
/// Returns the duration of a captured buffer in microseconds (never zero).
///
/// The buffer's own duration is used only when `duration_matches_pcm_payload`
/// accepts it; otherwise the duration implied by `bytes` of PCM payload is used.
fn buffer_duration_us(buf: &gst::BufferRef, bytes: usize) -> u64 {
    let payload_duration_us = pcm_payload_duration_us(bytes);
    let reported_us = buf.duration().map(|ts| ts.nseconds() / 1_000);
    let chosen_us = match reported_us {
        Some(us) if duration_matches_pcm_payload(us, payload_duration_us) => us,
        _ => payload_duration_us,
    };
    chosen_us.max(1)
}
/// Converts a PCM payload size in bytes to its duration in microseconds.
///
/// Only whole frames (`MIC_CHANNELS * MIC_SAMPLE_BYTES` bytes each) count; a
/// trailing partial frame is ignored. The result saturates at `u64::MAX`.
fn pcm_payload_duration_us(bytes: usize) -> u64 {
    const MICROS_PER_SECOND: u128 = 1_000_000;
    let frame_bytes = (MIC_CHANNELS * MIC_SAMPLE_BYTES).max(1);
    let whole_frames = (bytes / frame_bytes) as u128;
    // Widened to u128 so the multiplication cannot overflow before the division.
    let micros = whole_frames * MICROS_PER_SECOND / u128::from(MIC_SAMPLE_RATE);
    micros.min(u128::from(u64::MAX)) as u64
}
#[cfg(not(coverage))]
/// Emits a rate-limited warning that a microphone source timestamp was clamped.
///
/// A process-wide counter tracks calls; the first ten calls and every 300th
/// call thereafter are logged, the rest are silent.
///
/// * `timing` - rebased timing whose source/capture/packet PTS values are logged.
/// * `bytes` - payload size of the affected packet.
fn log_microphone_lag_clamped_source(
    timing: crate::live_capture_clock::RebasedSourcePts,
    bytes: usize,
) {
    static MIC_LAG_CLAMPED_PACKETS: AtomicU64 = AtomicU64::new(0);
    let packet_index = MIC_LAG_CLAMPED_PACKETS.fetch_add(1, Ordering::Relaxed);
    let should_log = packet_index < 10 || packet_index.is_multiple_of(300);
    if !should_log {
        return;
    }
    warn!(
        packet_index,
        bytes,
        source_pts_us = timing.source_pts_us.unwrap_or_default(),
        capture_now_us = timing.capture_now_us,
        packet_pts_us = timing.packet_pts_us,
        "🎤 clamped laggy microphone source timestamp before bundled uplink"
    );
}
#[cfg(coverage)]
/// Coverage-build stand-in for the lag-clamp warning: same signature, intentionally does nothing.
fn log_microphone_lag_clamped_source(
_timing: crate::live_capture_clock::RebasedSourcePts,
_bytes: usize,
) {
}
/// Splits one captured PCM sample into packets of at most `target_bytes` each.
///
/// * `base_pts_us` - PTS of the first byte of `data`, in microseconds.
/// * `data` - interleaved S16LE PCM payload.
/// * `target_bytes` - desired packet size; rounded down to whole frames and
///   raised to at least one frame.
///
/// Each packet's `pts` is `base_pts_us` plus the duration of the payload that
/// precedes it, and `frame_duration_us` is the duration of its own payload.
/// Packets are marked as PCM S16LE. Empty `data` yields no packets.
fn split_audio_sample(base_pts_us: u64, data: &[u8], target_bytes: usize) -> VecDeque<AudioPacket> {
    let frame_bytes = (MIC_CHANNELS * MIC_SAMPLE_BYTES).max(1);
    // Always a non-zero multiple of the frame size, so every chunk except
    // possibly the last one holds whole frames only.
    let chunk_bytes = frame_aligned_packet_bytes(target_bytes.max(frame_bytes));

    data.chunks(chunk_bytes)
        .enumerate()
        .map(|(index, chunk)| {
            let offset = index * chunk_bytes;
            let duration_us = pcm_payload_duration_us(chunk.len());
            let mut packet = AudioPacket {
                id: 0,
                // Derived from the absolute offset so rounding does not accumulate.
                pts: base_pts_us.saturating_add(pcm_payload_duration_us(offset)),
                data: chunk.to_vec(),
                frame_duration_us: duration_us.min(u64::from(u32::MAX)) as u32,
                ..Default::default()
            };
            audio_transport::mark_packet_pcm_s16le(&mut packet);
            packet
        })
        .collect()
}
/// Returns the frame-aligned packet size, in bytes, for the configured packet duration.
///
/// The configured duration is clamped to 1 ms..=100 ms before being converted
/// to frames at `MIC_SAMPLE_RATE`; at least one frame is always returned.
fn mic_packet_target_bytes() -> usize {
    let target_us = u128::from(mic_packet_target_duration_us().clamp(1_000, 100_000));
    let frames = (u128::from(MIC_SAMPLE_RATE) * target_us / 1_000_000).max(1);
    let frames = frames.min(usize::MAX as u128) as usize;
    let bytes_per_frame = MIC_CHANNELS * MIC_SAMPLE_BYTES;
    frame_aligned_packet_bytes(frames.saturating_mul(bytes_per_frame))
}
/// Rounds `bytes` down to a whole number of PCM frames, returning at least one frame.
fn frame_aligned_packet_bytes(bytes: usize) -> usize {
    let frame_bytes = (MIC_CHANNELS * MIC_SAMPLE_BYTES).max(1);
    let aligned = bytes - bytes % frame_bytes;
    if aligned == 0 { frame_bytes } else { aligned }
}
/// Decides whether a reported buffer duration is plausible for its PCM payload.
///
/// A zero report is always rejected. When the payload duration is 0 or 1 µs
/// any non-zero report is accepted; otherwise the report must lie within
/// one eighth to eight times the payload duration (bounds inclusive).
fn duration_matches_pcm_payload(reported_us: u64, payload_us: u64) -> bool {
    match (reported_us, payload_us) {
        (0, _) => false,
        (_, 0 | 1) => true,
        _ => {
            let plausible = (payload_us / 8).max(1)..=payload_us.saturating_mul(8);
            plausible.contains(&reported_us)
        }
    }
}
/// Returns the value of `LESAVKA_MIC_PULSE_BUFFER_TIME_US` when it parses as a
/// positive integer, otherwise `DEFAULT_MIC_PULSE_BUFFER_TIME_US`.
fn mic_pulse_buffer_time_us() -> u64 {
positive_u64_env(MIC_PULSE_BUFFER_TIME_ENV, DEFAULT_MIC_PULSE_BUFFER_TIME_US)
}
/// Returns the value of `LESAVKA_MIC_PULSE_LATENCY_TIME_US` when it parses as a
/// positive integer, otherwise `DEFAULT_MIC_PULSE_LATENCY_TIME_US`.
fn mic_pulse_latency_time_us() -> u64 {
positive_u64_env(
MIC_PULSE_LATENCY_TIME_ENV,
DEFAULT_MIC_PULSE_LATENCY_TIME_US,
)
}
/// Returns the value of `LESAVKA_MIC_PACKET_TARGET_US` when it parses as a
/// positive integer, otherwise `DEFAULT_MIC_PACKET_TARGET_DURATION_US`.
/// The result is not range-limited here; `mic_packet_target_bytes` clamps it.
fn mic_packet_target_duration_us() -> u64 {
positive_u64_env(
MIC_PACKET_TARGET_DURATION_ENV,
DEFAULT_MIC_PACKET_TARGET_DURATION_US,
)
}
/// Reads environment variable `name` as a strictly positive `u64`.
///
/// Surrounding whitespace is ignored. `default_value` is returned when the
/// variable is unset, not valid UTF-8, not a `u64`, or zero.
fn positive_u64_env(name: &str, default_value: u64) -> u64 {
    let Ok(raw) = std::env::var(name) else {
        return default_value;
    };
    match raw.trim().parse::<u64>() {
        Ok(parsed) if parsed > 0 => parsed,
        _ => default_value,
    }
}
/// Returns whether `LESAVKA_REQUIRE_EXPLICIT_MEDIA_SOURCES` is set to a truthy
/// value as defined by `bool_env_enabled`.
fn explicit_media_sources_required() -> bool {
bool_env_enabled(REQUIRE_EXPLICIT_MEDIA_SOURCES_ENV)
}
/// Returns whether `LESAVKA_MIC_NOISE_SUPPRESSION` is set to a truthy value as
/// defined by `bool_env_enabled`.
fn mic_noise_suppression_from_env() -> bool {
bool_env_enabled(MIC_NOISE_SUPPRESSION_ENV)
}
/// Returns true when environment variable `name` holds a truthy value.
///
/// Truthy values are `1`, `true`, `yes` and `on`, compared case-insensitively
/// after trimming. An unset or non-UTF-8 variable is false.
fn bool_env_enabled(name: &str) -> bool {
    const TRUTHY: [&str; 4] = ["1", "true", "yes", "on"];
    match std::env::var(name) {
        Ok(raw) => {
            let flag = raw.trim();
            TRUTHY.iter().any(|truthy| flag.eq_ignore_ascii_case(truthy))
        }
        Err(_) => false,
    }
}
/// Detects launcher catalog names that should be opened through Pulse directly.
///
/// After trimming, the name must start with `alsa_input.`, `bluez_input.` or
/// `input.` (case-sensitive).
fn looks_like_pulse_source_name(source: &str) -> bool {
    const PULSE_PREFIXES: [&str; 3] = ["alsa_input.", "bluez_input.", "input."];
    let candidate = source.trim();
    PULSE_PREFIXES
        .iter()
        .any(|prefix| candidate.starts_with(prefix))
}
/// Returns the input gain configured through `LESAVKA_MIC_GAIN`.
///
/// Falls back to unity gain (`1.0`) when the variable is unset or
/// `parse_mic_gain` rejects its value.
fn mic_gain_from_env() -> f64 {
    const UNITY_GAIN: f64 = 1.0;
    match std::env::var(MIC_GAIN_ENV) {
        Ok(raw) => parse_mic_gain(&raw).unwrap_or(UNITY_GAIN),
        Err(_) => UNITY_GAIN,
    }
}
/// Parses a gain value from the first whitespace-separated token of `raw`.
///
/// Returns `None` when there is no token, the token is not a float, or the
/// value is not finite. Finite values are clamped by `clamp_mic_gain`.
fn parse_mic_gain(raw: &str) -> Option<f64> {
    let token = raw.split_ascii_whitespace().next()?;
    match token.parse::<f64>() {
        Ok(value) if value.is_finite() => Some(clamp_mic_gain(value)),
        _ => None,
    }
}
/// Limits a gain multiplier to the supported `0.0..=4.0` range (NaN passes through).
fn clamp_mic_gain(value: f64) -> f64 {
    const MIN_GAIN: f64 = 0.0;
    const MAX_GAIN: f64 = 4.0;
    value.clamp(MIN_GAIN, MAX_GAIN)
}
/// Formats a gain for the pipeline description: clamped, with three decimals.
fn format_mic_gain_for_gst(gain: f64) -> String {
    let clamped = clamp_mic_gain(gain);
    format!("{clamped:.3}")
}
/// Keeps `maybe_spawn_mic_gain_control` explicit because it sits on microphone capture setup, where host audio stacks expose different source names and latency controls.
/// Inputs are the typed parameters; output is the return value or side effect.
fn maybe_spawn_mic_gain_control(volume: gst::Element) {
let Ok(path) = std::env::var(MIC_GAIN_CONTROL_ENV) else {
return;
};
let path = std::path::PathBuf::from(path);
thread::spawn(move || {
let mut last_gain = None;
loop {
if let Some(gain) = read_mic_gain_control(&path)
&& last_gain != Some(gain)
{
volume.set_property("volume", gain);
last_gain = Some(gain);
tracing::info!("🎤 mic gain set to {gain:.2}x");
}
thread::sleep(Duration::from_millis(100));
}
});
}
/// Keeps `spawn_mic_level_tap` explicit because it sits on microphone capture setup, where host audio stacks expose different source names and latency controls.
/// Inputs are the typed parameters; output is the return value or side effect.
fn spawn_mic_level_tap(sink: gst_app::AppSink, path: PathBuf) -> Arc<AtomicBool> {
let running = Arc::new(AtomicBool::new(true));
let thread_running = Arc::clone(&running);
thread::spawn(move || {
while thread_running.load(AtomicOrdering::Acquire) {
if let Some(sample) = sink.try_pull_sample(gst::ClockTime::from_mseconds(250))
&& let Some(buffer) = sample.buffer()
&& let Ok(map) = buffer.map_readable()
{
let level = pcm_peak_fraction(map.as_slice());
if let Err(err) = write_mic_level_tap(&path, level) {
tracing::debug!("🎤 local uplink level tap write failed: {err:#}");
}
}
}
});
running
}
/// Returns the peak absolute sample of little-endian 16-bit PCM as a fraction in `0.0..=1.0`.
///
/// A trailing odd byte is ignored; empty input yields `0.0`. `i16::MIN`
/// exceeds `i16::MAX` in magnitude and is clamped to `1.0`.
fn pcm_peak_fraction(bytes: &[u8]) -> f64 {
    let mut peak: u16 = 0;
    for pair in bytes.chunks_exact(2) {
        let magnitude = i16::from_le_bytes([pair[0], pair[1]]).unsigned_abs();
        peak = peak.max(magnitude);
    }
    (f64::from(peak) / f64::from(i16::MAX)).clamp(0.0, 1.0)
}
/// Publishes `level` to `path` by writing a sibling `.tmp` file and renaming it into place.
///
/// The file content is the level with six decimals followed by a newline.
///
/// # Errors
/// Returns the underlying I/O error, with the affected path as context, when
/// either the write or the rename fails.
fn write_mic_level_tap(path: &StdPath, level: f64) -> Result<()> {
    let staging_path = path.with_extension("tmp");
    let payload = format!("{level:.6}\n");
    std::fs::write(&staging_path, payload)
        .with_context(|| format!("writing {}", staging_path.display()))?;
    std::fs::rename(&staging_path, path).with_context(|| format!("publishing {}", path.display()))
}
/// Reads the gain control file at `path` and parses it with `parse_mic_gain`.
///
/// Returns `None` when the file cannot be read as UTF-8 text or holds no valid gain.
fn read_mic_gain_control(path: &StdPath) -> Option<f64> {
    let contents = std::fs::read_to_string(path).ok()?;
    parse_mic_gain(&contents)
}
impl Drop for MicrophoneCapture {
    /// Asks the level-tap thread (if any) to stop, then sets the pipeline to `Null`.
    fn drop(&mut self) {
        // Clear the run flag before tearing the pipeline down so the tap loop
        // observes the stop request on its next iteration.
        if let Some(flag) = self.level_tap_running.as_ref() {
            flag.store(false, AtomicOrdering::Release);
        }
        // Best effort: the state-change result is deliberately ignored during drop.
        let _ = self.pipeline.set_state(gst::State::Null);
    }
}
#[cfg(test)]
#[path = "microphone/tests/mod.rs"]
mod tests;