lesavka/client/src/input/microphone.rs

// client/src/input/microphone.rs
use anyhow::{Context, Result, bail};
use gst::prelude::*;
use gstreamer as gst;
use gstreamer_app as gst_app;
use lesavka_common::{
    audio_transport::{self, UpstreamAudioCodec},
    lesavka::AudioPacket,
};
use shell_escape::unix::escape;
#[cfg(not(coverage))]
use std::sync::atomic::{AtomicU64, Ordering};
use std::{
    collections::VecDeque,
    path::{Path as StdPath, PathBuf},
    sync::{
        Arc, Mutex,
        atomic::{AtomicBool, Ordering as AtomicOrdering},
    },
    thread,
    time::Duration,
};
use tracing::{debug, info, warn};
#[cfg(not(coverage))]
use tracing::{error, trace};

// --- Environment variable names -------------------------------------------

/// Static input gain; parsed by `parse_mic_gain` (clamped to `0.0..=4.0`), defaulting to `1.0`.
const MIC_GAIN_ENV: &str = "LESAVKA_MIC_GAIN";
/// Path of a file polled by `maybe_spawn_mic_gain_control` for live gain changes.
const MIC_GAIN_CONTROL_ENV: &str = "LESAVKA_MIC_GAIN_CONTROL";
/// Path the level tap publishes the latest peak level to; blank means "no tap".
const MIC_LEVEL_TAP_ENV: &str = "LESAVKA_UPLINK_MIC_LEVEL";
/// Positive integer override for the Pulse buffer time (microseconds, per the name).
const MIC_PULSE_BUFFER_TIME_ENV: &str = "LESAVKA_MIC_PULSE_BUFFER_TIME_US";
/// Positive integer override for the Pulse latency time (microseconds, per the name).
const MIC_PULSE_LATENCY_TIME_ENV: &str = "LESAVKA_MIC_PULSE_LATENCY_TIME_US";
/// Positive integer override for the target packet duration; `mic_packet_target_bytes`
/// clamps the value to `1_000..=100_000` microseconds.
const MIC_PACKET_TARGET_DURATION_ENV: &str = "LESAVKA_MIC_PACKET_TARGET_US";
/// Boolean flag (`1`/`true`/`yes`/`on`) read by `explicit_media_sources_required`.
const REQUIRE_EXPLICIT_MEDIA_SOURCES_ENV: &str = "LESAVKA_REQUIRE_EXPLICIT_MEDIA_SOURCES";
/// Boolean flag (`1`/`true`/`yes`/`on`) read by `mic_noise_suppression_from_env`.
const MIC_NOISE_SUPPRESSION_ENV: &str = "LESAVKA_MIC_NOISE_SUPPRESSION";
/// Suppression level (`low`, `moderate`, `high`, `very-high` and aliases); defaults to `very-high`.
const MIC_NOISE_SUPPRESSION_LEVEL_ENV: &str = "LESAVKA_MIC_NOISE_SUPPRESSION_LEVEL";

// --- PCM format forced by the capture pipeline's caps filter ----------------

/// Sample rate in Hz used for the caps filter and for byte-count -> duration maths.
const MIC_SAMPLE_RATE: u64 = 48_000;
/// Interleaved channel count.
const MIC_CHANNELS: usize = 2;
/// Bytes per sample (S16LE -> `i16`).
const MIC_SAMPLE_BYTES: usize = std::mem::size_of::<i16>();

// --- Defaults used when the matching environment override is absent/invalid --

const DEFAULT_MIC_PULSE_BUFFER_TIME_US: u64 = 40_000;
const DEFAULT_MIC_PULSE_LATENCY_TIME_US: u64 = 10_000;
const DEFAULT_MIC_PACKET_TARGET_DURATION_US: u64 = 20_000;

// --- Bounds for the leaky queue and appsink on the main capture branch ------

/// `max-size-buffers` of the main-branch `queue`.
const MIC_MAIN_QUEUE_MAX_BUFFERS: u32 = 8;
/// `max-size-time` of the main-branch `queue` (nanoseconds, per the name).
const MIC_MAIN_QUEUE_MAX_TIME_NS: u64 = 80_000_000;
/// `max-buffers` of the `asink` appsink.
const MIC_APPSINK_MAX_BUFFERS: u32 = 8;

/// State of one live microphone capture.
///
/// Dropping the value clears the level-tap flag (if any) and moves the
/// pipeline to `Null`; see the `Drop` impl in this file. The methods that
/// drive the capture are not defined in this file section.
pub struct MicrophoneCapture {
    #[allow(dead_code)] // kept alive to hold PLAYING state
    pipeline: gst::Pipeline,
    /// App sink that captured audio is read from.
    /// NOTE(review): presumably the `asink` element of the pipeline description — confirm in the runtime file.
    sink: gst_app::AppSink,
    /// Stop flag of the level-tap thread, when one exists; `Drop` stores `false` into it.
    level_tap_running: Option<Arc<AtomicBool>>,
    /// PTS rebasing helper from `live_capture_clock`; its exact semantics are defined there.
    pts_rebaser: crate::live_capture_clock::DurationPacedSourcePtsRebaser,
    /// Queue of packets not yet handed out.
    /// NOTE(review): presumably filled from `split_audio_sample` — confirm against the runtime file.
    pending_packets: Mutex<VecDeque<AudioPacket>>,
    /// Optional Opus packet encoder; `None` presumably means packets stay raw PCM — TODO confirm.
    audio_encoder: Mutex<Option<crate::input::audio_codec::OpusPacketEncoder>>,
}

// Textually includes the capture runtime so it shares this file's imports,
// constants and private helpers.
// NOTE(review): presumably this holds the `impl MicrophoneCapture` block — confirm in the included file.
include!("microphone/capture_runtime.rs");
/// Returns the level-tap output path configured through `MIC_LEVEL_TAP_ENV`.
///
/// The value is trimmed; an unset, non-Unicode or blank variable yields `None`.
fn mic_level_tap_path() -> Option<PathBuf> {
    let raw = std::env::var(MIC_LEVEL_TAP_ENV).ok()?;
    let trimmed = raw.trim();
    if trimmed.is_empty() {
        None
    } else {
        Some(PathBuf::from(trimmed.to_string()))
    }
}

/// Builds the `!`-separated GStreamer description of the microphone capture pipeline.
///
/// Both variants share the same head: `source_desc`, `audioconvert`,
/// `audioresample`, a caps filter forcing S16LE at `MIC_CHANNELS` /
/// `MIC_SAMPLE_RATE`, the optional noise-suppression stage, and a `volume`
/// element named `mic_input_gain`. The main branch then goes through a
/// bounded, leaky `queue` into an `appsink` named `asink`.
///
/// * `source_desc` – source element description, spliced in verbatim.
/// * `gain` – clamped and formatted by `format_mic_gain_for_gst`.
/// * `level_tap_enabled` – when `true`, a `tee` adds a second branch ending
///   in an `appsink` named `level_sink`.
/// * `noise_suppression` – forwarded to `microphone_noise_suppression_stage`,
///   which may still return an empty stage.
///
/// Returns the description string only; nothing is parsed or started here.
fn microphone_pipeline_desc(
    source_desc: &str,
    gain: f64,
    level_tap_enabled: bool,
    noise_suppression: bool,
) -> String {
    let gain = format_mic_gain_for_gst(gain);
    // Either empty or a complete stage that already ends in " ! ".
    let noise_stage = microphone_noise_suppression_stage(noise_suppression);
    if level_tap_enabled {
        // Tee variant: main capture branch plus an independent level-metering branch.
        format!(
            "{source_desc} ! \
             audioconvert ! audioresample ! \
             audio/x-raw,format=S16LE,channels={MIC_CHANNELS},rate={MIC_SAMPLE_RATE} ! \
             {noise_stage}\
             volume name=mic_input_gain volume={gain} ! \
             tee name=t \
             t. ! queue max-size-buffers={MIC_MAIN_QUEUE_MAX_BUFFERS} max-size-time={MIC_MAIN_QUEUE_MAX_TIME_NS} leaky=downstream ! \
                  audio/x-raw,format=S16LE,channels={MIC_CHANNELS},rate={MIC_SAMPLE_RATE} ! \
                  appsink name=asink emit-signals=true max-buffers={MIC_APPSINK_MAX_BUFFERS} drop=true \
             t. ! queue max-size-buffers=8 leaky=downstream ! \
                  audio/x-raw,format=S16LE,channels={MIC_CHANNELS},rate={MIC_SAMPLE_RATE} ! \
                  appsink name=level_sink emit-signals=false sync=false max-buffers=8 drop=true"
        )
    } else {
        // Single-branch variant: no tee, no level sink.
        format!(
            "{source_desc} ! \
             audioconvert ! audioresample ! \
             audio/x-raw,format=S16LE,channels={MIC_CHANNELS},rate={MIC_SAMPLE_RATE} ! \
             {noise_stage}\
             volume name=mic_input_gain volume={gain} ! \
             queue max-size-buffers={MIC_MAIN_QUEUE_MAX_BUFFERS} max-size-time={MIC_MAIN_QUEUE_MAX_TIME_NS} leaky=downstream ! \
             appsink name=asink emit-signals=true max-buffers={MIC_APPSINK_MAX_BUFFERS} drop=true"
        )
    }
}

/// Returns the `webrtcdsp` pipeline stage, including its trailing ` ! `.
///
/// The result is an empty string when suppression is disabled or when no
/// `webrtcdsp` element factory can be found, so the caller can splice it
/// into a pipeline description unconditionally.
fn microphone_noise_suppression_stage(enabled: bool) -> String {
    if !enabled {
        return String::new();
    }
    // Only probe the registry once suppression has actually been requested.
    if gst::ElementFactory::find("webrtcdsp").is_none() {
        return String::new();
    }
    let level = mic_noise_suppression_level();
    format!(
        "webrtcdsp echo-cancel=false noise-suppression=true noise-suppression-level={} high-pass-filter=true gain-control=false limiter=true ! ",
        level
    )
}

/// Maps `MIC_NOISE_SUPPRESSION_LEVEL_ENV` to one of the canonical level names
/// `low`, `moderate`, `high` or `very-high`.
///
/// Matching is case-insensitive on the trimmed value and accepts a few
/// aliases; anything unset or unrecognised falls back to `very-high`.
fn mic_noise_suppression_level() -> &'static str {
    const FALLBACK: &str = "very-high";
    let raw = match std::env::var(MIC_NOISE_SUPPRESSION_LEVEL_ENV) {
        Ok(raw) => raw,
        Err(_) => return FALLBACK,
    };
    let normalized = raw.trim().to_ascii_lowercase();
    match normalized.as_str() {
        "low" => "low",
        "moderate" | "medium" => "moderate",
        "high" => "high",
        "very-high" | "very_high" | "veryhigh" | "aggressive" => "very-high",
        _ => FALLBACK,
    }
}

/// Picks the duration, in microseconds, to attribute to a captured buffer.
///
/// The buffer's own duration is used only when `duration_matches_pcm_payload`
/// accepts it against the duration implied by `bytes` of PCM payload;
/// otherwise the payload-derived value is used. The result is never zero.
fn buffer_duration_us(buf: &gst::BufferRef, bytes: usize) -> u64 {
    let from_payload_us = pcm_payload_duration_us(bytes);
    let reported_us = buf.duration().map(|ts| ts.nseconds() / 1_000);
    let chosen_us = match reported_us {
        Some(us) if duration_matches_pcm_payload(us, from_payload_us) => us,
        _ => from_payload_us,
    };
    chosen_us.max(1)
}

/// Converts a PCM payload size in bytes to its play-out time in microseconds.
///
/// Only whole frames (`MIC_CHANNELS * MIC_SAMPLE_BYTES` bytes each) count and
/// the result is rounded down. The maths runs in `u128` and saturates at
/// `u64::MAX`.
fn pcm_payload_duration_us(bytes: usize) -> u64 {
    let frame_bytes = (MIC_CHANNELS * MIC_SAMPLE_BYTES).max(1);
    let whole_frames = (bytes / frame_bytes) as u128;
    let micros = whole_frames * 1_000_000u128 / MIC_SAMPLE_RATE as u128;
    let ceiling = u64::MAX as u128;
    if micros > ceiling {
        u64::MAX
    } else {
        micros as u64
    }
}

#[cfg(not(coverage))]
/// Emits a rate-limited warning for a clamped microphone source timestamp.
///
/// A process-wide counter tracks calls; the first ten calls are logged, then
/// every 300th, so a persistently lagging source cannot flood the log.
fn log_microphone_lag_clamped_source(
    timing: crate::live_capture_clock::RebasedSourcePts,
    bytes: usize,
) {
    static MIC_LAG_CLAMPED_PACKETS: AtomicU64 = AtomicU64::new(0);
    let packet_index = MIC_LAG_CLAMPED_PACKETS.fetch_add(1, Ordering::Relaxed);
    let should_log = packet_index < 10 || packet_index.is_multiple_of(300);
    if !should_log {
        return;
    }
    warn!(
        packet_index,
        bytes,
        source_pts_us = timing.source_pts_us.unwrap_or_default(),
        capture_now_us = timing.capture_now_us,
        packet_pts_us = timing.packet_pts_us,
        "🎤 clamped laggy microphone source timestamp before bundled uplink"
    );
}

#[cfg(coverage)]
/// No-op stand-in for the logging variant, compiled only under `cfg(coverage)`.
/// It keeps the same signature so call sites compile unchanged.
fn log_microphone_lag_clamped_source(
    _timing: crate::live_capture_clock::RebasedSourcePts,
    _bytes: usize,
) {
}

/// Splits one captured PCM sample into packets of roughly `target_bytes` each.
///
/// * `base_pts_us` – PTS of the first byte of `data`; each packet's PTS is
///   this base plus the payload duration of the bytes preceding it.
/// * `data` – interleaved PCM bytes.
/// * `target_bytes` – requested packet size; raised to at least one frame and
///   aligned down to a whole number of frames.
///
/// Every packet except the last holds whole frames only; the last packet
/// takes whatever remains. Packets are marked as S16LE PCM. Empty `data`
/// yields an empty queue.
fn split_audio_sample(base_pts_us: u64, data: &[u8], target_bytes: usize) -> VecDeque<AudioPacket> {
    let frame_bytes = (MIC_CHANNELS * MIC_SAMPLE_BYTES).max(1);
    let packet_bytes = frame_aligned_packet_bytes(target_bytes.max(frame_bytes));
    let total = data.len();
    let mut out = VecDeque::new();
    let mut cursor = 0usize;
    while cursor < total {
        let left = total - cursor;
        let chunk_len = if left > packet_bytes {
            // More data follows this packet, so cut on a frame boundary.
            match packet_bytes - packet_bytes % frame_bytes {
                0 => frame_bytes.min(left),
                aligned => aligned,
            }
        } else {
            // Final packet: ship everything that is left.
            left
        };
        let stop = cursor.saturating_add(chunk_len).min(total);
        if stop == cursor {
            break;
        }
        let chunk_duration_us = pcm_payload_duration_us(chunk_len);
        let pts_offset_us = pcm_payload_duration_us(cursor);
        let mut packet = AudioPacket {
            id: 0,
            pts: base_pts_us.saturating_add(pts_offset_us),
            data: data[cursor..stop].to_vec(),
            frame_duration_us: chunk_duration_us.min(u64::from(u32::MAX)) as u32,
            ..Default::default()
        };
        audio_transport::mark_packet_pcm_s16le(&mut packet);
        out.push_back(packet);
        cursor = stop;
    }
    out
}

/// Returns the frame-aligned packet size, in bytes, for the configured target
/// packet duration.
///
/// The duration from `mic_packet_target_duration_us` is clamped to
/// `1_000..=100_000` microseconds and converted to at least one frame.
fn mic_packet_target_bytes() -> usize {
    let target_us = mic_packet_target_duration_us().clamp(1_000, 100_000) as u128;
    let frames_wide = (MIC_SAMPLE_RATE as u128 * target_us / 1_000_000u128).max(1);
    let frames = frames_wide.min(usize::MAX as u128) as usize;
    let frame_bytes = MIC_CHANNELS * MIC_SAMPLE_BYTES;
    frame_aligned_packet_bytes(frames.saturating_mul(frame_bytes))
}

/// Rounds `bytes` down to a whole number of PCM frames, never returning less
/// than one frame.
fn frame_aligned_packet_bytes(bytes: usize) -> usize {
    let frame_bytes = (MIC_CHANNELS * MIC_SAMPLE_BYTES).max(1);
    let whole_frames = std::cmp::max(bytes / frame_bytes, 1);
    whole_frames.saturating_mul(frame_bytes)
}

/// Decides whether a reported buffer duration is plausible for its PCM payload.
///
/// A zero report is always rejected. When the payload-derived duration is at
/// most one microsecond any non-zero report is accepted. Otherwise the report
/// must lie within a factor of eight of the payload duration (inclusive on
/// both ends, lower bound at least 1).
fn duration_matches_pcm_payload(reported_us: u64, payload_us: u64) -> bool {
    match (reported_us, payload_us) {
        (0, _) => false,
        (_, 0 | 1) => true,
        (reported, payload) => {
            let floor = (payload / 8).max(1);
            let ceiling = payload.saturating_mul(8);
            (floor..=ceiling).contains(&reported)
        }
    }
}

/// Returns the Pulse buffer time: the positive integer in
/// `MIC_PULSE_BUFFER_TIME_ENV`, or `DEFAULT_MIC_PULSE_BUFFER_TIME_US` when the
/// variable is unset, unparsable or zero.
fn mic_pulse_buffer_time_us() -> u64 {
    positive_u64_env(MIC_PULSE_BUFFER_TIME_ENV, DEFAULT_MIC_PULSE_BUFFER_TIME_US)
}

/// Returns the Pulse latency time: the positive integer in
/// `MIC_PULSE_LATENCY_TIME_ENV`, or `DEFAULT_MIC_PULSE_LATENCY_TIME_US` when
/// the variable is unset, unparsable or zero.
fn mic_pulse_latency_time_us() -> u64 {
    positive_u64_env(
        MIC_PULSE_LATENCY_TIME_ENV,
        DEFAULT_MIC_PULSE_LATENCY_TIME_US,
    )
}

/// Returns the target packet duration: the positive integer in
/// `MIC_PACKET_TARGET_DURATION_ENV`, or `DEFAULT_MIC_PACKET_TARGET_DURATION_US`
/// when the variable is unset, unparsable or zero. No range clamp is applied
/// here; `mic_packet_target_bytes` clamps the value it reads.
fn mic_packet_target_duration_us() -> u64 {
    positive_u64_env(
        MIC_PACKET_TARGET_DURATION_ENV,
        DEFAULT_MIC_PACKET_TARGET_DURATION_US,
    )
}

/// Reads a strictly positive `u64` from the environment variable `name`.
///
/// The value is trimmed before parsing. `default_value` is returned when the
/// variable is unset, not valid Unicode, not a `u64`, or zero.
fn positive_u64_env(name: &str, default_value: u64) -> u64 {
    match std::env::var(name) {
        Ok(raw) => match raw.trim().parse::<u64>() {
            Ok(parsed) if parsed > 0 => parsed,
            _ => default_value,
        },
        Err(_) => default_value,
    }
}

/// Reports whether `REQUIRE_EXPLICIT_MEDIA_SOURCES_ENV` is set to a truthy
/// value as defined by `bool_env_enabled`.
fn explicit_media_sources_required() -> bool {
    bool_env_enabled(REQUIRE_EXPLICIT_MEDIA_SOURCES_ENV)
}

/// Reports whether `MIC_NOISE_SUPPRESSION_ENV` is set to a truthy value as
/// defined by `bool_env_enabled`.
fn mic_noise_suppression_from_env() -> bool {
    bool_env_enabled(MIC_NOISE_SUPPRESSION_ENV)
}

/// Interprets the environment variable `name` as a boolean switch.
///
/// After trimming, `1` and the case-insensitive words `true`, `yes` and `on`
/// count as enabled. Everything else, including an unset or non-Unicode
/// variable, counts as disabled.
fn bool_env_enabled(name: &str) -> bool {
    const TRUTHY_WORDS: [&str; 3] = ["true", "yes", "on"];
    match std::env::var(name) {
        Ok(raw) => {
            let flag = raw.trim();
            flag == "1"
                || TRUTHY_WORDS
                    .iter()
                    .any(|word| flag.eq_ignore_ascii_case(word))
        }
        Err(_) => false,
    }
}

/// Tells whether `source` looks like a Pulse source name.
///
/// The trimmed name must start with `alsa_input.`, `bluez_input.` or
/// `input.`; the comparison is case-sensitive.
fn looks_like_pulse_source_name(source: &str) -> bool {
    const PULSE_PREFIXES: [&str; 3] = ["alsa_input.", "bluez_input.", "input."];
    let candidate = source.trim();
    PULSE_PREFIXES
        .iter()
        .any(|prefix| candidate.starts_with(*prefix))
}

/// Returns the static microphone gain from `MIC_GAIN_ENV`.
///
/// Falls back to unity gain (`1.0`) when the variable is unset or
/// `parse_mic_gain` rejects its value.
fn mic_gain_from_env() -> f64 {
    match std::env::var(MIC_GAIN_ENV) {
        Ok(raw) => parse_mic_gain(&raw).unwrap_or(1.0),
        Err(_) => 1.0,
    }
}

/// Parses a gain value from free-form text.
///
/// Only the first whitespace-separated token is considered. Returns `None`
/// when there is no token, it is not an `f64`, or it is not finite;
/// otherwise returns the value clamped by `clamp_mic_gain`.
fn parse_mic_gain(raw: &str) -> Option<f64> {
    let token = raw.split_ascii_whitespace().next()?;
    match token.parse::<f64>() {
        Ok(parsed) if parsed.is_finite() => Some(clamp_mic_gain(parsed)),
        _ => None,
    }
}

/// Limits a gain factor to the supported range `0.0..=4.0`.
///
/// `NaN` is passed through unchanged; callers filter non-finite input first.
fn clamp_mic_gain(value: f64) -> f64 {
    const MIN_GAIN: f64 = 0.0;
    const MAX_GAIN: f64 = 4.0;
    if value < MIN_GAIN {
        MIN_GAIN
    } else if value > MAX_GAIN {
        MAX_GAIN
    } else {
        value
    }
}

/// Renders a gain for a pipeline description: clamped by `clamp_mic_gain`,
/// then printed with exactly three decimal places.
fn format_mic_gain_for_gst(gain: f64) -> String {
    let bounded = clamp_mic_gain(gain);
    format!("{:.3}", bounded)
}

/// Starts a background watcher that applies live gain changes to `volume`.
///
/// Does nothing unless `MIC_GAIN_CONTROL_ENV` names a file. That file is
/// polled every 100 ms through `read_mic_gain_control`; each time the parsed
/// gain differs from the last applied one it is written to the element's
/// `volume` property and logged.
///
/// The thread holds only a weak reference to `volume`, so it does not keep
/// the element alive and it exits on its own once the element has been
/// released (for example when the owning pipeline is dropped). Holding a
/// strong reference would leak one polling thread and one element per capture.
///
/// * `volume` – the gain element to control; something else (normally the
///   pipeline it belongs to) must keep it alive for the watcher to keep running.
fn maybe_spawn_mic_gain_control(volume: gst::Element) {
    let Ok(path) = std::env::var(MIC_GAIN_CONTROL_ENV) else {
        return;
    };
    // A blank value cannot name a control file; treat it like "unset".
    if path.trim().is_empty() {
        return;
    }
    let path = std::path::PathBuf::from(path);
    let volume_weak = volume.downgrade();
    // Release our strong reference before the thread starts.
    drop(volume);
    thread::spawn(move || {
        let mut last_gain = None;
        loop {
            {
                // Upgrade only for the duration of one poll so the strong
                // reference is never held across the sleep below.
                let Some(volume) = volume_weak.upgrade() else {
                    tracing::debug!("🎤 mic gain control watcher stopped: volume element released");
                    break;
                };
                match read_mic_gain_control(&path) {
                    Some(gain) if last_gain != Some(gain) => {
                        volume.set_property("volume", gain);
                        last_gain = Some(gain);
                        tracing::info!("🎤 mic gain set to {gain:.2}x");
                    }
                    _ => {}
                }
            }
            thread::sleep(Duration::from_millis(100));
        }
    });
}

/// Keeps `spawn_mic_level_tap` explicit because it sits on microphone capture setup, where host audio stacks expose different source names and latency controls.
/// Inputs are the typed parameters; output is the return value or side effect.
fn spawn_mic_level_tap(sink: gst_app::AppSink, path: PathBuf) -> Arc<AtomicBool> {
    let running = Arc::new(AtomicBool::new(true));
    let thread_running = Arc::clone(&running);
    thread::spawn(move || {
        while thread_running.load(AtomicOrdering::Acquire) {
            if let Some(sample) = sink.try_pull_sample(gst::ClockTime::from_mseconds(250))
                && let Some(buffer) = sample.buffer()
                && let Ok(map) = buffer.map_readable()
            {
                let level = pcm_peak_fraction(map.as_slice());
                if let Err(err) = write_mic_level_tap(&path, level) {
                    tracing::debug!("🎤 local uplink level tap write failed: {err:#}");
                }
            }
        }
    });
    running
}

/// Returns the peak absolute sample of a little-endian 16-bit PCM buffer as a
/// fraction of full scale, in `0.0..=1.0`.
///
/// A trailing odd byte is ignored and an empty buffer yields `0.0`.
/// `i16::MIN` slightly exceeds `i16::MAX` in magnitude, hence the final clamp.
fn pcm_peak_fraction(bytes: &[u8]) -> f64 {
    let mut loudest: u16 = 0;
    for pair in bytes.chunks_exact(2) {
        let magnitude = i16::from_le_bytes([pair[0], pair[1]]).unsigned_abs();
        if magnitude > loudest {
            loudest = magnitude;
        }
    }
    let fraction = f64::from(loudest) / f64::from(i16::MAX);
    fraction.clamp(0.0, 1.0)
}

/// Publishes `level` to the level-tap file at `path`.
///
/// The value is written with six decimals and a trailing newline to a
/// sibling file whose extension is replaced by `tmp`, which is then renamed
/// over `path`, so the target is replaced in one step rather than rewritten
/// in place.
///
/// # Errors
/// Returns the I/O error, with the affected path as context, when either the
/// write or the rename fails.
fn write_mic_level_tap(path: &StdPath, level: f64) -> Result<()> {
    let staging = path.with_extension("tmp");
    let contents = format!("{level:.6}\n");
    std::fs::write(&staging, contents)
        .with_context(|| format!("writing {}", staging.display()))?;
    std::fs::rename(&staging, path).with_context(|| format!("publishing {}", path.display()))
}

/// Reads the gain control file at `path` and parses it with `parse_mic_gain`.
///
/// Returns `None` when the file cannot be read as UTF-8 text or its content
/// is rejected by the parser.
fn read_mic_gain_control(path: &StdPath) -> Option<f64> {
    let contents = std::fs::read_to_string(path).ok()?;
    parse_mic_gain(&contents)
}

impl Drop for MicrophoneCapture {
    /// Shuts the capture down: clears the level-tap stop flag (when a tap
    /// exists) and then moves the pipeline to `Null`. The result of the state
    /// change is deliberately ignored because `drop` cannot report it.
    fn drop(&mut self) {
        match self.level_tap_running.as_ref() {
            Some(flag) => flag.store(false, AtomicOrdering::Release),
            None => {}
        }
        let _ = self.pipeline.set_state(gst::State::Null);
    }
}

// Unit tests live in `microphone/tests/mod.rs` and are compiled for test builds only.
#[cfg(test)]
#[path = "microphone/tests/mod.rs"]
mod tests;