gui: Add terminator packet and 200ms voice hold for VAD
Implements proper voice activity detection with:
- a 200ms hold period after audio drops below the threshold, to prevent choppy cutoffs;
- a terminator packet (end_bit=true) sent when speech ends, to signal stream completion;
- a TransmitState enum to track transmission state across frames.
This ensures other Mumble clients receive proper end-of-speech signaling for clean audio termination and correct "talking" indicator behavior. Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
+48
-6
@@ -11,7 +11,20 @@ use crate::imp;
|
||||
|
||||
static DF_MODEL: Asset = asset!("/assets/DeepFilterNet3_ll_onnx.tar.gz");
|
||||
// TODO: make this user configurable.
|
||||
static DEFAULT_NOISE_FLOOR: f32 = 0.001;
|
||||
static DEFAULT_NOISE_FLOOR: f32 = 0.0007;
|
||||
// 200ms hold at 48kHz sample rate
|
||||
static HOLD_SAMPLES_MAX: usize = 48000 / 5; // 9600 samples = 200ms
|
||||
|
||||
/// Outcome of voice-activity detection for the audio that was just processed.
///
/// The value tells the sender what to do with that audio: keep streaming it,
/// send it as the closing frame of the utterance, or drop it.
#[derive(Clone, Copy, Debug, Eq, PartialEq)]
pub enum TransmitState {
    /// Send the frame normally: the signal is at or above the threshold, or it
    /// has dropped below it but the hold period has not run out yet.
    Transmitting,
    /// The hold period has run out: send this frame as the terminator
    /// (end_bit = true).
    Terminator,
    /// Below the threshold and not currently transmitting: send nothing.
    Silent,
}
|
||||
|
||||
enum DenoisingModelState {
|
||||
Nothing,
|
||||
@@ -79,6 +92,10 @@ pub struct AudioProcessor {
|
||||
spawn: imp::SpawnHandle,
|
||||
buffer: Vec<f32>,
|
||||
noise_floor: f32,
|
||||
/// Whether we were transmitting in the previous frame
|
||||
was_transmitting: bool,
|
||||
/// Number of samples we've been below threshold (for hold period)
|
||||
hold_samples: usize,
|
||||
}
|
||||
|
||||
impl AudioProcessor {
|
||||
@@ -88,6 +105,8 @@ impl AudioProcessor {
|
||||
spawn: imp::SpawnHandle::current(),
|
||||
buffer: Vec::new(),
|
||||
noise_floor: DEFAULT_NOISE_FLOOR,
|
||||
was_transmitting: false,
|
||||
hold_samples: 0,
|
||||
}
|
||||
}
|
||||
|
||||
@@ -97,12 +116,14 @@ impl AudioProcessor {
|
||||
spawn: imp::SpawnHandle::current(),
|
||||
buffer: Vec::new(),
|
||||
noise_floor: DEFAULT_NOISE_FLOOR,
|
||||
was_transmitting: false,
|
||||
hold_samples: 0,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl AudioProcessor {
|
||||
pub fn process(&mut self, audio: &[f32], channels: usize, output: &mut Vec<f32>) {
|
||||
pub fn process(&mut self, audio: &[f32], channels: usize, output: &mut Vec<f32>) -> TransmitState {
|
||||
let mut include_raw = true;
|
||||
if self.denoise {
|
||||
with_denoising_model(&self.spawn, |df| {
|
||||
@@ -138,15 +159,36 @@ impl AudioProcessor {
|
||||
output.extend(audio.iter().step_by(channels).copied());
|
||||
}
|
||||
|
||||
// Adds thresholding to prevent sending audio when things are really quiet.
|
||||
// Calculate average amplitude for VAD
|
||||
let avg: f32 = if output.is_empty() {
|
||||
0.0
|
||||
} else {
|
||||
output.iter().map(|x| x.abs()).sum::<f32>() / output.len() as f32
|
||||
};
|
||||
if avg < self.noise_floor {
|
||||
output.clear();
|
||||
}
|
||||
|
||||
let above_threshold = avg >= self.noise_floor;
|
||||
let samples_in_frame = output.len();
|
||||
|
||||
let state = if above_threshold {
|
||||
// Above threshold - reset hold counter and transmit
|
||||
self.hold_samples = 0;
|
||||
self.was_transmitting = true;
|
||||
TransmitState::Transmitting
|
||||
} else if self.was_transmitting && self.hold_samples < HOLD_SAMPLES_MAX {
|
||||
// Below threshold but in hold period - keep transmitting
|
||||
self.hold_samples += samples_in_frame;
|
||||
TransmitState::Transmitting
|
||||
} else if self.was_transmitting {
|
||||
// Hold period expired - send terminator
|
||||
self.was_transmitting = false;
|
||||
self.hold_samples = 0;
|
||||
TransmitState::Terminator
|
||||
} else {
|
||||
// Not transmitting and below threshold - stay silent
|
||||
TransmitState::Silent
|
||||
};
|
||||
|
||||
state
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
+29
-16
@@ -1,4 +1,4 @@
|
||||
use crate::effects::{AudioProcessor, AudioProcessorSender};
|
||||
use crate::effects::{AudioProcessor, AudioProcessorSender, TransmitState};
|
||||
use color_eyre::eyre::{eyre, Error};
|
||||
use cpal::traits::{DeviceTrait, HostTrait, StreamTrait as _};
|
||||
use futures::io::{AsyncRead, AsyncWrite};
|
||||
@@ -23,6 +23,31 @@ pub struct AudioSystem {
|
||||
const SAMPLE_RATE: u32 = 48_000;
|
||||
const PACKET_SAMPLES: u32 = 960;
|
||||
|
||||
fn encode_and_send(
|
||||
state: TransmitState,
|
||||
output_buffer: &mut Vec<f32>,
|
||||
encoder: &mut opus::Encoder,
|
||||
each: &mut impl FnMut(Vec<u8>, bool),
|
||||
) {
|
||||
let (is_terminator, should_encode) = match state {
|
||||
TransmitState::Silent => return,
|
||||
TransmitState::Transmitting => (false, output_buffer.len() >= PACKET_SAMPLES as usize),
|
||||
TransmitState::Terminator => {
|
||||
output_buffer.resize(PACKET_SAMPLES as usize, 0.0);
|
||||
(true, true)
|
||||
}
|
||||
};
|
||||
|
||||
if should_encode {
|
||||
let remainder = output_buffer.split_off(PACKET_SAMPLES as usize);
|
||||
let frame = replace(output_buffer, remainder);
|
||||
match encoder.encode_vec_float(&frame, frame.len() * 2) {
|
||||
Ok(encoded) => each(encoded, is_terminator),
|
||||
Err(e) => error!("error encoding {} samples: {e:?}", frame.len()),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
type Buffer = Arc<Mutex<dasp_ring_buffer::Bounded<Vec<i16>>>>;
|
||||
|
||||
impl AudioSystem {
|
||||
@@ -79,7 +104,7 @@ impl AudioSystem {
|
||||
|
||||
pub fn start_recording(
|
||||
&mut self,
|
||||
mut each: impl FnMut(Vec<u8>) + Send + 'static,
|
||||
mut each: impl FnMut(Vec<u8>, bool) + Send + 'static,
|
||||
) -> Result<(), Error> {
|
||||
let config = self.choose_config(self.input.supported_input_configs()?)?;
|
||||
info!(
|
||||
@@ -97,20 +122,8 @@ impl AudioSystem {
|
||||
if let Some(new_processor) = processors.take() {
|
||||
current_processor = new_processor;
|
||||
}
|
||||
current_processor.process(frame, config.channels as usize, &mut output_buffer);
|
||||
if output_buffer.len() < PACKET_SAMPLES as usize {
|
||||
return;
|
||||
}
|
||||
let remainder = output_buffer.split_off(PACKET_SAMPLES as usize);
|
||||
let frame = replace(&mut output_buffer, remainder);
|
||||
match encoder.encode_vec_float(&frame, frame.len() * 2) {
|
||||
Ok(buf) => {
|
||||
each(buf);
|
||||
}
|
||||
Err(e) => {
|
||||
error!("error encoding {} samples: {e:?}", frame.len());
|
||||
}
|
||||
}
|
||||
let state = current_processor.process(frame, config.channels as usize, &mut output_buffer);
|
||||
encode_and_send(state, &mut output_buffer, &mut encoder, &mut each);
|
||||
};
|
||||
|
||||
match self
|
||||
|
||||
+52
-17
@@ -1,7 +1,9 @@
|
||||
use crate::app::Command;
|
||||
use crate::effects::{AudioProcessor, AudioProcessorSender};
|
||||
use crate::effects::{AudioProcessor, AudioProcessorSender, TransmitState};
|
||||
use color_eyre::eyre::{bail, eyre, Error};
|
||||
use crossbeam::atomic::AtomicCell;
|
||||
use dioxus::prelude::*;
|
||||
use std::sync::Arc;
|
||||
use futures::{AsyncRead, AsyncWrite};
|
||||
use gloo_timers::future::TimeoutFuture;
|
||||
use js_sys::Float32Array;
|
||||
@@ -118,7 +120,7 @@ impl AudioSystem {
|
||||
self.processors.store(Some(processor))
|
||||
}
|
||||
|
||||
pub fn start_recording(&mut self, each: impl FnMut(Vec<u8>) + 'static) -> Result<(), Error> {
|
||||
pub fn start_recording(&mut self, each: impl FnMut(Vec<u8>, bool) + 'static) -> Result<(), Error> {
|
||||
let audio_context_worklet = self.webctx.clone();
|
||||
let processors = self.processors.clone();
|
||||
spawn(async move {
|
||||
@@ -222,22 +224,24 @@ impl PromiseExt for Promise {
|
||||
}
|
||||
}
|
||||
|
||||
fn process_audio(frame: &JsValue, processor: &mut AudioProcessor) {
|
||||
fn process_audio(frame: &JsValue, processor: &mut AudioProcessor) -> TransmitState {
|
||||
let Ok(samples) = Reflect::get(&frame, &"data".into()) else {
|
||||
return;
|
||||
return TransmitState::Silent;
|
||||
};
|
||||
let Ok(samples) = samples.dyn_into::<Float32Array>() else {
|
||||
return;
|
||||
return TransmitState::Silent;
|
||||
};
|
||||
let input = samples.to_vec();
|
||||
let mut output = Vec::with_capacity(input.len());
|
||||
processor.process(&input, 1, &mut output);
|
||||
let state = processor.process(&input, 1, &mut output);
|
||||
samples.copy_from(&output);
|
||||
|
||||
state
|
||||
}
|
||||
|
||||
async fn run_encoder_worklet(
|
||||
audio_context: &AudioContext,
|
||||
mut each: impl FnMut(Vec<u8>) + 'static,
|
||||
mut each: impl FnMut(Vec<u8>, bool) + 'static,
|
||||
processors: AudioProcessorSender,
|
||||
) -> Result<AudioWorkletNode, Error> {
|
||||
let constraints = MediaStreamConstraints::new();
|
||||
@@ -262,12 +266,19 @@ async fn run_encoder_worklet(
|
||||
let encoder_error: Closure<dyn FnMut(JsValue)> =
|
||||
Closure::new(|e| error!("error encoding audio {:?}", e));
|
||||
|
||||
// Shared state to signal terminator between onmessage and output closures
|
||||
// The output closure runs asynchronously after encoding completes
|
||||
let pending_terminator = Arc::new(AtomicCell::new(false));
|
||||
let pending_terminator_output = pending_terminator.clone();
|
||||
|
||||
// This knows what MediaStreamTrackGenerator to use as it closes around it
|
||||
let output: Closure<dyn FnMut(EncodedAudioChunk)> =
|
||||
Closure::new(move |audio_data: EncodedAudioChunk| {
|
||||
let mut array = vec![0u8; audio_data.byte_length() as usize];
|
||||
audio_data.copy_to_with_u8_slice(&mut array);
|
||||
each(array);
|
||||
// Check if this frame was marked as a terminator
|
||||
let is_terminator = pending_terminator_output.swap(false);
|
||||
each(array, is_terminator);
|
||||
});
|
||||
|
||||
let audio_encoder = AudioEncoder::new(&AudioEncoderInit::new(
|
||||
@@ -294,17 +305,41 @@ async fn run_encoder_worklet(
|
||||
}
|
||||
|
||||
let frame = event.data();
|
||||
process_audio(&frame, &mut current_processor);
|
||||
let state = process_audio(&frame, &mut current_processor);
|
||||
|
||||
match AudioData::new(frame.unchecked_ref()) {
|
||||
Ok(data) => {
|
||||
let _ = audio_encoder.encode(&data);
|
||||
match state {
|
||||
TransmitState::Silent => {
|
||||
// Don't encode or send anything
|
||||
return;
|
||||
}
|
||||
Err(err) => {
|
||||
error!(
|
||||
"error creating AudioData object {:?} during event {:?}",
|
||||
err, event,
|
||||
);
|
||||
TransmitState::Transmitting => {
|
||||
// Normal transmission
|
||||
match AudioData::new(frame.unchecked_ref()) {
|
||||
Ok(data) => {
|
||||
let _ = audio_encoder.encode(&data);
|
||||
}
|
||||
Err(err) => {
|
||||
error!(
|
||||
"error creating AudioData object {:?} during event {:?}",
|
||||
err, event,
|
||||
);
|
||||
}
|
||||
}
|
||||
}
|
||||
TransmitState::Terminator => {
|
||||
// Mark this as a terminator before encoding
|
||||
pending_terminator.store(true);
|
||||
match AudioData::new(frame.unchecked_ref()) {
|
||||
Ok(data) => {
|
||||
let _ = audio_encoder.encode(&data);
|
||||
}
|
||||
Err(err) => {
|
||||
error!(
|
||||
"error creating AudioData object {:?} during event {:?}",
|
||||
err, event,
|
||||
);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
});
|
||||
|
||||
+2
-2
@@ -114,14 +114,14 @@ pub async fn network_loop<R: imp::ImpRead, W: imp::ImpWrite>(
|
||||
{
|
||||
let send_chan = send_chan.clone();
|
||||
let mut sequence_num = 0;
|
||||
audio.start_recording(move |opus_frame| {
|
||||
audio.start_recording(move |opus_frame, is_terminator| {
|
||||
let _ =
|
||||
send_chan.unbounded_send(ControlPacket::UDPTunnel(Box::new(VoicePacket::Audio {
|
||||
_dst: std::marker::PhantomData,
|
||||
target: 0,
|
||||
session_id: (),
|
||||
seq_num: sequence_num,
|
||||
payload: VoicePacketPayload::Opus(opus_frame.into(), false),
|
||||
payload: VoicePacketPayload::Opus(opus_frame.into(), is_terminator),
|
||||
position_info: None,
|
||||
})));
|
||||
sequence_num = sequence_num.wrapping_add(2);
|
||||
|
||||
Reference in New Issue
Block a user