Files
mumble-web2/gui/src/effects.rs
T
liamwarfield 3ddf892169 gui: Add terminator packet and 200ms voice hold for VAD
Implements proper voice activity detection with:
- 200ms hold period after audio drops below threshold to prevent choppy cutoffs
- Terminator packet (end_bit=true) when speech ends to signal stream completion
- TransmitState enum to track transmission state across frames

This ensures other Mumble clients receive proper end-of-speech signaling
for clean audio termination and correct "talking" indicator behavior.

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
2026-01-19 14:57:12 -07:00

196 lines
6.6 KiB
Rust

use crossbeam::atomic::AtomicCell;
use df::tract::{mut_slice_as_arrayviewmut, slice_as_arrayview};
use df::tract::{DfParams, DfTract, RuntimeParams};
use dioxus::prelude::{asset, manganis, Asset};
use dioxus_asset_resolver::read_asset_bytes;
use std::cell::RefCell;
use std::sync::Arc;
use tracing::{error, info};
use crate::imp;
/// Bundled DeepFilterNet3 model archive, resolved through the asset pipeline.
static DF_MODEL: Asset = asset!("/assets/DeepFilterNet3_ll_onnx.tar.gz");
// TODO: make this user configurable.
/// VAD threshold on the frame's mean absolute amplitude.
const DEFAULT_NOISE_FLOOR: f32 = 0.0007;
// 200ms hold at 48kHz sample rate
const HOLD_SAMPLES_MAX: usize = 48000 / 5; // 9600 samples = 200ms
/// Indicates the transmission state after processing audio: what the caller
/// should do with the frame just written to the output buffer.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum TransmitState {
/// Audio is above threshold, or below but within hold period - transmit normally
Transmitting,
/// Hold period expired - send this frame as terminator (end_bit = true)
Terminator,
/// Silent and not transmitting - don't send anything
Silent,
}
/// Lifecycle of the per-thread denoising engine.
// NOTE(review): "Availible" is a typo for "Available". It is used in every
// match arm of `with_denoising_model`, so renaming must be done in one pass
// over the whole file; left as-is in this doc-only change.
enum DenoisingModelState {
/// Model has not been requested on this thread yet.
Nothing,
/// Download in flight; the spawned task fills the cell with parsed params.
Downloading(Arc<AtomicCell<Option<DfParams>>>),
/// Engine instantiated and ready to process audio.
Availible(Box<DfTract>),
}
/// Runs `func` against the thread-local denoising engine, if it is ready.
///
/// First call on a thread kicks off an async download of the model via
/// `spawn` and returns `None`; once the downloaded params land, the engine is
/// instantiated and `Some(func(..))` is returned on this and later calls.
/// Returns `None` whenever the engine is not (yet) available.
fn with_denoising_model<O>(
spawn: &imp::SpawnHandle,
func: impl FnOnce(&mut DfTract) -> O,
) -> Option<O> {
// Using a thread local is super gross, but DfTract is not Send (so it can never leave the current
// thread) while AudioProcessing itself might change threads whenever.
thread_local! {
static STATE: RefCell<DenoisingModelState> = const { RefCell::new(DenoisingModelState::Nothing) };
}
STATE.with_borrow_mut(|state| match state {
DenoisingModelState::Nothing => {
// First use: hand the spawned task a shared cell to deposit the
// parsed params into, then move to Downloading.
let cell = Arc::new(AtomicCell::new(None));
let cell_task = cell.clone();
*state = DenoisingModelState::Downloading(cell);
spawn.spawn(async move {
let model_bytes = match read_asset_bytes(&DF_MODEL).await {
Ok(b) => b,
Err(e) => {
// NOTE(review): on any failure below the cell is never filled,
// so this thread stays in Downloading forever — no retry.
error!("could not read denoising model from \"{DF_MODEL}\": {e:?}");
return;
}
};
let params = match DfParams::from_bytes(&model_bytes) {
Ok(p) => p,
Err(e) => {
error!("could not load denoising model parameters: {e:?}");
return;
}
};
cell_task.store(Some(params));
});
None
}
DenoisingModelState::Downloading(cell) => {
// Poll the cell; `take` consumes the params, so engine creation
// gets exactly one attempt.
if let Some(params) = cell.take() {
// Single-channel runtime: `process` feeds mono audio.
let mut tract = match DfTract::new(params, &RuntimeParams::default_with_ch(1)) {
Ok(t) => Box::new(t),
Err(e) => {
// NOTE(review): params were consumed above, so after this
// error the state is permanently stuck in Downloading.
error!("could not create denoising engine: {e:?}");
return None;
}
};
info!("instantiated denoising engine");
// Run the caller's closure before stashing the engine so this
// call already benefits from the fresh instance.
let out = func(&mut tract);
*state = DenoisingModelState::Availible(tract);
Some(out)
} else {
None
}
}
DenoisingModelState::Availible(tract) => Some(func(tract)),
})
}
/// Per-stream audio pipeline: optional DeepFilterNet denoising followed by an
/// amplitude-threshold VAD with a hold period and terminator signaling.
pub struct AudioProcessor {
/// Whether to run the denoising model over incoming audio.
denoise: bool,
/// Handle used to spawn the async model-download task.
spawn: imp::SpawnHandle,
/// Mono samples buffered until a full model hop is available.
buffer: Vec<f32>,
/// VAD threshold compared against the frame's mean absolute amplitude.
noise_floor: f32,
/// Whether we were transmitting in the previous frame
was_transmitting: bool,
/// Number of samples we've been below threshold (for hold period)
hold_samples: usize,
}
impl AudioProcessor {
    /// Shared constructor logic; `denoise` selects whether the DeepFilterNet
    /// model is applied during `process`. Captures the current spawn handle,
    /// so this must be called on the thread/task the processor belongs to.
    fn with_denoise(denoise: bool) -> Self {
        AudioProcessor {
            denoise,
            spawn: imp::SpawnHandle::current(),
            buffer: Vec::new(),
            noise_floor: DEFAULT_NOISE_FLOOR,
            was_transmitting: false,
            hold_samples: 0,
        }
    }

    /// Creates a processor that passes audio through without denoising.
    pub fn new_plain() -> Self {
        Self::with_denoise(false)
    }

    /// Creates a processor that denoises audio with the DeepFilterNet model.
    pub fn new_denoising() -> Self {
        Self::with_denoise(true)
    }
}
impl AudioProcessor {
    /// Processes one frame of interleaved input audio.
    ///
    /// Extracts channel 0 from `audio` (`channels`-way interleaved), optionally
    /// denoises it, appends the resulting mono samples to `output`, and runs
    /// the amplitude VAD. Returns the [`TransmitState`] the caller should act
    /// on: keep transmitting, send this frame as the terminator, or stay silent.
    ///
    /// NOTE(review): the VAD averages *everything* in `output`, so this assumes
    /// the caller passes an empty/per-frame buffer — confirm at call sites.
    pub fn process(&mut self, audio: &[f32], channels: usize, output: &mut Vec<f32>) -> TransmitState {
        let mut include_raw = true;
        if self.denoise {
            with_denoising_model(&self.spawn, |df| {
                include_raw = false;
                // Buffer channel-0 samples until whole hops are available.
                self.buffer.extend(audio.iter().step_by(channels).copied());
                output.reserve(audio.len());
                let hop = df.hop_size;
                let mut i = 0;
                // Denoise one hop at a time, writing in place into `output`.
                while self.buffer[i..].len() >= hop {
                    let audio = &self.buffer[i..][..hop];
                    i += audio.len();
                    let j = output.len();
                    output.extend(std::iter::repeat_n(0f32, audio.len()));
                    let output = &mut output[j..];
                    df.process(
                        slice_as_arrayview(audio, &[audio.len()])
                            .into_shape((1, audio.len()))
                            .unwrap(),
                        mut_slice_as_arrayviewmut(output, &[output.len()])
                            .into_shape((1, output.len()))
                            .unwrap(),
                    );
                }
                // Discard the consumed samples; a partial hop stays buffered.
                self.buffer.drain(..i);
            });
        }
        if include_raw {
            // Denoising disabled or model not ready: pass channel 0 through.
            output.extend(audio.iter().step_by(channels).copied());
        }
        // Mean absolute amplitude of the produced samples, used as the VAD measure.
        let avg: f32 = if output.is_empty() {
            0.0
        } else {
            output.iter().map(|x| x.abs()).sum::<f32>() / output.len() as f32
        };
        let above_threshold = avg >= self.noise_floor;
        let samples_in_frame = output.len();
        if above_threshold {
            // Above threshold - reset hold counter and transmit.
            self.hold_samples = 0;
            self.was_transmitting = true;
            TransmitState::Transmitting
        } else if self.was_transmitting && self.hold_samples < HOLD_SAMPLES_MAX {
            // Below threshold but within the 200ms hold period - keep transmitting.
            self.hold_samples += samples_in_frame;
            TransmitState::Transmitting
        } else if self.was_transmitting {
            // Hold period expired - this frame is the terminator (end_bit = true).
            self.was_transmitting = false;
            self.hold_samples = 0;
            TransmitState::Terminator
        } else {
            // Not transmitting and below threshold - stay silent.
            TransmitState::Silent
        }
    }
}
pub type AudioProcessorSender = Arc<AtomicCell<Option<AudioProcessor>>>;