gui: Add terminator packet and 200ms voice hold for VAD
Implements proper voice activity detection with:

- A 200ms hold period after audio drops below the threshold, to prevent choppy cutoffs.
- A terminator packet (end_bit=true) sent when speech ends, to signal stream completion.
- A TransmitState enum to track transmission state across frames.

This ensures other Mumble clients receive proper end-of-speech signaling for clean audio termination and correct "talking" indicator behavior.

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
+48
-6
@@ -11,7 +11,20 @@ use crate::imp;
|
||||
|
||||
static DF_MODEL: Asset = asset!("/assets/DeepFilterNet3_ll_onnx.tar.gz");
|
||||
// TODO: make this user configurable.
|
||||
static DEFAULT_NOISE_FLOOR: f32 = 0.001;
|
||||
static DEFAULT_NOISE_FLOOR: f32 = 0.0007;
|
||||
// 200ms hold at 48kHz sample rate
|
||||
static HOLD_SAMPLES_MAX: usize = 48000 / 5; // 9600 samples = 200ms
|
||||
|
||||
/// Outcome of processing one audio frame, telling the caller what (if
/// anything) should be sent for that frame.
///
/// The value is a plain, copyable tag with no payload.
#[derive(Clone, Copy, Debug, Eq, PartialEq)]
pub enum TransmitState {
    /// Send the frame as ordinary audio: the level is at or above the
    /// threshold, or it dropped below but the hold period has not yet run out.
    Transmitting,
    /// The hold period has run out: send this frame as the terminator
    /// (`end_bit = true`).
    Terminator,
    /// Nothing is being transmitted and the input is quiet: send nothing.
    Silent,
}
|
||||
|
||||
enum DenoisingModelState {
|
||||
Nothing,
|
||||
@@ -79,6 +92,10 @@ pub struct AudioProcessor {
|
||||
spawn: imp::SpawnHandle,
|
||||
buffer: Vec<f32>,
|
||||
noise_floor: f32,
|
||||
/// Whether we were transmitting in the previous frame
|
||||
was_transmitting: bool,
|
||||
/// Number of samples we've been below threshold (for hold period)
|
||||
hold_samples: usize,
|
||||
}
|
||||
|
||||
impl AudioProcessor {
|
||||
@@ -88,6 +105,8 @@ impl AudioProcessor {
|
||||
spawn: imp::SpawnHandle::current(),
|
||||
buffer: Vec::new(),
|
||||
noise_floor: DEFAULT_NOISE_FLOOR,
|
||||
was_transmitting: false,
|
||||
hold_samples: 0,
|
||||
}
|
||||
}
|
||||
|
||||
@@ -97,12 +116,14 @@ impl AudioProcessor {
|
||||
spawn: imp::SpawnHandle::current(),
|
||||
buffer: Vec::new(),
|
||||
noise_floor: DEFAULT_NOISE_FLOOR,
|
||||
was_transmitting: false,
|
||||
hold_samples: 0,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl AudioProcessor {
|
||||
pub fn process(&mut self, audio: &[f32], channels: usize, output: &mut Vec<f32>) {
|
||||
pub fn process(&mut self, audio: &[f32], channels: usize, output: &mut Vec<f32>) -> TransmitState {
|
||||
let mut include_raw = true;
|
||||
if self.denoise {
|
||||
with_denoising_model(&self.spawn, |df| {
|
||||
@@ -138,15 +159,36 @@ impl AudioProcessor {
|
||||
output.extend(audio.iter().step_by(channels).copied());
|
||||
}
|
||||
|
||||
// Adds threshoulding to prevent sending audio when things are really quiet.
|
||||
// Calculate average amplitude for VAD
|
||||
let avg: f32 = if output.is_empty() {
|
||||
0.0
|
||||
} else {
|
||||
output.iter().map(|x| x.abs()).sum::<f32>() / output.len() as f32
|
||||
};
|
||||
if avg < self.noise_floor {
|
||||
output.clear();
|
||||
}
|
||||
|
||||
let above_threshold = avg >= self.noise_floor;
|
||||
let samples_in_frame = output.len();
|
||||
|
||||
let state = if above_threshold {
|
||||
// Above threshold - reset hold counter and transmit
|
||||
self.hold_samples = 0;
|
||||
self.was_transmitting = true;
|
||||
TransmitState::Transmitting
|
||||
} else if self.was_transmitting && self.hold_samples < HOLD_SAMPLES_MAX {
|
||||
// Below threshold but in hold period - keep transmitting
|
||||
self.hold_samples += samples_in_frame;
|
||||
TransmitState::Transmitting
|
||||
} else if self.was_transmitting {
|
||||
// Hold period expired - send terminator
|
||||
self.was_transmitting = false;
|
||||
self.hold_samples = 0;
|
||||
TransmitState::Terminator
|
||||
} else {
|
||||
// Not transmitting and below threshold - stay silent
|
||||
TransmitState::Silent
|
||||
};
|
||||
|
||||
state
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
Reference in New Issue
Block a user