gui: Add terminator packet and 200ms voice hold for VAD

Implements proper voice activity detection with:
- 200ms hold period after audio drops below threshold to prevent choppy cutoffs
- Terminator packet (end_bit=true) when speech ends to signal stream completion
- TransmitState enum to track transmission state across frames

This ensures other Mumble clients receive proper end-of-speech signaling
for clean audio termination and correct "talking" indicator behavior.

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
2026-01-10 18:52:58 -07:00
parent 2cb03208b4
commit 3ddf892169
5 changed files with 253 additions and 444 deletions
+48 -6
View File
@@ -11,7 +11,20 @@ use crate::imp;
static DF_MODEL: Asset = asset!("/assets/DeepFilterNet3_ll_onnx.tar.gz");
// TODO: make this user configurable.
static DEFAULT_NOISE_FLOOR: f32 = 0.001;
static DEFAULT_NOISE_FLOOR: f32 = 0.0007;
// 200ms hold at 48kHz sample rate
static HOLD_SAMPLES_MAX: usize = 48000 / 5; // 9600 samples = 200ms
/// Indicates the transmission state after processing audio.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum TransmitState {
/// Audio is above threshold, or below but within hold period - transmit normally
Transmitting,
/// Hold period expired - send this frame as terminator (end_bit = true)
Terminator,
/// Silent and not transmitting - don't send anything
Silent,
}
enum DenoisingModelState {
Nothing,
@@ -79,6 +92,10 @@ pub struct AudioProcessor {
spawn: imp::SpawnHandle,
buffer: Vec<f32>,
noise_floor: f32,
/// Whether we were transmitting in the previous frame
was_transmitting: bool,
/// Number of samples we've been below threshold (for hold period)
hold_samples: usize,
}
impl AudioProcessor {
@@ -88,6 +105,8 @@ impl AudioProcessor {
spawn: imp::SpawnHandle::current(),
buffer: Vec::new(),
noise_floor: DEFAULT_NOISE_FLOOR,
was_transmitting: false,
hold_samples: 0,
}
}
@@ -97,12 +116,14 @@ impl AudioProcessor {
spawn: imp::SpawnHandle::current(),
buffer: Vec::new(),
noise_floor: DEFAULT_NOISE_FLOOR,
was_transmitting: false,
hold_samples: 0,
}
}
}
impl AudioProcessor {
pub fn process(&mut self, audio: &[f32], channels: usize, output: &mut Vec<f32>) {
pub fn process(&mut self, audio: &[f32], channels: usize, output: &mut Vec<f32>) -> TransmitState {
let mut include_raw = true;
if self.denoise {
with_denoising_model(&self.spawn, |df| {
@@ -138,15 +159,36 @@ impl AudioProcessor {
output.extend(audio.iter().step_by(channels).copied());
}
// Adds threshoulding to prevent sending audio when things are really quiet.
// Calculate average amplitude for VAD
let avg: f32 = if output.is_empty() {
0.0
} else {
output.iter().map(|x| x.abs()).sum::<f32>() / output.len() as f32
};
if avg < self.noise_floor {
output.clear();
}
let above_threshold = avg >= self.noise_floor;
let samples_in_frame = output.len();
let state = if above_threshold {
// Above threshold - reset hold counter and transmit
self.hold_samples = 0;
self.was_transmitting = true;
TransmitState::Transmitting
} else if self.was_transmitting && self.hold_samples < HOLD_SAMPLES_MAX {
// Below threshold but in hold period - keep transmitting
self.hold_samples += samples_in_frame;
TransmitState::Transmitting
} else if self.was_transmitting {
// Hold period expired - send terminator
self.was_transmitting = false;
self.hold_samples = 0;
TransmitState::Terminator
} else {
// Not transmitting and below threshold - stay silent
TransmitState::Silent
};
state
}
}