Python

Chunk audio with VAD before you transcribe

Feeding long silent audio to a transcriber wastes time and money. Split on speech first using webrtcvad, then send only the chunks that contain voice.

22 Jun 2026

webrtcvad works on 16-bit mono PCM sampled at 8, 16, 32, or 48 kHz, in frames of 10, 20, or 30 ms. The code below walks the stream frame by frame and yields one segment per run of speech.

import collections
import wave
import webrtcvad
 
 
def read_pcm(path):
    """Read a WAV file and return its raw PCM bytes and sample rate.

    Args:
        path: Anything ``wave.open`` accepts for reading.

    Returns:
        A ``(pcm, rate)`` tuple: every audio frame in the file as ``bytes``,
        and the sample rate in Hz.

    Raises:
        ValueError: If the file is not mono, not 16-bit, or its sample rate
            is not one of 8000, 16000, 32000 or 48000 Hz.
    """
    supported_rates = (8000, 16000, 32000, 48000)
    with wave.open(path, "rb") as wf:
        # Explicit raises instead of ``assert``: assertions are removed when
        # Python runs with -O, which would let unsupported audio through.
        if wf.getnchannels() != 1:
            raise ValueError("mono only")
        if wf.getsampwidth() != 2:
            raise ValueError("16-bit only")
        rate = wf.getframerate()
        if rate not in supported_rates:
            raise ValueError(
                f"unsupported sample rate {rate}; expected one of {supported_rates}"
            )
        return wf.readframes(wf.getnframes()), rate
 
 
def frames(pcm, rate, ms=30):
    """Yield consecutive fixed-length frames cut from a PCM byte string.

    Each frame spans ``ms`` milliseconds at ``rate`` Hz, counting two bytes
    per sample. Only whole frames are produced; leftover bytes at the end
    that do not fill a frame are not yielded.
    """
    seconds = ms / 1000.0
    frame_bytes = int(rate * seconds * 2)  # two bytes per sample
    stop = len(pcm) - frame_bytes + 1
    yield from (
        pcm[start : start + frame_bytes]
        for start in range(0, stop, frame_bytes)
    )
 
 
def voiced_segments(path, aggressiveness=2, ms=30, pad=300):
    """Yield one PCM byte string per run of speech found in a WAV file.

    The audio is cut into ``ms``-millisecond frames and each frame is
    classified by webrtcvad. A sliding window of the most recent frames
    drives a two-state machine: a segment starts once more than 90% of the
    window is speech and ends once more than 90% of it is non-speech.

    Args:
        path: WAV file to read; passed to ``read_pcm``.
        aggressiveness: Mode passed to ``webrtcvad.Vad`` (0-3 according to
            the webrtcvad documentation).
        ms: Frame length in milliseconds.
        pad: Length of the sliding window in milliseconds. Values shorter
            than one frame are treated as a one-frame window.

    Yields:
        ``bytes`` holding the concatenated frames of one segment. A segment
        includes the window frames that triggered it and the non-speech
        frames seen before it was closed.
    """
    vad = webrtcvad.Vad(aggressiveness)
    pcm, rate = read_pcm(path)
    # With pad < ms the window would be deque(maxlen=0): always empty, so the
    # trigger threshold could never be met and nothing would ever be yielded.
    # Clamp to a single frame instead.
    num_padding = max(1, pad // ms)
    ring = collections.deque(maxlen=num_padding)
    threshold = 0.9 * num_padding
    triggered, voiced = False, []

    for frame in frames(pcm, rate, ms):
        speech = vad.is_speech(frame, rate)
        ring.append((frame, speech))
        if not triggered:
            # Waiting for speech: start once the window is mostly voiced and
            # keep the buffered frames so the segment's onset is not lost.
            if sum(s for _, s in ring) > threshold:
                triggered = True
                voiced.extend(f for f, _ in ring)
                ring.clear()
        else:
            # Inside a segment: keep every frame, and close the segment once
            # the window is mostly non-speech.
            voiced.append(frame)
            if sum(not s for _, s in ring) > threshold:
                triggered = False
                yield b"".join(voiced)
                voiced = []
                ring.clear()

    # The stream ended while still inside a segment: flush what was collected.
    if voiced:
        yield b"".join(voiced)
 
 
# Example usage: split "call.wav" into voiced segments and report the size of
# each one. Every chunk is a bytes object of raw PCM for a single segment.
for i, chunk in enumerate(voiced_segments("call.wav")):
    print(f"segment {i}: {len(chunk)} bytes")
    # Replace the print above with a call to your transcriber, passing it chunk.

Gotchas

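is_speech is strict about frame size. A 30 ms frame at 16 kHz is exactly 960 bytes (480 samples times 2 bytes); a buffer whose length does not correspond to exactly 10, 20, or 30 ms at the given sample rate raises an error. The padding ring is what stops a single dropped frame from cutting a word in half: it keeps a short trailing window (300 ms by default), and the segment ends only once more than 90% of the frames in that window are non-speech, so a brief pause does not end the segment.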

voice ai local-models