Snippets

Estimate speaker count with the Laplacian eigengap

Diarization has to estimate how many speakers are in a clip. The eigengap of the graph Laplacian gives you the speaker count k without requiring it up front.

Python

python

import numpy as np
from sklearn.cluster import SpectralClustering


def estimate_speakers(embeddings: np.ndarray, max_k: int = 8):
    """Estimate the speaker count via the Laplacian eigengap and cluster.

    Builds a clamped cosine-similarity affinity matrix, chooses ``k`` at the
    largest gap among the smallest eigenvalues of the normalized graph
    Laplacian, and runs spectral clustering with that ``k``.

    Args:
        embeddings: 2-D array with one embedding per row.
        max_k: Largest speaker count to consider.

    Returns:
        A ``(k, labels)`` tuple where ``labels`` is an integer array holding
        one cluster id per input row. With fewer than two rows there is no
        eigengap to measure, so the result is ``(n_rows, zeros(n_rows))``.
    """
    embeddings = np.asarray(embeddings)
    norms = np.linalg.norm(embeddings, axis=1, keepdims=True)
    n = embeddings.shape[0]

    if n < 2:
        # Zero rows -> zero speakers; one row -> one speaker. Without this
        # guard the gap array below is empty and np.argmax raises.
        return n, np.zeros(n, dtype=int)

    # L2-normalize so the dot product is cosine similarity. The floor keeps
    # an all-zero embedding as a zero row instead of turning it into NaN.
    x = embeddings / np.maximum(norms, 1e-12)
    sim = x @ x.T

    # CLAMP negatives to 0. Do NOT rescale [-1, 1] to [0, 1]:
    # rescaling maps near-orthogonal (different-speaker) pairs at ~0
    # up to 0.5, inflating cross-speaker edges and washing out the
    # eigengap. Clamping leaves different-speaker pairs near 0.
    affinity = np.clip(sim, 0.0, 1.0)
    np.fill_diagonal(affinity, 1.0)

    # Normalized (symmetric) graph Laplacian: L = I - D^-1/2 A D^-1/2.
    deg = affinity.sum(axis=1)
    d_inv_sqrt = 1.0 / np.sqrt(np.maximum(deg, 1e-12))
    lap = np.eye(n) - affinity * d_inv_sqrt[:, None] * d_inv_sqrt[None, :]

    # eigvalsh already returns eigenvalues in ascending order.
    eigvals = np.linalg.eigvalsh(lap)
    upper = min(max_k, n - 1)
    if upper < 1:
        # max_k below 1 leaves no gap to inspect; treat as a single speaker.
        k = 1
    else:
        # The gap after the k-th smallest eigenvalue is gaps[k - 1].
        gaps = np.diff(eigvals[: upper + 1])
        k = int(np.argmax(gaps)) + 1

    if k < 2:
        return 1, np.zeros(n, dtype=int)

    labels = SpectralClustering(
        n_clusters=k, affinity="precomputed", random_state=0
    ).fit_predict(affinity)
    return k, labels

Chunk audio with VAD before you transcribe

Feeding long stretches of silence to a transcriber wastes time and money. Split on speech first using webrtcvad, then send only the chunks that contain voice.

Python

python

import collections
import wave
import webrtcvad


def read_pcm(path):
    """Read a mono, 16-bit WAV file and return ``(pcm_bytes, sample_rate)``.

    Args:
        path: Location of the WAV file, passed to ``wave.open``.

    Returns:
        A tuple of the raw frame bytes for the whole file and the sample
        rate in Hz.

    Raises:
        ValueError: If the file is not mono, not 16-bit, or its sample rate
            is not one of 8000, 16000, 32000 or 48000 Hz.
    """
    with wave.open(path, "rb") as wf:
        # Explicit raises instead of ``assert``: assertions are stripped
        # under ``python -O``, which would let unsupported audio through.
        if wf.getnchannels() != 1:
            raise ValueError("mono only")
        if wf.getsampwidth() != 2:
            raise ValueError("16-bit only")
        rate = wf.getframerate()
        if rate not in (8000, 16000, 32000, 48000):
            raise ValueError(f"unsupported sample rate: {rate}")
        return wf.readframes(wf.getnframes()), rate


def frames(pcm, rate, ms=30):
    """Yield consecutive fixed-duration frames from 16-bit PCM bytes.

    Args:
        pcm: Raw PCM audio as bytes, 2 bytes per sample.
        rate: Sample rate in Hz.
        ms: Frame duration in milliseconds.

    Yields:
        Equal-length byte slices of ``pcm``. A trailing partial frame is
        dropped.
    """
    # Size the frame in whole samples first, then convert to bytes, so a
    # frame boundary never lands in the middle of a 2-byte sample (float
    # truncation at byte level could produce an odd byte count).
    samples_per_frame = int(rate * ms // 1000)
    n = samples_per_frame * 2  # bytes per frame
    for i in range(0, len(pcm) - n + 1, n):
        yield pcm[i : i + n]


def voiced_segments(path, aggressiveness=2, ms=30, pad=300):
    """Yield the voiced stretches of a WAV file as raw PCM byte strings.

    Each frame is classified with ``webrtcvad`` and a sliding window of the
    most recent ``pad // ms`` frames decides when a segment starts and ends:
    a segment opens once more than 90% of the window is speech (the buffered
    frames are included), and closes once more than 90% of the window is
    non-speech (the trailing frames stay in the segment).

    Args:
        path: WAV file to read via ``read_pcm``.
        aggressiveness: Mode passed to ``webrtcvad.Vad``.
        ms: Frame duration in milliseconds.
        pad: Length of the sliding decision window in milliseconds.

    Yields:
        ``bytes`` objects, one per voiced segment, in file order.
    """
    vad = webrtcvad.Vad(aggressiveness)
    pcm, rate = read_pcm(path)
    # Keep at least one frame in the window: with ``pad < ms`` the window
    # would have length 0, the ">" thresholds below could never be exceeded,
    # and no segment would ever be produced.
    num_padding = max(1, int(pad // ms))
    ring = collections.deque(maxlen=num_padding)
    triggered, voiced = False, []

    for frame in frames(pcm, rate, ms):
        speech = vad.is_speech(frame, rate)
        if not triggered:
            # Idle: buffer frames until the window is mostly speech.
            ring.append((frame, speech))
            if sum(s for _, s in ring) > 0.9 * ring.maxlen:
                triggered = True
                # Start the segment with the buffered lead-in frames.
                voiced.extend(f for f, _ in ring)
                ring.clear()
        else:
            # In a segment: keep every frame until the window is mostly
            # non-speech, then emit what was collected.
            voiced.append(frame)
            ring.append((frame, speech))
            if sum(not s for _, s in ring) > 0.9 * ring.maxlen:
                triggered = False
                yield b"".join(voiced)
                voiced = []
                ring.clear()

    # Flush a segment that was still open when the audio ended.
    if voiced:
        yield b"".join(voiced)


# Example usage: report the size of each voiced segment found in "call.wav".
# NOTE: this loop sits at module top level, so it runs as soon as the file
# is executed or imported.
for i, chunk in enumerate(voiced_segments("call.wav")):
    print(f"segment {i}: {len(chunk)} bytes")
    # hand chunk to your transcriber here