# Speaker-count estimation via the eigengap of a cosine-affinity graph Laplacian.
import numpy as np
from sklearn.cluster import SpectralClustering
def estimate_speakers(
    embeddings: np.ndarray, max_k: int = 8
) -> tuple[int, np.ndarray]:
    """Estimate the speaker count and assign a cluster label to each embedding.

    A cosine-similarity affinity graph is built over the rows of
    ``embeddings``; the number of clusters ``k`` is chosen with the eigengap
    heuristic on the symmetric normalized graph Laplacian, and the rows are
    then partitioned with spectral clustering on that same affinity matrix.

    Args:
        embeddings: 2-D array with one embedding per row.
        max_k: Largest cluster count that may be returned. The result is
            additionally capped at ``n_rows - 1``, because ``k`` is read off
            the gap that follows the k-th eigenvalue.

    Returns:
        ``(k, labels)`` where ``labels`` is an integer array holding one
        cluster id per row. Zero rows yield ``(0, empty array)``; a single
        row, ``max_k == 1``, or a dominant first eigengap yield ``k == 1``
        with all-zero labels.

    Raises:
        ValueError: If ``max_k < 1``, ``embeddings`` is not 2-D, or it
            contains NaN/inf values.
    """
    if max_k < 1:
        raise ValueError(f"max_k must be >= 1, got {max_k}")

    emb = np.asarray(embeddings, dtype=float)
    if emb.ndim != 2:
        raise ValueError(
            f"embeddings must be a 2-D array, got {emb.ndim} dimension(s)"
        )
    if not np.isfinite(emb).all():
        raise ValueError("embeddings must not contain NaN or infinite values")

    n = emb.shape[0]
    if n == 0:
        return 0, np.zeros(0, dtype=int)
    if n == 1 or max_k == 1:
        # No eigengap can be measured (or only k == 1 is allowed), so skip
        # the eigendecomposition entirely.
        return 1, np.zeros(n, dtype=int)

    # L2-normalize so the dot product is cosine similarity. Zero-norm rows
    # are left as zero vectors instead of becoming NaN: they end up with
    # similarity 0 to every other row, i.e. as isolated graph nodes that the
    # eigengap may count as a cluster of their own. Callers that can produce
    # such rows may want to drop them beforehand.
    norms = np.linalg.norm(emb, axis=1, keepdims=True)
    x = emb / np.where(norms > 0.0, norms, 1.0)
    sim = x @ x.T

    # CLAMP negatives to 0. Do NOT rescale [-1, 1] to [0, 1]:
    # rescaling maps near-orthogonal (different-speaker) pairs at ~0
    # up to 0.5, inflating cross-speaker edges and washing out the
    # eigengap. Clamping leaves different-speaker pairs near 0.
    affinity = np.clip(sim, 0.0, 1.0)
    np.fill_diagonal(affinity, 1.0)

    # Normalized (symmetric) graph Laplacian: L = I - D^-1/2 A D^-1/2.
    # Every degree is >= 1 thanks to the unit diagonal; the floor is only a
    # defensive guard.
    deg = affinity.sum(axis=1)
    d_inv_sqrt = 1.0 / np.sqrt(np.maximum(deg, 1e-12))
    lap = np.eye(n) - affinity * d_inv_sqrt[:, None] * d_inv_sqrt[None, :]

    # eigvalsh returns the eigenvalues in ascending order already.
    eigvals = np.linalg.eigvalsh(lap)

    # Eigengap heuristic: k is the position of the largest jump between
    # consecutive eigenvalues among the first `upper + 1` of them.
    upper = min(max_k, n - 1)
    gaps = np.diff(eigvals[: upper + 1])
    k = int(np.argmax(gaps)) + 1
    if k < 2:
        return 1, np.zeros(n, dtype=int)

    labels = SpectralClustering(
        n_clusters=k, affinity="precomputed", random_state=0
    ).fit_predict(affinity)
    return k, labels