Add voice matching support for Dia (#93)

lucasnewman · Blaizzy · web-flow · commit 77aaefad5359 · 2025-04-26T12:07:29.000+02:00
* Add voice matching support for Dia.

* Fix codec test failures from MLX update.

---------

Co-authored-by: Prince Canuma <prince.gdt@gmail.com>
diff --git a/mlx_audio/codec/tests/test_descript.py b/mlx_audio/codec/tests/test_descript.py
@@ -38,7 +38,7 @@ def test_descript_16khz(self):
         self.assertEqual(latents.shape, (1, 96, 250))
 
         y = model.decode(z).squeeze(-1)
-        self.assertEqual(y.shape, (1, 79_992))
+        self.assertEqual(y.shape, (1, 80_043))
 
     def test_descript_24khz(self):
         audio = mx.zeros((1, 1, 120_000))
@@ -70,7 +70,7 @@ def test_descript_24khz(self):
         self.assertEqual(latents.shape, (1, 256, 375))
 
         y = model.decode(z).squeeze(-1)
-        self.assertEqual(y.shape, (1, 119_992))
+        self.assertEqual(y.shape, (1, 120_043))
 
     def test_descript_44khz(self):
         audio = mx.zeros((1, 1, 220_000))
@@ -102,7 +102,7 @@ def test_descript_44khz(self):
         self.assertEqual(latents.shape, (1, 72, 430))
 
         y = model.decode(z).squeeze(-1)
-        self.assertEqual(y.shape, (1, 220_160))
+        self.assertEqual(y.shape, (1, 220_235))
 
 
 if __name__ == "__main__":
diff --git a/mlx_audio/codec/tests/test_snac.py b/mlx_audio/codec/tests/test_snac.py
@@ -33,7 +33,7 @@ def test_snac(self):
         self.assertEqual(codes[2].shape, (1, 236))
 
         reconstructed = model.decode(codes).squeeze(-1)
-        self.assertEqual(reconstructed.shape, (1, 120_832))
+        self.assertEqual(reconstructed.shape, (1, 120_907))
 
 
 if __name__ == "__main__":
diff --git a/mlx_audio/tts/generate.py b/mlx_audio/tts/generate.py
@@ -74,7 +74,7 @@ def generate_audio(
         if ref_audio:
             if not os.path.exists(ref_audio):
                 raise FileNotFoundError(f"Reference audio file not found: {ref_audio}")
-            ref_audio = load_audio(ref_audio)
+            ref_audio = load_audio(ref_audio, sample_rate=sample_rate)
             if not ref_text:
                 print("Ref_text not found. Transcribing ref_audio...")
                 # mlx_whisper seems takes long time to import. Import only necessary.
diff --git a/mlx_audio/tts/models/dia/dia.py b/mlx_audio/tts/models/dia/dia.py
@@ -4,10 +4,8 @@
 import mlx.core as mx
 import mlx.nn as nn
 import numpy as np
-import soundfile as sf
 from huggingface_hub import hf_hub_download
 from mlx_lm.sample_utils import make_sampler
-from scipy import signal
 from tqdm import trange
 
 from mlx_audio.codec.models import DAC
@@ -18,14 +16,6 @@
 from .layers import DiaModel, KVCache
 
 
-def resample_audio(audio: np.ndarray, orig_sr: int, target_sr: int) -> np.ndarray:
-    gcd = np.gcd(orig_sr, target_sr)
-    up = target_sr // gcd
-    down = orig_sr // gcd
-    resampled = signal.resample_poly(audio, up, down, padtype="edge")
-    return resampled
-
-
 def _sample_next_token(
     logits_BCxV: mx.array,
     temperature: float,
@@ -226,6 +216,8 @@ def generate(
         split_pattern: str = "\n",
         max_tokens: int | None = None,
         verbose: bool = False,
+        ref_audio: Optional[mx.array] = None,
+        ref_text: Optional[str] = None,
         **kwargs,
     ):
         prompt = text.replace("\\n", "\n").replace("\\t", "\t")
@@ -239,6 +231,8 @@ def generate(
             audio = self._generate(
                 prompt,
                 max_tokens=max_tokens,
+                ref_audio=ref_audio,
+                ref_text=ref_text,
             )
             all_audio.append(audio[None, ...])
 
@@ -291,13 +285,14 @@ def generate(
     def _generate(
         self,
         text: str,
-        max_tokens: int | None = None,
+        max_tokens: Optional[int] = None,
         cfg_scale: float = 3.0,
         temperature: float = 1.3,
         top_p: float = 0.95,
         use_cfg_filter: bool = True,
         cfg_filter_top_k: int = 35,
-        audio_prompt_path: str | None = None,
+        ref_audio: Optional[mx.array] = None,
+        ref_text: Optional[str] = None,
     ) -> np.ndarray:
         """
         Generates audio from a text prompt (and optional audio prompt) using the Dia model.
@@ -314,6 +309,9 @@ def _generate(
         delay_tensor = mx.array(delay_pattern, dtype=mx.int32)
         max_delay_pattern = max(delay_pattern)
 
+        if ref_text is not None:
+            text = ref_text.strip() + " " + text
+
         (
             cond_src_BxS,
             cond_src_positions_BxS,
@@ -370,19 +368,18 @@ def _generate(
         prompt_len_inc_bos = 1  # Start with BOS length
 
         # 3-3. Load Audio Prompt (if provided)
-        if audio_prompt_path is not None:
-            audio_prompt, sr = sf.read(audio_prompt_path)  # C, T
-            if sr != 44100:  # Resample to 44.1kHz
-                audio_prompt = resample_audio(audio_prompt, sr, 44100)
-            audio_prompt = audio_prompt.unsqueeze(0)  # 1, C, T
+        if ref_audio is not None:
+            audio_prompt = mx.array(ref_audio)[None, None, ...]  # 1, C, T
 
             audio_prompt_codebook = audio_to_codebook(
                 self.dac_model, audio_prompt, data_config=self.config.data
             )
-            audio_prompt_mx = mx.array(audio_prompt_codebook.numpy())
-
-            audio_prompt_mx = mx.concatenate([audio_prompt_mx, audio_prompt_mx], axis=0)
-            generated_BxTxC = mx.concatenate([generated_BxTxC, audio_prompt_mx], axis=1)
+            audio_prompt_codebook = mx.concatenate(
+                [audio_prompt_codebook, audio_prompt_codebook], axis=0
+            )
+            generated_BxTxC = mx.concatenate(
+                [generated_BxTxC, audio_prompt_codebook], axis=1
+            )
 
             prefill_len = generated_BxTxC.shape[1]
             prompt_len_inc_bos = prefill_len
@@ -499,7 +496,7 @@ def _generate(
             )
 
             generation_step_index = step - current_step
-            if audio_prompt_path is None:
+            if ref_audio is None:
                 pred_C = mx.where(
                     generation_step_index >= delay_tensor,
                     pred_C,