PaddlePaddle
diff --git a/‎dataset/voxceleb/voxceleb1.py
Lines changed: 36 additions & 9 deletions b/‎dataset/voxceleb/voxceleb1.py
Lines changed: 36 additions & 9 deletions
diff --git a/‎dataset/voxceleb/voxceleb2.py
Lines changed: 163 additions & 0 deletions b/‎dataset/voxceleb/voxceleb2.py
Lines changed: 163 additions & 0 deletions
diff --git a/‎examples/voxceleb/README.md
Lines changed: 48 additions & 0 deletions b/‎examples/voxceleb/README.md
Lines changed: 48 additions & 0 deletions
diff --git a/‎examples/voxceleb/sv0/conf/ecapa_tdnn.yaml
Lines changed: 52 additions & 0 deletions b/‎examples/voxceleb/sv0/conf/ecapa_tdnn.yaml
Lines changed: 52 additions & 0 deletions
@@ -59,12 +59,17 @@
 TEST_LIST = {"vox1_test_wav.zip": "185fdc63c3c739954633d50379a3d102"}
 TEST_TARGET_DATA = "vox1_test_wav.zip vox1_test_wav.zip 185fdc63c3c739954633d50379a3d102"
 
-# kaldi trial
-# this trial file is organized by kaldi according the official file,
-# which is a little different with the official trial veri_test2.txt
-KALDI_BASE_URL = "http://www.openslr.org/resources/49/"
-TRIAL_LIST = {"voxceleb1_test_v2.txt": "29fc7cc1c5d59f0816dc15d6e8be60f7"}
-TRIAL_TARGET_DATA = "voxceleb1_test_v2.txt voxceleb1_test_v2.txt 29fc7cc1c5d59f0816dc15d6e8be60f7"
+# VoxCeleb1 verification trial lists.
+# TRIAL_LIST maps each trial file name (served under TRIAL_BASE_URL) to the
+# md5 checksum that is handed to the downloader together with that file.
+
+TRIAL_BASE_URL = "https://www.robots.ox.ac.uk/~vgg/data/voxceleb/meta/"
+TRIAL_LIST = {
+    # NOTE(review): the veri_test.txt md5 below is identical to the one that was
+    # listed for the Kaldi voxceleb1_test_v2.txt file -- confirm it is correct.
+    "veri_test.txt": "29fc7cc1c5d59f0816dc15d6e8be60f7",             # voxceleb1
+    "veri_test2.txt": "b73110731c9223c1461fe49cb48dddfc",            # voxceleb1(cleaned)
+    "list_test_hard.txt": "21c341b6b2168eea2634df0fb4b8fff1",        # voxceleb1-H
+    "list_test_hard2.txt": "857790e09d579a68eb2e339a090343c8",       # voxceleb1-H(cleaned)
+    "list_test_all.txt": "b9ecf7aa49d4b656aa927a8092844e4a",         # voxceleb1-E
+    "list_test_all2.txt": "a53e059deb562ffcfc092bf5d90d9f3a"         # voxceleb1-E(cleaned)
+    }
 
 parser = argparse.ArgumentParser(description=__doc__)
 parser.add_argument(
@@ -82,7 +87,7 @@
 
 
 def create_manifest(data_dir, manifest_path_prefix):
-    print("Creating manifest %s ..." % manifest_path_prefix)
+    print(f"Creating manifest {manifest_path_prefix} from {data_dir}")
     json_lines = []
     data_path = os.path.join(data_dir, "wav", "**", "*.wav")
     total_sec = 0.0
@@ -114,6 +119,9 @@ def create_manifest(data_dir, manifest_path_prefix):
     # voxceleb1 is given explicit in the path
     data_dir_name = Path(data_dir).name
     manifest_path_prefix = manifest_path_prefix + "." + data_dir_name
+    if not os.path.exists(os.path.dirname(manifest_path_prefix)):
+        os.makedirs(os.path.dirname(manifest_path_prefix))
+
     with codecs.open(manifest_path_prefix, 'w', encoding='utf-8') as f:
         for line in json_lines:
             f.write(line + "\n")
@@ -133,11 +141,13 @@ def create_manifest(data_dir, manifest_path_prefix):
 def prepare_dataset(base_url, data_list, target_dir, manifest_path,
                     target_data):
     if not os.path.exists(target_dir):
-        os.mkdir(target_dir)
+        os.makedirs(target_dir)
 
     # wav directory already exists, it need do nothing
+    # we will download the voxceleb1 data to ${target_dir}/vox1/dev/ or ${target_dir}/vox1/test directory 
     if not os.path.exists(os.path.join(target_dir, "wav")):
         # download all dataset part
+        print("start to download the vox1 dev zip package")
         for zip_part in data_list.keys():
             download_url = " --no-check-certificate " + base_url + "/" + zip_part
             download(
@@ -166,25 +176,42 @@ def prepare_dataset(base_url, data_list, target_dir, manifest_path,
     # create the manifest file
     create_manifest(data_dir=target_dir, manifest_path_prefix=manifest_path)
 
+def prepare_trial(base_url, data_list, target_dir):
+    """Download every trial file that is not yet present in target_dir.
+
+    Args:
+        base_url (str): URL prefix the trial files are served from; a
+            trailing "/" is tolerated.
+        data_list (dict): maps a trial file name to its md5 checksum.
+        target_dir (str): directory receiving the trial files. An empty
+            string (e.g. the dirname of a manifest prefix without a
+            directory part) is treated as the current directory.
+    """
+    # os.makedirs("") raises FileNotFoundError, so fall back to "."
+    target_dir = target_dir or os.curdir
+    os.makedirs(target_dir, exist_ok=True)
 
+    for trial, md5sum in data_list.items():
+        target_trial = os.path.join(target_dir, trial)
+        if os.path.exists(target_trial):
+            # this trial file was fetched by an earlier run
+            continue
+        # drop the trailing "/" of base_url so the URL is joined with one "/"
+        download_url = (" --no-check-certificate " + base_url.rstrip("/") +
+                        "/" + trial)
+        download(url=download_url, md5sum=md5sum, target_dir=target_dir)
+
+
 def main():
+    """Prepare the VoxCeleb1 dev and test data, then fetch the trial lists.
+
+    Reads the module-level ``args`` (target_dir, manifest_prefix) and may
+    rewrite ``args.target_dir`` with the user's home directory expanded.
+    """
+    # only a leading "~" needs expanding; other paths are used as given
     if args.target_dir.startswith('~'):
         args.target_dir = os.path.expanduser(args.target_dir)
-
+    
+    # prepare the vox1 dev data
     prepare_dataset(
         base_url=BASE_URL,
         data_list=DEV_LIST,
         target_dir=os.path.join(args.target_dir, "dev"),
         manifest_path=args.manifest_prefix,
         target_data=DEV_TARGET_DATA)
 
+    # prepare the vox1 test data
     prepare_dataset(
         base_url=BASE_URL,
         data_list=TEST_LIST,
         target_dir=os.path.join(args.target_dir, "test"),
         manifest_path=args.manifest_prefix,
         target_data=TEST_TARGET_DATA)
 
+    # prepare the vox1 trial
+    # the trial files are stored in the directory part of the manifest prefix
+    # NOTE(review): os.path.dirname() returns "" when manifest_prefix has no
+    # directory part -- confirm prepare_trial copes with an empty target_dir
+    prepare_trial(
+        base_url=TRIAL_BASE_URL,
+        data_list=TRIAL_LIST,
+        target_dir=os.path.dirname(args.manifest_prefix)
+    )
+
     print("Manifest prepare done!")
 
 
 
@@ -0,0 +1,163 @@
+# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Prepare VoxCeleb2 dataset
+
+Download and unpack the voxceleb2 data files.
+Voxceleb2 data is stored as the m4a format, 
+so we need convert the m4a to wav with the convert.sh scripts
+"""
+import argparse
+import codecs
+import glob
+import json
+import os
+import subprocess
+from pathlib import Path
+
+import soundfile
+
+from utils.utility import check_md5sum
+from utils.utility import download
+from utils.utility import unzip
+
+# by default all the data is downloaded below the current working directory
+DATA_HOME = os.path.expanduser('.')
+
+# The wget option is embedded in front of the URL.
+# NOTE(review): presumably utils.utility.download passes this string to the
+# download command as-is -- confirm.
+BASE_URL = "--no-check-certificate https://www.robots.ox.ac.uk/~vgg/data/voxceleb/data/"
+
+# dev data
+# BASE_URL already ends with "/", so the archive name is appended without one
+DEV_DATA_URL = BASE_URL + 'vox2_aac.zip'
+DEV_MD5SUM = "bbc063c46078a602ca71605645c2a402"
+
+
+# test data
+TEST_DATA_URL = BASE_URL + 'vox2_test_aac.zip'
+TEST_MD5SUM = "0d2b3ea430a821c33263b5ea37ede312"
+
+# command line interface; the module docstring doubles as the description
+parser = argparse.ArgumentParser(description=__doc__)
+parser.add_argument(
+    "--target_dir",
+    default=DATA_HOME + "/voxceleb2/",
+    type=str,
+    help="Directory to save the voxceleb2 dataset. (default: %(default)s)")
+parser.add_argument(
+    "--manifest_prefix",
+    default="manifest",
+    type=str,
+    help="Filepath prefix for output manifests. (default: %(default)s)")
+parser.add_argument("--download", 
+                    default=False, 
+                    action="store_true", 
+                    help="Download the voxceleb2 dataset. (default: %(default)s)")
+parser.add_argument("--generate", 
+                    default=False, 
+                    action="store_true", 
+                    help="Generate the manifest files. (default: %(default)s)")
+
+# parsed at import time; main() reads this module-level namespace
+args = parser.parse_args()
+
+
+def create_manifest(data_dir, manifest_path_prefix):
+    """Write a json-lines manifest and a summary file for the wavs in data_dir.
+
+    Every ``*.wav`` found recursively below ``data_dir`` becomes one json line
+    in ``<manifest_path_prefix>.<name of data_dir>``. Summary statistics are
+    written to ``voxceleb2.<name of data_dir>.meta`` in the same directory.
+
+    Args:
+        data_dir (str): directory that is searched recursively for wav files.
+        manifest_path_prefix (str): manifest path without the data set suffix;
+            it may be a bare file name without a directory part.
+    """
+    print("Creating manifest %s ..." % manifest_path_prefix)
+    json_lines = []
+    data_path = os.path.join(data_dir, "**", "*.wav")
+    total_sec = 0.0
+    total_text = 0.0
+    total_num = 0
+    speakers = set()
+    for audio_path in glob.glob(data_path, recursive=True):
+        # the last three path components form the utterance id and the
+        # third-from-last one is taken as the speaker id
+        path_parts = audio_path.split("/")
+        audio_id = "-".join(path_parts[-3:])
+        utt2spk = path_parts[-3]
+        duration = soundfile.info(audio_path).duration
+        text = ""
+        json_lines.append(
+            json.dumps(
+                {
+                    "utt": audio_id,
+                    "utt2spk": str(utt2spk),
+                    "feat": audio_path,
+                    "feat_shape": (duration, ),
+                    "text": text  # compatible with asr data format
+                },
+                ensure_ascii=False))
+
+        total_sec += duration
+        total_text += len(text)
+        total_num += 1
+        speakers.add(utt2spk)
+
+    # the manifest suffix is the last path component of data_dir
+    data_dir_name = Path(data_dir).name
+    manifest_path_prefix = manifest_path_prefix + "." + data_dir_name
+
+    # a bare prefix such as the default "manifest" has an empty dirname and
+    # os.makedirs("") would raise, so only create a real directory
+    manifest_dir = os.path.dirname(manifest_path_prefix)
+    if manifest_dir:
+        os.makedirs(manifest_dir, exist_ok=True)
+    with codecs.open(manifest_path_prefix, 'w', encoding='utf-8') as f:
+        for line in json_lines:
+            f.write(line + "\n")
+
+    # report zero rates instead of dividing by zero when no wav was found
+    text_per_sec = total_text / total_sec if total_sec else 0.0
+    sec_per_utt = total_sec / total_num if total_num else 0.0
+
+    meta_path = os.path.join(manifest_dir, "voxceleb2." +
+                             data_dir_name) + ".meta"
+    with codecs.open(meta_path, 'w', encoding='utf-8') as f:
+        print(f"{total_num} utts", file=f)
+        print(f"{len(speakers)} speakers", file=f)
+        print(f"{total_sec / (60 * 60)} h", file=f)
+        print(f"{total_text} text", file=f)
+        print(f"{text_per_sec} text/sec", file=f)
+        print(f"{sec_per_utt} sec/utt", file=f)
+
+
+def download_dataset(url, md5sum, target_dir, dataset):
+    """Download one VoxCeleb2 archive and unpack it unless already unpacked.
+
+    Args:
+        url (str): download URL of the archive.
+        md5sum (str): expected md5 checksum of the archive.
+        target_dir (str): directory the archive is saved to and unpacked in.
+        dataset (str): "dev" or "test"; ``<target_dir>/<dataset>`` is the
+            directory whose existence marks the data set as unpacked.
+    """
+    os.makedirs(target_dir, exist_ok=True)
+
+    dataset_dir = os.path.join(target_dir, dataset)
+    print("target dir {}".format(dataset_dir))
+    if os.path.exists(dataset_dir):
+        # the unpacked directory already exists, nothing to do
+        return
+
+    filepath = download(url, md5sum, target_dir)
+    # According to the original notes, the dev archive unpacks into its own
+    # dev directory while the test archive unpacks into aac, so the test
+    # archive is unpacked inside ${target_dir}/test.
+    # NOTE(review): the archive layouts are not visible here -- confirm.
+    if dataset == "test":
+        unzip(filepath, dataset_dir)
+    else:
+        unzip(filepath, target_dir)
+
+
+def main():
+    """Download VoxCeleb2 and/or generate its manifest, as selected by args."""
+    # expanduser() returns paths that do not start with "~" unchanged
+    args.target_dir = os.path.expanduser(args.target_dir)
+
+    print(f"download: {args.download}")
+    if args.download:
+        # (data set name, archive url, archive md5), dev first and then test
+        archives = (
+            ("dev", DEV_DATA_URL, DEV_MD5SUM),
+            ("test", TEST_DATA_URL, TEST_MD5SUM),
+        )
+        for subset, archive_url, archive_md5 in archives:
+            download_dataset(
+                url=archive_url,
+                md5sum=archive_md5,
+                target_dir=args.target_dir,
+                dataset=subset)
+
+        print("VoxCeleb2 download is done!")
+
+    if args.generate:
+        create_manifest(
+            data_dir=args.target_dir,
+            manifest_path_prefix=args.manifest_prefix)
+
+
+if __name__ == '__main__':
+    main()
@@ -6,3 +6,51 @@ sv0 - speaker verfication with softmax backend etc, all python code
 
 sv1 - dependence on kaldi, speaker verfication with plda/sc backend, 
       more info refer to the sv1/readme.txt
+
+
+## VoxCeleb2 preparation
+
+VoxCeleb2 audio files are released in m4a format. All the VoxCeleb2 m4a audio files must be converted to wav files before feeding them to PaddleSpeech.
+Please follow these steps to prepare the dataset correctly:
+
+1. Download Voxceleb2.
+You can find download instructions here: http://www.robots.ox.ac.uk/~vgg/data/voxceleb/
+
+2. Convert .m4a to wav
+VoxCeleb2 stores files in the m4a audio format. To use them in PaddleSpeech, you have to convert all the m4a audio files into wav files.
+
+``` shell
+ffmpeg -y -i %s -ac 1 -vn -acodec pcm_s16le -ar 16000 %s
+```
+
+You can do the conversion using ffmpeg (see https://gist.github.com/seungwonpark/4f273739beef2691cd53b5c39629d830). This operation might take several hours and only needs to be done once.
+
+3. Put all the wav files in a folder called `wav`. You should have something like `voxceleb2/wav/id*/*.wav` (e.g., `voxceleb2/wav/id00012/21Uxsk56VDQ/00001.wav`)
+
+
+## voxceleb dataset summary
+
+
+|dataset | vox1 - dev | vox1 - test |vox2 - dev| vox2 - test|
+|---------|-----------|------------|-----------|----------|
+|spks    |  1211       |40     |      5994        | 118|
+|utts     | 148642    | 4874   | 1092009     |36273|
+| time(h) | 340.4 | 11.2  | 2360.2  |79.9 |
+
+
+## trial summary
+
+| trial     | filename |  nums | positive | negative |
+|--------|-----------|--------|-------|------|
+| VoxCeleb1 | veri_test.txt | 37720 | 18860 | 18860 | 
+| VoxCeleb1(cleaned) | veri_test2.txt | 37611 | 18802 | 18809 |
+| VoxCeleb1-H | list_test_hard.txt | 552536 | 276270 | 276266 |
+|VoxCeleb1-H(cleaned) |list_test_hard2.txt | 550894 | 275488 | 275406 |
+|VoxCeleb1-E | list_test_all.txt | 581480 | 290743 | 290737 | 
+|VoxCeleb1-E(cleaned) | list_test_all2.txt |579818 |289921 |289897 |
+
+
+
+
+
+
@@ -0,0 +1,52 @@
+###########################################
+#                Data                     #
+###########################################
+# the path of the vox2 wav data converted from m4a must be set explicitly
+vox2_base_path:
+augment: True
+batch_size: 16
+num_workers: 2
+num_speakers: 7205 # 1211 vox1, 5994 vox2, 7205 vox1+2, test speakers: 40
+shuffle: True
+random_chunk: True
+
+###########################################################
+#                FEATURE EXTRACTION SETTING               #
+###########################################################
+# currently, we only support fbank
+sr: 16000           # sample rate
+n_mels: 80
+window_size: 400     # 25ms, sample rate 16000, 25 * 16000 / 1000 = 400
+hop_size: 160        # 10ms, sample rate 16000, 10 * 16000 / 1000 = 160
+
+###########################################################
+#                       MODEL SETTING                     #
+###########################################################
+# currently, we only support ecapa-tdnn in the ecapa_tdnn.yaml
+# to use another model, please choose another configuration yaml file
+model:
+  input_size: 80
+  # smaller alternative: channels: [512, 512, 512, 512, 1536]
+  channels: [1024, 1024, 1024, 1024, 3072]
+  kernel_sizes: [5, 3, 3, 3, 1]
+  dilations: [1, 2, 3, 4, 1]
+  attention_channels: 128
+  lin_neurons: 192
+
+###########################################
+#                Training                 #
+###########################################
+seed: 1986 # taken from the speechbrain configuration
+epochs: 10
+save_interval: 1
+log_interval: 1
+# written with a decimal point so that YAML 1.1 loaders read a float;
+# a bare "1e-8" is resolved as a string by such loaders
+learning_rate: 1.0e-8
+
+
+###########################################
+#                Testing                  #
+###########################################
+global_embedding_norm: True
+embedding_mean_norm: True
+embedding_std_norm: False
+