PaddlePaddle · LeoMax-Xiong · Mar 24, 2022 · Feb 24, 2022 · Feb 25, 2022 · Feb 25, 2022
diff --git a/examples/voxceleb/README.md b/examples/voxceleb/README.md
@@ -23,39 +23,6 @@ VoxCeleb2 stores files with the m4a audio format. To use them in PaddleSpeech,
 ffmpeg -y -i %s -ac 1 -vn -acodec pcm_s16le -ar 16000 %s
 ```
 
-``` shell
-# copy this to root directory of data and 
-# chmod a+x convert.sh
-# ./convert.sh
-# https://unix.stackexchange.com/questions/103920/parallelize-a-bash-for-loop
-
-open_sem(){
-    mkfifo pipe-$$
-    exec 3<>pipe-$$
-    rm pipe-$$
-    local i=$1
-    for((;i>0;i--)); do
-        printf %s 000 >&3
-    done
-}
-run_with_lock(){
-    local x
-    read -u 3 -n 3 x && ((0==x)) || exit $x
-    (
-     ( "$@"; )
-    printf '%.3d' $? >&3
-    )&
-}
-
-N=32 # number of vCPU
-open_sem $N
-for f in $(find . -name "*.m4a"); do
-    run_with_lock ffmpeg -loglevel panic -i "$f" -ar 16000 "${f%.*}.wav"
-done
-```
-
 You can do the conversion using ffmpeg  https://gist.github.com/seungwonpark/4f273739beef2691cd53b5c39629d830). This operation might take several hours and should be only once.
 
 3. Put all the wav files in a folder called `wav`. You should have something like `voxceleb2/wav/id*/*.wav` (e.g, `voxceleb2/wav/id00012/21Uxsk56VDQ/00001.wav`)
-
-4. 
diff --git a/examples/voxceleb/sv0/local/data_prepare.py b/examples/voxceleb/sv0/local/data_prepare.py
@@ -1,32 +1,51 @@
+# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
 import argparse
 import os
 
 import numpy as np
 import paddle
 
-from paddleaudio.paddleaudio.datasets.voxceleb import VoxCeleb1
+from paddleaudio.paddleaudio.datasets.voxceleb import VoxCeleb
 from paddlespeech.s2t.utils.log import Log
 from paddlespeech.vector.io.augment import build_augment_pipeline
 from paddlespeech.vector.training.seeding import seed_everything
 
 logger = Log(__name__).getlog()
 
+
 def main(args):
+
     # stage0: set the cpu device, all data prepare process will be done in cpu mode
     paddle.set_device("cpu")
     # set the random seed, it is a must for multiprocess training
     seed_everything(args.seed)
 
     # stage 1: generate the voxceleb csv file
     # Note: this may occurs c++ execption, but the program will execute fine
-    # so we can ignore the execption 
-    train_dataset = VoxCeleb1('train', target_dir=args.data_dir)
-    dev_dataset = VoxCeleb1('dev', target_dir=args.data_dir)
+    # so we ignore the exception
+    # we explicitly pass the vox2 base path to data prepare and generate the audio info
+    train_dataset = VoxCeleb(
+        'train', target_dir=args.data_dir, vox2_base_path=args.vox2_base_path)
+    dev_dataset = VoxCeleb(
+        'dev', target_dir=args.data_dir, vox2_base_path=args.vox2_base_path)
 
     # stage 2: generate the augment noise csv file
     if args.augment:
         augment_pipeline = build_augment_pipeline(target_dir=args.data_dir)
 
+
 if __name__ == "__main__":
     # yapf: disable
     parser = argparse.ArgumentParser(__doc__)
@@ -38,10 +57,14 @@ def main(args):
                         default="./data/",
                         type=str,
                         help="data directory")
+    parser.add_argument("--vox2-base-path",
+                        default=None,
+                        type=str,
+                        help="vox2 base path, where is store the wav audio")
     parser.add_argument("--augment",
                         action="store_true",
                         default=False,
                         help="Apply audio augments.")
     args = parser.parse_args()
     # yapf: enable
-    main(args)                    
+    main(args)
diff --git a/examples/voxceleb/sv0/path.sh b/examples/voxceleb/sv0/path.sh
@@ -1,3 +1,17 @@
+#!/bin/bash
+# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
 export MAIN_ROOT=`realpath ${PWD}/../../../`
 
 export PATH=${MAIN_ROOT}:${MAIN_ROOT}/utils:${PATH}
@@ -10,5 +24,5 @@ export PYTHONPATH=${MAIN_ROOT}:${PYTHONPATH}
 
 export LD_LIBRARY_PATH=${LD_LIBRARY_PATH}:/usr/local/lib/
 
-MODEL=ecapa-tdnn
+MODEL=ecapa_tdnn
 export BIN_DIR=${MAIN_ROOT}/paddlespeech/vector/exps/${MODEL}
diff --git a/examples/voxceleb/sv0/run.sh b/examples/voxceleb/sv0/run.sh
@@ -1,4 +1,17 @@
 #!/bin/bash
+# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
 
 . ./path.sh
 set -e
@@ -11,19 +24,30 @@ set -e
 # stage 3: extract the training embeding to train the LDA and PLDA
 ######################################################################
 
-# you can set the variable PPAUDIO_HOME to specifiy the downloaded the vox1 and vox2 dataset
-# default the dataset is the ~/.paddleaudio/
+# we can set the variable PPAUDIO_HOME to specify the root directory of the downloaded vox1 and vox2 datasets
+# by default the datasets will be stored in ~/.paddleaudio/
+# the vox2 dataset is stored in m4a format, so we need to convert the audio from m4a to wav ourselves
+# and put all of the converted files under ${PPAUDIO_HOME}/datasets/vox2
+# we will look for the wav files in ${PPAUDIO_HOME}/datasets/vox1/wav and ${PPAUDIO_HOME}/datasets/vox2/wav
 # export PPAUDIO_HOME=
 
 stage=0
-dir=data.bak/                     # data directory
-exp_dir=exp/ecapa-tdnn/           # experiment directory
+# data directory
+# if we set the variable ${dir}, we will store the wav info to this directory
+# otherwise, we will store the wav info to vox1 and vox2 directory respectively
+dir=data/                          
+exp_dir=exp/ecapa-tdnn/            # experiment directory
+
+# vox2 wav path, we must convert the m4a format to wav format 
+# and store them in the ${PPAUDIO_HOME}/datasets/vox2/wav/ directory
+vox2_base_path=${PPAUDIO_HOME}/datasets/vox2/wav/
 mkdir -p ${dir}
 mkdir -p ${exp_dir}
 
 if [ $stage -le 0 ]; then 
      # stage 0: data prepare for vox1 and vox2, vox2 must be converted from m4a to wav
-     python3 local/data_prepare.py --data-dir ${dir} --augment
+     python3 local/data_prepare.py \
+     --data-dir ${dir} --augment --vox2-base-path ${vox2_base_path}
 fi 
 
 if [ $stage -le 1 ]; then

diff --git a/paddleaudio/paddleaudio/datasets/__init__.py b/paddleaudio/paddleaudio/datasets/__init__.py
@@ -15,5 +15,5 @@
 from .gtzan import GTZAN
 from .tess import TESS
 from .urban_sound import UrbanSound8K
-from .voxceleb import VoxCeleb1
+from .voxceleb import VoxCeleb
 from .rirs_noises import OpenRIRNoise
diff --git a/paddleaudio/paddleaudio/datasets/voxceleb.py b/paddleaudio/paddleaudio/datasets/voxceleb.py
@@ -25,21 +25,21 @@
 from pathos.multiprocessing import Pool
 from tqdm import tqdm
 
-from .dataset import feat_funcs
 from ..backends import load as load_audio
 from ..utils import DATA_HOME
 from ..utils import decompress
+from .dataset import feat_funcs
 from paddlespeech.s2t.utils.log import Log
 from paddlespeech.vector.utils.download import download_and_decompress
 from utils.utility import download
 from utils.utility import unpack
 
 logger = Log(__name__).getlog()
 
-__all__ = ['VoxCeleb1']
+__all__ = ['VoxCeleb']
 
 
-class VoxCeleb1(Dataset):
+class VoxCeleb(Dataset):
     source_url = 'https://thor.robots.ox.ac.uk/~vgg/data/voxceleb/vox1a/'
     archieves_audio_dev = [
         {
@@ -94,8 +94,18 @@ def __init__(
             split_ratio: float=0.9,  # train split ratio
             seed: int=0,
             target_dir: str=None,
+            vox2_base_path=None,
             **kwargs):
-
+        """VoxCeleb data prepare and get the specific dataset audio info
+
+        Args:
+            subset (str, optional): dataset name, such as train, dev, enroll or test. Defaults to 'train'.
+            feat_type (str, optional): feat type, such as raw, melspectrogram(fbank) or mfcc. Defaults to 'raw'.
+            random_chunk (bool, optional): randomly select a chunk from the audio. Defaults to True.
+            chunk_duration (float, optional): chunk duration if random_chunk flag is set. Defaults to 3.0.
+            target_dir (str, optional): data dir, audio info will be stored in this directory. Defaults to None.
+            vox2_base_path (str, optional): vox2 directory. vox2 data must be converted from m4a to wav. Defaults to None.
+        """
         assert subset in self.subsets, \
             'Dataset subset must be one in {}, but got {}'.format(self.subsets, subset)
 
@@ -106,19 +116,20 @@ def __init__(
         self.random_chunk = random_chunk
         self.chunk_duration = chunk_duration
         self.split_ratio = split_ratio
-        self.target_dir = target_dir if target_dir else VoxCeleb1.base_path
+        self.target_dir = target_dir if target_dir else VoxCeleb.base_path
+        self.vox2_base_path = vox2_base_path
 
         # if we set the target dir, we will change the vox data info data from base path to target dir
-        VoxCeleb1.csv_path = os.path.join(
-            target_dir, "voxceleb", 'csv') if target_dir else VoxCeleb1.csv_path
-        VoxCeleb1.meta_path = os.path.join(
+        VoxCeleb.csv_path = os.path.join(
+            target_dir, "voxceleb", 'csv') if target_dir else VoxCeleb.csv_path
+        VoxCeleb.meta_path = os.path.join(
             target_dir, "voxceleb",
-            'meta') if target_dir else VoxCeleb1.meta_path
-        VoxCeleb1.veri_test_file = os.path.join(VoxCeleb1.meta_path,
-                                                'veri_test2.txt')
+            'meta') if target_dir else VoxCeleb.meta_path
+        VoxCeleb.veri_test_file = os.path.join(VoxCeleb.meta_path,
+                                               'veri_test2.txt')
         # self._data = self._get_data()[:1000]  # KP: Small dataset test.
         self._data = self._get_data()
-        super(VoxCeleb1, self).__init__()
+        super(VoxCeleb, self).__init__()
 
         # Set up a seed to reproduce training or predicting result.
         # random.seed(seed)
@@ -300,7 +311,14 @@ def prepare_data(self):
         # get all the train and dev audios file path
         audio_files = []
         speakers = set()
-        for path in [self.wav_path]:
+        for path in [self.wav_path, self.vox2_base_path]:
+            # if a path is not set (e.g. the vox2 base path is None) or does not exist,
+            # we log a warning and skip it
+            if not path or not os.path.exists(path):
+                logger.warning(
+                    f"{path} is an invalid path, please check again, "
+                    "and we will ignore the vox2 base path")
+                continue
             for file in glob.glob(
                     os.path.join(path, "**", "*.wav"), recursive=True):
                 spk = file.split('/wav/')[1].split('/')[0]

diff --git a/...s/ecapa-tdnn/extract_speaker_embedding.py → ...s/ecapa_tdnn/extract_speaker_embedding.py b/...s/ecapa-tdnn/extract_speaker_embedding.py → ...s/ecapa_tdnn/extract_speaker_embedding.py
@@ -28,6 +28,7 @@
 
 logger = Log(__name__).getlog()
 
+
 def extract_audio_embedding(args, config):
     # stage 0: set the training device, cpu or gpu
     paddle.set_device(args.device)
@@ -83,7 +84,7 @@ def extract_audio_embedding(args, config):
                         choices=['cpu', 'gpu'],
                         default="gpu",
                         help="Select which device to train model, defaults to gpu.")
-    parser.add_argument("--config", 
+    parser.add_argument("--config",
                         default=None,
                         type=str,
                         help="configuration file")

diff --git a/...ecapa-tdnn/speaker_verification_cosine.py → ...ecapa_tdnn/speaker_verification_cosine.py b/...ecapa-tdnn/speaker_verification_cosine.py → ...ecapa_tdnn/speaker_verification_cosine.py
@@ -17,22 +17,23 @@
 
 import numpy as np
 import paddle
-from yacs.config import CfgNode
 import paddle.nn.functional as F
 from paddle.io import BatchSampler
 from paddle.io import DataLoader
 from tqdm import tqdm
+from yacs.config import CfgNode
 
-from paddleaudio.paddleaudio.datasets import VoxCeleb1
-from paddlespeech.s2t.utils.log import Log
+from paddleaudio.paddleaudio.datasets import VoxCeleb
 from paddleaudio.paddleaudio.metric import compute_eer
+from paddlespeech.s2t.utils.log import Log
 from paddlespeech.vector.io.batch import batch_feature_normalize
 from paddlespeech.vector.models.ecapa_tdnn import EcapaTdnn
 from paddlespeech.vector.modules.sid_model import SpeakerIdetification
 from paddlespeech.vector.training.seeding import seed_everything
 
 logger = Log(__name__).getlog()
 
+
 def main(args, config):
     # stage0: set the training device, cpu or gpu
     paddle.set_device(args.device)
@@ -44,7 +45,7 @@ def main(args, config):
 
     # stage2: build the speaker verification eval instance with backbone model
     model = SpeakerIdetification(
-        backbone=ecapa_tdnn, num_class=VoxCeleb1.num_speakers)
+        backbone=ecapa_tdnn, num_class=VoxCeleb.num_speakers)
 
     # stage3: load the pre-trained model
     args.load_checkpoint = os.path.abspath(
@@ -57,7 +58,7 @@ def main(args, config):
     logger.info(f'Checkpoint loaded from {args.load_checkpoint}')
 
     # stage4: construct the enroll and test dataloader
-    enroll_dataset = VoxCeleb1(
+    enroll_dataset = VoxCeleb(
         subset='enroll',
         target_dir=args.data_dir,
         feat_type='melspectrogram',
@@ -73,7 +74,7 @@ def main(args, config):
                     num_workers=config.num_workers,
                     return_list=True,)
 
-    test_dataset = VoxCeleb1(
+    test_dataset = VoxCeleb(
         subset='test',
         target_dir=args.data_dir,
         feat_type='melspectrogram',
@@ -145,7 +146,7 @@ def main(args, config):
     labels = []
     enrol_ids = []
     test_ids = []
-    with open(VoxCeleb1.veri_test_file, 'r') as f:
+    with open(VoxCeleb.veri_test_file, 'r') as f:
         for line in f.readlines():
             label, enrol_id, test_id = line.strip().split(' ')
             labels.append(int(label))
@@ -171,7 +172,7 @@ def main(args, config):
                         choices=['cpu', 'gpu'],
                         default="gpu",
                         help="Select which device to train model, defaults to gpu.")
-    parser.add_argument("--config", 
+    parser.add_argument("--config",
                         default=None,
                         type=str,
                         help="configuration file")