PaddlePaddle
diff --git a/‎examples/voxceleb/sv0/conf/ecapa_tdnn.yaml
Lines changed: 19 additions & 7 deletions b/‎examples/voxceleb/sv0/conf/ecapa_tdnn.yaml
Lines changed: 19 additions & 7 deletions
diff --git a/‎examples/voxceleb/sv0/local/data.sh
Lines changed: 18 additions & 0 deletions b/‎examples/voxceleb/sv0/local/data.sh
Lines changed: 18 additions & 0 deletions
diff --git a/‎examples/voxceleb/sv0/local/data_prepare.py
Lines changed: 20 additions & 19 deletions b/‎examples/voxceleb/sv0/local/data_prepare.py
Lines changed: 20 additions & 19 deletions
diff --git a/‎examples/voxceleb/sv0/local/emb.sh
Lines changed: 13 additions & 0 deletions b/‎examples/voxceleb/sv0/local/emb.sh
Lines changed: 13 additions & 0 deletions
diff --git a/‎examples/voxceleb/sv0/local/test.sh
Lines changed: 8 additions & 0 deletions b/‎examples/voxceleb/sv0/local/test.sh
Lines changed: 8 additions & 0 deletions
diff --git a/‎examples/voxceleb/sv0/local/train.sh
Lines changed: 22 additions & 0 deletions b/‎examples/voxceleb/sv0/local/train.sh
Lines changed: 22 additions & 0 deletions
diff --git a/‎examples/voxceleb/sv0/run.sh
Lines changed: 20 additions & 30 deletions b/‎examples/voxceleb/sv0/run.sh
Lines changed: 20 additions & 30 deletions
diff --git a/‎paddleaudio/paddleaudio/datasets/rirs_noises.py
Lines changed: 6 additions & 9 deletions b/‎paddleaudio/paddleaudio/datasets/rirs_noises.py
Lines changed: 6 additions & 9 deletions
@@ -1,7 +1,10 @@
 ###########################################
 #                Data                 #
 ###########################################
-batch_size: 32
+# we should explicitly specify the wav path of vox2 audio data converted from m4a
+vox2_base_path: 
+augment: True
+batch_size: 16
 num_workers: 2
 num_speakers: 7205 # 1211 vox1, 5994 vox2, 7205 vox1+2, test speakers: 41
 shuffle: True
@@ -11,10 +14,10 @@ random_chunk: True
 #                FEATURE EXTRACTION SETTING               #
 ###########################################################
 # currently, we only support fbank
-feature:
-  n_mels: 80
-  window_size: 400     #25ms, sample rate 16000, 25 * 16000 / 1000 = 400 
-  hop_length: 160     #10ms, sample rate 16000, 10 * 16000 / 1000 = 160
+sample_rate: 16000
+n_mels: 80
+window_size: 400     #25ms, sample rate 16000, 25 * 16000 / 1000 = 400 
+hop_length: 160     #10ms, sample rate 16000, 10 * 16000 / 1000 = 160
 
 ###########################################################
 #                       MODEL SETTING                     #
@@ -35,6 +38,15 @@ model:
 ###########################################
 seed: 1986 # taken from the speechbrain configuration
 epochs: 10
-save_interval: 10
-log_interval: 10
+save_interval: 1
+log_interval: 1
 learning_rate: 1e-8
+
+
+###########################################
+#                Testing                  #
+###########################################
+global_embedding_norm: True
+embedding_mean_norm: True
+embedding_std_norm: False
+
@@ -0,0 +1,18 @@
+#!/bin/bash
+
+stage=-1
+stop_stage=100
+
+. ${MAIN_ROOT}/utils/parse_options.sh || exit -1;
+
+dir=$1
+conf_path=$2
+mkdir -p ${dir}
+
+if [ ${stage} -le -1 ] && [ ${stop_stage} -ge -1 ]; then
+    # prepare the data for vox1 and vox2; vox2 must be converted from m4a to wav
+    # beforehand, using local/convert.sh
+    python3 local/data_prepare.py \
+                        --data-dir ${dir} \
+                        --config ${conf_path}
+fi 
@@ -14,57 +14,58 @@
 import argparse
 import os
 
-import numpy as np
 import paddle
+from yacs.config import CfgNode
 
-from paddleaudio.paddleaudio.datasets.voxceleb import VoxCeleb
+from paddleaudio.datasets.voxceleb import VoxCeleb
 from paddlespeech.s2t.utils.log import Log
 from paddlespeech.vector.io.augment import build_augment_pipeline
 from paddlespeech.vector.training.seeding import seed_everything
 
 logger = Log(__name__).getlog()
 
 
-def main(args):
+def main(args, config):
 
     # stage0: set the cpu device, all data prepare process will be done in cpu mode
     paddle.set_device("cpu")
     # set the random seed, it is a must for multiprocess training
-    seed_everything(args.seed)
+    seed_everything(config.seed)
 
     # stage 1: generate the voxceleb csv file
     # Note: this may raise a C++ exception, but the program will still run fine,
     # so we ignore the exception
     # we explicitly pass the vox2 base path to data prepare and generate the audio info
+    logger.info("start to generate the voxceleb dataset info")
     train_dataset = VoxCeleb(
-        'train', target_dir=args.data_dir, vox2_base_path=args.vox2_base_path)
-    dev_dataset = VoxCeleb(
-        'dev', target_dir=args.data_dir, vox2_base_path=args.vox2_base_path)
+        'train', target_dir=args.data_dir, vox2_base_path=config.vox2_base_path)
 
     # stage 2: generate the augment noise csv file
-    if args.augment:
+    if config.augment:
+        logger.info("start to generate the augment dataset info")
         augment_pipeline = build_augment_pipeline(target_dir=args.data_dir)
 
 
 if __name__ == "__main__":
     # yapf: disable
     parser = argparse.ArgumentParser(__doc__)
-    parser.add_argument("--seed",
-                        default=0,
-                        type=int,
-                        help="random seed for paddle, numpy and python random package")
     parser.add_argument("--data-dir",
                         default="./data/",
                         type=str,
                         help="data directory")
-    parser.add_argument("--vox2-base-path",
+    parser.add_argument("--config",
                         default=None,
                         type=str,
-                        help="vox2 base path, where is store the wav audio")
-    parser.add_argument("--augment",
-                        action="store_true",
-                        default=False,
-                        help="Apply audio augments.")
+                        help="configuration file")
     args = parser.parse_args()
     # yapf: enable
-    main(args)
+
+    # https://yaml.org/type/float.html
+    config = CfgNode(new_allowed=True)
+    if args.config:
+        config.merge_from_file(args.config)
+
+    config.freeze()
+    print(config)
+
+    main(args, config)
@@ -0,0 +1,13 @@
+#!/bin/bash
+. ./path.sh
+
+exp_dir=exp/ecapa-tdnn-vox12-big//epoch_10/            # experiment directory
+conf_path=conf/ecapa_tdnn.yaml
+audio_path="demo/voxceleb/00001.wav"
+
+source ${MAIN_ROOT}/utils/parse_options.sh || exit 1;
+
+# extract the audio embedding
+python3 ${BIN_DIR}/extract_emb.py --device "gpu" \
+          --config ${conf_path} \
+          --audio-path ${audio_path} --load-checkpoint ${exp_dir}
@@ -0,0 +1,8 @@
+dir=$1
+exp_dir=$2
+conf_path=$3
+
+python3 ${BIN_DIR}/test.py \
+        --config ${conf_path} \
+        --data-dir ${dir} \
+        --load-checkpoint ${exp_dir}
@@ -0,0 +1,22 @@
+#!/bin/bash
+
+dir=$1
+exp_dir=$2
+conf_path=$3
+
+ngpu=$(echo $CUDA_VISIBLE_DEVICES | awk -F "," '{print NF}')
+echo "using $ngpu gpus..."
+
+# train the speaker identification task with voxceleb data
+# Note: we will store the log file in exp/log directory
+python3 -m paddle.distributed.launch --gpus=$CUDA_VISIBLE_DEVICES \
+    ${BIN_DIR}/train.py --device "gpu" --checkpoint-dir ${exp_dir} --augment \
+    --data-dir ${dir} --config ${conf_path}
+
+
+if [ $? -ne 0 ]; then
+    echo "Failed in training!"
+    exit 1
+fi
+
+exit 0
@@ -18,7 +18,7 @@ set -e
 
 #######################################################################
 # stage 0: data prepare, including voxceleb1 download and generate {train,dev,enroll,test}.csv
-#          voxceleb2 data is m4a format, so we need user to convert the m4a to wav yourselves as described in Readme.md
+#          voxceleb2 data is in m4a format, so users need to convert the m4a files to wav themselves, as described in Readme.md, using the script local/convert.sh
 # stage 1: train the speaker identification model
 # stage 2: test speaker identification 
 # stage 3: extract the training embedding to train the LDA and PLDA
@@ -30,49 +30,39 @@ set -e
 # and put all of them to ${PPAUDIO_HOME}/datasets/vox2
 # we will find the wav from ${PPAUDIO_HOME}/datasets/vox1/wav and ${PPAUDIO_HOME}/datasets/vox2/wav
 # export PPAUDIO_HOME=
-
 stage=0
+stop_stage=50
+
 # data directory
 # if we set the variable ${dir}, we will store the wav info to this directory
 # otherwise, we will store the wav info to vox1 and vox2 directory respectively
-dir=data/                          
-exp_dir=exp/ecapa-tdnn/            # experiment directory
-
 # vox2 wav path, we must convert the m4a format to wav format 
-# and store them in the ${PPAUDIO_HOME}/datasets/vox2/wav/ directory
-vox2_base_path=${PPAUDIO_HOME}/datasets/vox2/wav/
-mkdir -p ${dir}
+# dir=data-demo/                          # data info directory    
+dir=demo/                          # data info directory   
+
+exp_dir=exp/ecapa-tdnn-vox12-big//            # experiment directory
+conf_path=conf/ecapa_tdnn.yaml          
+gpus=0,1,2,3
+
+source ${MAIN_ROOT}/utils/parse_options.sh || exit 1;
+
 mkdir -p ${exp_dir}
 
-if [ $stage -le 0 ]; then 
+if [ $stage -le 0 ] && [ ${stop_stage} -ge 0 ]; then 
      # stage 0: data prepare for vox1 and vox2, vox2 must be converted from m4a to wav
-     python3 local/data_prepare.py \
-     --data-dir ${dir} --augment --vox2-base-path ${vox2_base_path} \
-     --config conf/ecapa_tdnn.yaml
+     # and we should specify the vox2 wav path (vox2_base_path) in the config passed to local/data.sh
+     bash ./local/data.sh ${dir} ${conf_path}|| exit -1;
 fi 
 
-if [ $stage -le 1 ]; then
+if [ $stage -le 1 ] && [ ${stop_stage} -ge 1 ]; then
      # stage 1: train the speaker identification model
-     python3 \
-          -m paddle.distributed.launch --gpus=0,1,2,3 \
-          ${BIN_DIR}/train.py --device "gpu" --checkpoint-dir ${exp_dir} --augment \
-          --data-dir ${dir} --config conf/ecapa_tdnn.yaml
+     CUDA_VISIBLE_DEVICES=${gpus} bash ./local/train.sh ${dir} ${exp_dir} ${conf_path} 
 fi
 
 if [ $stage -le 2 ]; then
-     # stage 1: get the speaker verification scores with cosine function
-     python3 \
-          ${BIN_DIR}/speaker_verification_cosine.py\
-          --config conf/ecapa_tdnn.yaml \
-          --data-dir ${dir} --load-checkpoint ${exp_dir}/epoch_10/
-fi
-
-if [ $stage -le 3 ]; then
-     # stage 3: extract the audio embedding
-     python3 \
-          ${BIN_DIR}/extract_speaker_embedding.py\
-          --config conf/ecapa_tdnn.yaml \
-          --audio-path "demo/csv/00001.wav" --load-checkpoint ${exp_dir}/epoch_60/
+     # stage 2: get the speaker verification scores with cosine function
+     #          now we only support use cosine to get the scores
+     CUDA_VISIBLE_DEVICES=0 bash ./local/test.sh ${dir} ${exp_dir} ${conf_path}
 fi
 
 # if [ $stage -le 3 ]; then
 
@@ -25,13 +25,10 @@
 
 from ..backends import load as load_audio
 from ..backends import save as save_wav
-from .dataset import feat_funcs
 from ..utils import DATA_HOME
 from ..utils import decompress
-from paddlespeech.s2t.utils.log import Log
-from paddlespeech.vector.utils.download import download_and_decompress
-
-logger = Log(__name__).getlog()
+from ..utils.download import download_and_decompress
+from .dataset import feat_funcs
 
 __all__ = ['OpenRIRNoise']
 
@@ -80,17 +77,17 @@ def __init__(self,
 
     def _get_data(self):
         # Download audio files.
-        logger.info(f"rirs noises base path: {self.base_path}")
+        print(f"rirs noises base path: {self.base_path}")
         if not os.path.isdir(self.base_path):
             download_and_decompress(
                 self.archieves, self.base_path, decompress=True)
         else:
-            logger.info(
+            print(
                 f"{self.base_path} already exists, we will not download and decompress again"
             )
 
         # Data preparation.
-        logger.info(f"prepare the csv to {self.csv_path}")
+        print(f"prepare the csv to {self.csv_path}")
         if not os.path.isdir(self.csv_path):
             os.makedirs(self.csv_path)
             self.prepare_data()
@@ -161,7 +158,7 @@ def generate_csv(self,
                      wav_files: List[str],
                      output_file: str,
                      split_chunks: bool=True):
-        logger.info(f'Generating csv: {output_file}')
+        print(f'Generating csv: {output_file}')
         header = ["id", "duration", "wav"]
 
         infos = list(