Skip to content

Commit 506d26a

Browse files
committed
change the code style to s2t code style, test=doc
1 parent 7eb8fa7 commit 506d26a

File tree

15 files changed

+216
-208
lines changed

15 files changed

+216
-208
lines changed

examples/voxceleb/sv0/conf/ecapa_tdnn.yaml

Lines changed: 19 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,10 @@
11
###########################################
22
# Data #
33
###########################################
4-
batch_size: 32
4+
# we should explicitly specify the wav path of vox2 audio data converted from m4a
5+
vox2_base_path:
6+
augment: True
7+
batch_size: 16
58
num_workers: 2
69
num_speakers: 7205 # 1211 vox1, 5994 vox2, 7205 vox1+2, test speakers: 41
710
shuffle: True
@@ -11,10 +14,10 @@ random_chunk: True
1114
# FEATURE EXTRACTION SETTING #
1215
###########################################################
1316
# currently, we only support fbank
14-
feature:
15-
n_mels: 80
16-
window_size: 400 #25ms, sample rate 16000, 25 * 16000 / 1000 = 400
17-
hop_length: 160 #10ms, sample rate 16000, 10 * 16000 / 1000 = 160
17+
sample_rate: 16000
18+
n_mels: 80
19+
window_size: 400 #25ms, sample rate 16000, 25 * 16000 / 1000 = 400
20+
hop_length: 160 #10ms, sample rate 16000, 10 * 16000 / 1000 = 160
1821

1922
###########################################################
2023
# MODEL SETTING #
@@ -35,6 +38,15 @@ model:
3538
###########################################
3639
seed: 1986 # taken from the speechbrain configuration
3740
epochs: 10
38-
save_interval: 10
39-
log_interval: 10
41+
save_interval: 1
42+
log_interval: 1
4043
learning_rate: 1e-8
44+
45+
46+
###########################################
47+
# Testing #
48+
###########################################
49+
global_embedding_norm: True
50+
embedding_mean_norm: True
51+
embedding_std_norm: False
52+

examples/voxceleb/sv0/local/data.sh

Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,18 @@
1+
#!/bin/bash
2+
3+
stage=-1
4+
stop_stage=100
5+
6+
. ${MAIN_ROOT}/utils/parse_options.sh || exit -1;
7+
8+
dir=$1
9+
conf_path=$2
10+
mkdir -p ${dir}
11+
12+
if [ ${stage} -le -1 ] && [ ${stop_stage} -ge -1 ]; then
13+
# data prepare for vox1 and vox2, vox2 must be converted from m4a to wav
14+
# we should use local/convert.sh to convert m4a to wav
15+
python3 local/data_prepare.py \
16+
--data-dir ${dir} \
17+
--config ${conf_path}
18+
fi

examples/voxceleb/sv0/local/data_prepare.py

Lines changed: 20 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -14,57 +14,58 @@
1414
import argparse
1515
import os
1616

17-
import numpy as np
1817
import paddle
18+
from yacs.config import CfgNode
1919

20-
from paddleaudio.paddleaudio.datasets.voxceleb import VoxCeleb
20+
from paddleaudio.datasets.voxceleb import VoxCeleb
2121
from paddlespeech.s2t.utils.log import Log
2222
from paddlespeech.vector.io.augment import build_augment_pipeline
2323
from paddlespeech.vector.training.seeding import seed_everything
2424

2525
logger = Log(__name__).getlog()
2626

2727

28-
def main(args):
28+
def main(args, config):
2929

3030
# stage0: set the cpu device, all data prepare process will be done in cpu mode
3131
paddle.set_device("cpu")
3232
# set the random seed, it is a must for multiprocess training
33-
seed_everything(args.seed)
33+
seed_everything(config.seed)
3434

3535
# stage 1: generate the voxceleb csv file
3636
# Note: this may raise a C++ exception, but the program will still execute fine
3737
# so we ignore the exception
3838
# we explicitly pass the vox2 base path to data prepare and generate the audio info
39+
logger.info("start to generate the voxceleb dataset info")
3940
train_dataset = VoxCeleb(
40-
'train', target_dir=args.data_dir, vox2_base_path=args.vox2_base_path)
41-
dev_dataset = VoxCeleb(
42-
'dev', target_dir=args.data_dir, vox2_base_path=args.vox2_base_path)
41+
'train', target_dir=args.data_dir, vox2_base_path=config.vox2_base_path)
4342

4443
# stage 2: generate the augment noise csv file
45-
if args.augment:
44+
if config.augment:
45+
logger.info("start to generate the augment dataset info")
4646
augment_pipeline = build_augment_pipeline(target_dir=args.data_dir)
4747

4848

4949
if __name__ == "__main__":
5050
# yapf: disable
5151
parser = argparse.ArgumentParser(__doc__)
52-
parser.add_argument("--seed",
53-
default=0,
54-
type=int,
55-
help="random seed for paddle, numpy and python random package")
5652
parser.add_argument("--data-dir",
5753
default="./data/",
5854
type=str,
5955
help="data directory")
60-
parser.add_argument("--vox2-base-path",
56+
parser.add_argument("--config",
6157
default=None,
6258
type=str,
63-
help="vox2 base path, where is store the wav audio")
64-
parser.add_argument("--augment",
65-
action="store_true",
66-
default=False,
67-
help="Apply audio augments.")
59+
help="configuration file")
6860
args = parser.parse_args()
6961
# yapf: enable
70-
main(args)
62+
63+
# https://yaml.org/type/float.html
64+
config = CfgNode(new_allowed=True)
65+
if args.config:
66+
config.merge_from_file(args.config)
67+
68+
config.freeze()
69+
print(config)
70+
71+
main(args, config)

examples/voxceleb/sv0/local/emb.sh

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,13 @@
1+
#!/bin/bash
2+
. ./path.sh
3+
4+
exp_dir=exp/ecapa-tdnn-vox12-big//epoch_10/ # experiment directory
5+
conf_path=conf/ecapa_tdnn.yaml
6+
audio_path="demo/voxceleb/00001.wav"
7+
8+
source ${MAIN_ROOT}/utils/parse_options.sh || exit 1;
9+
10+
# extract the audio embedding
11+
python3 ${BIN_DIR}/extract_emb.py --device "gpu" \
12+
--config ${conf_path} \
13+
--audio-path ${audio_path} --load-checkpoint ${exp_dir}

examples/voxceleb/sv0/local/test.sh

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,8 @@
1+
dir=$1
2+
exp_dir=$2
3+
conf_path=$3
4+
5+
python3 ${BIN_DIR}/test.py \
6+
--config ${conf_path} \
7+
--data-dir ${dir} \
8+
--load-checkpoint ${exp_dir}

examples/voxceleb/sv0/local/train.sh

Lines changed: 22 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,22 @@
1+
#!/bin/bash
2+
3+
dir=$1
4+
exp_dir=$2
5+
conf_path=$3
6+
7+
ngpu=$(echo $CUDA_VISIBLE_DEVICES | awk -F "," '{print NF}')
8+
echo "using $ngpu gpus..."
9+
10+
# train the speaker identification task with voxceleb data
11+
# Note: we will store the log file in exp/log directory
12+
python3 -m paddle.distributed.launch --gpus=$CUDA_VISIBLE_DEVICES \
13+
${BIN_DIR}/train.py --device "gpu" --checkpoint-dir ${exp_dir} --augment \
14+
--data-dir ${dir} --config ${conf_path}
15+
16+
17+
if [ $? -ne 0 ]; then
18+
echo "Failed in training!"
19+
exit 1
20+
fi
21+
22+
exit 0

examples/voxceleb/sv0/run.sh

Lines changed: 20 additions & 30 deletions
Original file line numberDiff line numberDiff line change
@@ -18,7 +18,7 @@ set -e
1818

1919
#######################################################################
2020
# stage 0: data prepare, including voxceleb1 download and generate {train,dev,enroll,test}.csv
21-
# voxceleb2 data is m4a format, so we need user to convert the m4a to wav yourselves as described in Readme.md
21+
# voxceleb2 data is in m4a format, so users need to convert the m4a files to wav themselves with the script local/convert.sh, as described in Readme.md
2222
# stage 1: train the speaker identification model
2323
# stage 2: test speaker identification
2424
# stage 3: extract the training embedding to train the LDA and PLDA
@@ -30,49 +30,39 @@ set -e
3030
# and put all of them to ${PPAUDIO_HOME}/datasets/vox2
3131
# we will find the wav from ${PPAUDIO_HOME}/datasets/vox1/wav and ${PPAUDIO_HOME}/datasets/vox2/wav
3232
# export PPAUDIO_HOME=
33-
3433
stage=0
34+
stop_stage=50
35+
3536
# data directory
3637
# if we set the variable ${dir}, we will store the wav info to this directory
3738
# otherwise, we will store the wav info to vox1 and vox2 directory respectively
38-
dir=data/
39-
exp_dir=exp/ecapa-tdnn/ # experiment directory
40-
4139
# vox2 wav path, we must convert the m4a format to wav format
42-
# and store them in the ${PPAUDIO_HOME}/datasets/vox2/wav/ directory
43-
vox2_base_path=${PPAUDIO_HOME}/datasets/vox2/wav/
44-
mkdir -p ${dir}
40+
# dir=data-demo/ # data info directory
41+
dir=demo/ # data info directory
42+
43+
exp_dir=exp/ecapa-tdnn-vox12-big// # experiment directory
44+
conf_path=conf/ecapa_tdnn.yaml
45+
gpus=0,1,2,3
46+
47+
source ${MAIN_ROOT}/utils/parse_options.sh || exit 1;
48+
4549
mkdir -p ${exp_dir}
4650

47-
if [ $stage -le 0 ]; then
51+
if [ $stage -le 0 ] && [ ${stop_stage} -ge 0 ]; then
4852
# stage 0: data prepare for vox1 and vox2, vox2 must be converted from m4a to wav
49-
python3 local/data_prepare.py \
50-
--data-dir ${dir} --augment --vox2-base-path ${vox2_base_path} \
51-
--config conf/ecapa_tdnn.yaml
53+
# and we should specify the vox2 data in the data.sh
54+
bash ./local/data.sh ${dir} ${conf_path}|| exit -1;
5255
fi
5356

54-
if [ $stage -le 1 ]; then
57+
if [ $stage -le 1 ] && [ ${stop_stage} -ge 1 ]; then
5558
# stage 1: train the speaker identification model
56-
python3 \
57-
-m paddle.distributed.launch --gpus=0,1,2,3 \
58-
${BIN_DIR}/train.py --device "gpu" --checkpoint-dir ${exp_dir} --augment \
59-
--data-dir ${dir} --config conf/ecapa_tdnn.yaml
59+
CUDA_VISIBLE_DEVICES=${gpus} bash ./local/train.sh ${dir} ${exp_dir} ${conf_path}
6060
fi
6161

6262
if [ $stage -le 2 ]; then
63-
# stage 1: get the speaker verification scores with cosine function
64-
python3 \
65-
${BIN_DIR}/speaker_verification_cosine.py\
66-
--config conf/ecapa_tdnn.yaml \
67-
--data-dir ${dir} --load-checkpoint ${exp_dir}/epoch_10/
68-
fi
69-
70-
if [ $stage -le 3 ]; then
71-
# stage 3: extract the audio embedding
72-
python3 \
73-
${BIN_DIR}/extract_speaker_embedding.py\
74-
--config conf/ecapa_tdnn.yaml \
75-
--audio-path "demo/csv/00001.wav" --load-checkpoint ${exp_dir}/epoch_60/
63+
# stage 2: get the speaker verification scores with cosine function
64+
# currently we only support cosine similarity to get the scores
65+
CUDA_VISIBLE_DEVICES=0 bash ./local/test.sh ${dir} ${exp_dir} ${conf_path}
7666
fi
7767

7868
# if [ $stage -le 3 ]; then

paddleaudio/paddleaudio/datasets/rirs_noises.py

Lines changed: 6 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -25,13 +25,10 @@
2525

2626
from ..backends import load as load_audio
2727
from ..backends import save as save_wav
28-
from .dataset import feat_funcs
2928
from ..utils import DATA_HOME
3029
from ..utils import decompress
31-
from paddlespeech.s2t.utils.log import Log
32-
from paddlespeech.vector.utils.download import download_and_decompress
33-
34-
logger = Log(__name__).getlog()
30+
from ..utils.download import download_and_decompress
31+
from .dataset import feat_funcs
3532

3633
__all__ = ['OpenRIRNoise']
3734

@@ -80,17 +77,17 @@ def __init__(self,
8077

8178
def _get_data(self):
8279
# Download audio files.
83-
logger.info(f"rirs noises base path: {self.base_path}")
80+
print(f"rirs noises base path: {self.base_path}")
8481
if not os.path.isdir(self.base_path):
8582
download_and_decompress(
8683
self.archieves, self.base_path, decompress=True)
8784
else:
88-
logger.info(
85+
print(
8986
f"{self.base_path} already exists, we will not download and decompress again"
9087
)
9188

9289
# Data preparation.
93-
logger.info(f"prepare the csv to {self.csv_path}")
90+
print(f"prepare the csv to {self.csv_path}")
9491
if not os.path.isdir(self.csv_path):
9592
os.makedirs(self.csv_path)
9693
self.prepare_data()
@@ -161,7 +158,7 @@ def generate_csv(self,
161158
wav_files: List[str],
162159
output_file: str,
163160
split_chunks: bool=True):
164-
logger.info(f'Generating csv: {output_file}')
161+
print(f'Generating csv: {output_file}')
165162
header = ["id", "duration", "wav"]
166163

167164
infos = list(

0 commit comments

Comments
 (0)