Skip to content

Commit e6e72b4

Browse files
authored
Merge pull request #1523 from Honei/vox12
[vector] ecapa-tdnn on voxceleb
2 parents 45cbada + faf6b8d commit e6e72b4

File tree

36 files changed

+4236
-13
lines changed

36 files changed

+4236
-13
lines changed

dataset/voxceleb/voxceleb1.py

Lines changed: 36 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -59,12 +59,17 @@
5959
TEST_LIST = {"vox1_test_wav.zip": "185fdc63c3c739954633d50379a3d102"}
6060
TEST_TARGET_DATA = "vox1_test_wav.zip vox1_test_wav.zip 185fdc63c3c739954633d50379a3d102"
6161

62-
# kaldi trial
63-
# this trial file is organized by kaldi according the official file,
64-
# which is a little different with the official trial veri_test2.txt
65-
KALDI_BASE_URL = "http://www.openslr.org/resources/49/"
66-
TRIAL_LIST = {"voxceleb1_test_v2.txt": "29fc7cc1c5d59f0816dc15d6e8be60f7"}
67-
TRIAL_TARGET_DATA = "voxceleb1_test_v2.txt voxceleb1_test_v2.txt 29fc7cc1c5d59f0816dc15d6e8be60f7"
62+
# voxceleb trial
63+
64+
TRIAL_BASE_URL = "https://www.robots.ox.ac.uk/~vgg/data/voxceleb/meta/"
65+
TRIAL_LIST = {
66+
"veri_test.txt": "29fc7cc1c5d59f0816dc15d6e8be60f7", # voxceleb1
67+
"veri_test2.txt": "b73110731c9223c1461fe49cb48dddfc", # voxceleb1(cleaned)
68+
"list_test_hard.txt": "21c341b6b2168eea2634df0fb4b8fff1", # voxceleb1-H
69+
"list_test_hard2.txt": "857790e09d579a68eb2e339a090343c8", # voxceleb1-H(cleaned)
70+
"list_test_all.txt": "b9ecf7aa49d4b656aa927a8092844e4a", # voxceleb1-E
71+
"list_test_all2.txt": "a53e059deb562ffcfc092bf5d90d9f3a" # voxceleb1-E(cleaned)
72+
}
6873

6974
parser = argparse.ArgumentParser(description=__doc__)
7075
parser.add_argument(
@@ -82,7 +87,7 @@
8287

8388

8489
def create_manifest(data_dir, manifest_path_prefix):
85-
print("Creating manifest %s ..." % manifest_path_prefix)
90+
print(f"Creating manifest {manifest_path_prefix} from {data_dir}")
8691
json_lines = []
8792
data_path = os.path.join(data_dir, "wav", "**", "*.wav")
8893
total_sec = 0.0
@@ -114,6 +119,9 @@ def create_manifest(data_dir, manifest_path_prefix):
114119
# voxceleb1 is given explicit in the path
115120
data_dir_name = Path(data_dir).name
116121
manifest_path_prefix = manifest_path_prefix + "." + data_dir_name
122+
if not os.path.exists(os.path.dirname(manifest_path_prefix)):
123+
os.makedirs(os.path.dirname(manifest_path_prefix))
124+
117125
with codecs.open(manifest_path_prefix, 'w', encoding='utf-8') as f:
118126
for line in json_lines:
119127
f.write(line + "\n")
@@ -133,11 +141,13 @@ def create_manifest(data_dir, manifest_path_prefix):
133141
def prepare_dataset(base_url, data_list, target_dir, manifest_path,
134142
target_data):
135143
if not os.path.exists(target_dir):
136-
os.mkdir(target_dir)
144+
os.makedirs(target_dir)
137145

138146
# wav directory already exists, it need do nothing
147+
# we will download the voxceleb1 data to ${target_dir}/vox1/dev/ or ${target_dir}/vox1/test directory
139148
if not os.path.exists(os.path.join(target_dir, "wav")):
140149
# download all dataset part
150+
print("start to download the vox1 dev zip package")
141151
for zip_part in data_list.keys():
142152
download_url = " --no-check-certificate " + base_url + "/" + zip_part
143153
download(
@@ -166,25 +176,42 @@ def prepare_dataset(base_url, data_list, target_dir, manifest_path,
166176
# create the manifest file
167177
create_manifest(data_dir=target_dir, manifest_path_prefix=manifest_path)
168178

179+
def prepare_trial(base_url, data_list, target_dir):
    """Download every trial list that is not yet present in ``target_dir``.

    Args:
        base_url (str): URL prefix under which the trial files are published.
            A trailing "/" is tolerated.
        data_list (dict): Maps each trial file name to its md5 checksum.
        target_dir (str): Directory that receives the trial files. An empty
            string is treated as the current working directory.
    """
    # os.path.dirname() of a bare file name is "", and os.makedirs("") raises
    # FileNotFoundError, so fall back to the current directory in that case.
    target_dir = target_dir or "."
    os.makedirs(target_dir, exist_ok=True)

    # Avoid a doubled "/" when base_url already ends with a slash.
    url_prefix = base_url.rstrip("/")
    for trial, md5sum in data_list.items():
        target_trial = os.path.join(target_dir, trial)
        if os.path.exists(target_trial):
            # This trial list was fetched earlier, nothing to do.
            continue
        # The command-line option is passed to download() as part of the URL.
        # NOTE(review): presumably download() hands the string to wget --
        # confirm against utils.utility.download.
        download_url = " --no-check-certificate " + url_prefix + "/" + trial
        download(url=download_url, md5sum=md5sum, target_dir=target_dir)
170188
def main():
    """Prepare the VoxCeleb1 dev and test data, then fetch the trial lists."""
    target_root = args.target_dir
    if target_root.startswith('~'):
        target_root = os.path.expanduser(target_root)
        args.target_dir = target_root

    # (sub-directory, archive list, target data) for the dev and test parts
    subsets = (
        ("dev", DEV_LIST, DEV_TARGET_DATA),
        ("test", TEST_LIST, TEST_TARGET_DATA),
    )
    for subset, archives, target_data in subsets:
        prepare_dataset(
            base_url=BASE_URL,
            data_list=archives,
            target_dir=os.path.join(target_root, subset),
            manifest_path=args.manifest_prefix,
            target_data=target_data)

    # the trial lists are stored in the directory of the manifest prefix
    trial_dir = os.path.dirname(args.manifest_prefix)
    prepare_trial(
        base_url=TRIAL_BASE_URL, data_list=TRIAL_LIST, target_dir=trial_dir)

    print("Manifest prepare done!")
189216

190217

dataset/voxceleb/voxceleb2.py

Lines changed: 163 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,163 @@
1+
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
2+
#
3+
# Licensed under the Apache License, Version 2.0 (the "License");
4+
# you may not use this file except in compliance with the License.
5+
# You may obtain a copy of the License at
6+
#
7+
# http://www.apache.org/licenses/LICENSE-2.0
8+
#
9+
# Unless required by applicable law or agreed to in writing, software
10+
# distributed under the License is distributed on an "AS IS" BASIS,
11+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+
# See the License for the specific language governing permissions and
13+
# limitations under the License.
14+
"""Prepare VoxCeleb2 dataset
15+
16+
Download and unpack the voxceleb2 data files.
17+
Voxceleb2 data is stored as the m4a format,
18+
so we need convert the m4a to wav with the convert.sh scripts
19+
"""
20+
import argparse
21+
import codecs
22+
import glob
23+
import json
24+
import os
25+
import subprocess
26+
from pathlib import Path
27+
28+
import soundfile
29+
30+
from utils.utility import check_md5sum
31+
from utils.utility import download
32+
from utils.utility import unzip
33+
34+
# By default all the data is downloaded below the current working directory.
DATA_HOME = os.path.expanduser('.')

# The command-line option is deliberately part of the string: the value is
# passed as the ``url`` argument of download().
# NOTE(review): presumably download() hands the string to wget -- confirm.
BASE_URL = "--no-check-certificate https://www.robots.ox.ac.uk/~vgg/data/voxceleb/data/"

# dev data
# BASE_URL already ends with "/", so the file name is appended directly
# (prepending another "/" produced ".../data//vox2_aac.zip").
DEV_DATA_URL = BASE_URL + 'vox2_aac.zip'
DEV_MD5SUM = "bbc063c46078a602ca71605645c2a402"

# test data
TEST_DATA_URL = BASE_URL + 'vox2_test_aac.zip'
TEST_MD5SUM = "0d2b3ea430a821c33263b5ea37ede312"

# Command-line interface; the module docstring is used as the description.
parser = argparse.ArgumentParser(description=__doc__)
parser.add_argument(
    "--target_dir",
    default=DATA_HOME + "/voxceleb2/",
    type=str,
    help="Directory to save the voxceleb2 dataset. (default: %(default)s)")
parser.add_argument(
    "--manifest_prefix",
    default="manifest",
    type=str,
    help="Filepath prefix for output manifests. (default: %(default)s)")
parser.add_argument(
    "--download",
    default=False,
    action="store_true",
    help="Download the voxceleb2 dataset. (default: %(default)s)")
parser.add_argument(
    "--generate",
    default=False,
    action="store_true",
    help="Generate the manifest files. (default: %(default)s)")
67+
68+
# The command line is parsed when the module is loaded; main() reads this
# module-level namespace.
args = parser.parse_args()
69+
70+
71+
def create_manifest(data_dir, manifest_path_prefix):
    """Write a JSON-lines manifest and a ``.meta`` summary for the wav files.

    Every ``*.wav`` file found recursively below ``data_dir`` becomes one JSON
    line with the keys ``utt``, ``utt2spk``, ``feat``, ``feat_shape`` and
    ``text``. The manifest is written to
    ``<manifest_path_prefix>.<last component of data_dir>`` and the summary to
    ``voxceleb2.<last component of data_dir>.meta`` in the same directory.

    Args:
        data_dir (str): Root directory that is searched for wav files.
        manifest_path_prefix (str): Path prefix of the manifest file. It may
            be a bare file name, in which case the files are written to the
            current working directory.
    """
    print("Creating manifest %s ..." % manifest_path_prefix)
    json_lines = []
    data_path = os.path.join(data_dir, "**", "*.wav")
    total_sec = 0.0
    total_text = 0.0
    total_num = 0
    speakers = set()
    for audio_path in glob.glob(data_path, recursive=True):
        # The last three path components identify the utterance, e.g.
        # id00012/21Uxsk56VDQ/00001.wav; the first of them is the speaker id.
        components = audio_path.split("/")
        audio_id = "-".join(components[-3:])
        utt2spk = components[-3]
        duration = soundfile.info(audio_path).duration
        text = ""
        json_lines.append(
            json.dumps(
                {
                    "utt": audio_id,
                    "utt2spk": str(utt2spk),
                    "feat": audio_path,
                    "feat_shape": (duration, ),
                    "text": text  # compatible with asr data format
                },
                ensure_ascii=False))

        total_sec += duration
        total_text += len(text)
        total_num += 1
        speakers.add(utt2spk)

    # The manifest is named after the last path component of data_dir.
    data_dir_name = Path(data_dir).name
    manifest_path = manifest_path_prefix + "." + data_dir_name

    # A bare file name has an empty dirname, and os.makedirs("") raises
    # FileNotFoundError, so only create the directory when there is one.
    manifest_dir = os.path.dirname(manifest_path)
    if manifest_dir:
        os.makedirs(manifest_dir, exist_ok=True)
    with codecs.open(manifest_path, 'w', encoding='utf-8') as f:
        for line in json_lines:
            f.write(line + "\n")

    # Guard the ratios so that an empty data_dir yields a summary instead of
    # a ZeroDivisionError.
    text_per_sec = total_text / total_sec if total_sec else 0.0
    sec_per_utt = total_sec / total_num if total_num else 0.0

    meta_path = os.path.join(manifest_dir,
                             "voxceleb2." + data_dir_name) + ".meta"
    with codecs.open(meta_path, 'w', encoding='utf-8') as f:
        print(f"{total_num} utts", file=f)
        print(f"{len(speakers)} speakers", file=f)
        print(f"{total_sec / (60 * 60)} h", file=f)
        print(f"{total_text} text", file=f)
        print(f"{text_per_sec} text/sec", file=f)
        print(f"{sec_per_utt} sec/utt", file=f)
121+
122+
123+
def download_dataset(url, md5sum, target_dir, dataset):
    """Download one VoxCeleb2 archive unless its data directory already exists.

    Args:
        url (str): Download URL, forwarded to ``download``.
        md5sum (str): md5 checksum, forwarded to ``download``.
        target_dir (str): Directory that receives the downloaded archive.
        dataset (str): Name of the part being fetched; main() passes "dev"
            or "test". The download is skipped when
            ``<target_dir>/<dataset>`` already exists.
    """
    # exist_ok avoids the check-then-create race of a separate exists() test;
    # an empty target_dir means the current directory, which needs no mkdir.
    if target_dir:
        os.makedirs(target_dir, exist_ok=True)

    dataset_dir = os.path.join(target_dir, dataset)
    print("target dir {}".format(dataset_dir))
    if os.path.exists(dataset_dir):
        # The data directory is already there, nothing to download.
        return

    filepath = download(url, md5sum, target_dir)
    # According to the original author's note, the dev archive unpacks into a
    # "dev" directory on its own, whereas the test archive unpacks into "aac";
    # the test archive is therefore extracted into ${target_dir}/test here.
    # NOTE(review): the dev archive is not unzipped in this function --
    # confirm that download() or a later step extracts it.
    if dataset == "test":
        unzip(filepath, os.path.join(target_dir, "test"))
136+
137+
138+
def main():
    """Download VoxCeleb2 and/or generate its manifest, as the flags request."""
    target_dir = args.target_dir
    if target_dir.startswith('~'):
        target_dir = os.path.expanduser(target_dir)
        args.target_dir = target_dir

    # fetch the vox2 dev and test archives when --download is given
    print("download: {}".format(args.download))
    if args.download:
        archives = (
            (DEV_DATA_URL, DEV_MD5SUM, "dev"),
            (TEST_DATA_URL, TEST_MD5SUM, "test"),
        )
        for url, md5sum, dataset in archives:
            download_dataset(
                url=url, md5sum=md5sum, target_dir=target_dir, dataset=dataset)

        print("VoxCeleb2 download is done!")

    if args.generate:
        create_manifest(target_dir, manifest_path_prefix=args.manifest_prefix)


if __name__ == '__main__':
    main()

examples/voxceleb/README.md

Lines changed: 48 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -6,3 +6,51 @@ sv0 - speaker verfication with softmax backend etc, all python code
66

77
sv1 - depends on kaldi, speaker verification with plda/sc backend,
88
more info refer to the sv1/readme.txt
9+
10+
11+
## VoxCeleb2 preparation
12+
13+
VoxCeleb2 audio files are released in m4a format. All the VoxCeleb2 m4a audio files must be converted to wav files before feeding them into PaddleSpeech.
14+
Please follow these steps to prepare the dataset correctly:
15+
16+
1. Download Voxceleb2.
17+
You can find download instructions here: http://www.robots.ox.ac.uk/~vgg/data/voxceleb/
18+
19+
2. Convert .m4a to wav
20+
VoxCeleb2 stores files in the m4a audio format. To use them in PaddleSpeech, you have to convert all the m4a audio files into wav files.
21+
22+
``` shell
23+
ffmpeg -y -i %s -ac 1 -vn -acodec pcm_s16le -ar 16000 %s
24+
```
25+
26+
You can do the conversion using ffmpeg (https://gist.github.com/seungwonpark/4f273739beef2691cd53b5c39629d830). This operation might take several hours and only needs to be done once.
27+
28+
3. Put all the wav files in a folder called `wav`. You should have something like `voxceleb2/wav/id*/*.wav` (e.g., `voxceleb2/wav/id00012/21Uxsk56VDQ/00001.wav`)
29+
30+
31+
## voxceleb dataset summary
32+
33+
34+
|dataset | vox1 - dev | vox1 - test |vox2 - dev| vox2 - test|
35+
|---------|-----------|------------|-----------|----------|
36+
|spks | 1211 |40 | 5994 | 118|
37+
|utts | 148642 | 4874 | 1092009 |36273|
38+
| time(h) | 340.4 | 11.2 | 2360.2 |79.9 |
39+
40+
41+
## trial summary
42+
43+
| trial | filename | nums | positive | negative |
44+
|--------|-----------|--------|-------|------|
45+
| VoxCeleb1 | veri_test.txt | 37720 | 18860 | 18860 |
46+
| VoxCeleb1(cleaned) | veri_test2.txt | 37611 | 18802 | 18809 |
47+
| VoxCeleb1-H | list_test_hard.txt | 552536 | 276270 | 276266 |
48+
|VoxCeleb1-H(cleaned) |list_test_hard2.txt | 550894 | 275488 | 275406 |
49+
|VoxCeleb1-E | list_test_all.txt | 581480 | 290743 | 290737 |
50+
|VoxCeleb1-E(cleaned) | list_test_all2.txt |579818 |289921 |289897 |
51+
52+
53+
54+
55+
56+
Lines changed: 52 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,52 @@
1+
###########################################
2+
# Data #
3+
###########################################
4+
# we should explicitly specify the wav path of vox2 audio data converted from m4a
5+
vox2_base_path:
6+
augment: True
7+
batch_size: 16
8+
num_workers: 2
9+
num_speakers: 7205 # 1211 vox1, 5994 vox2, 7205 vox1+2, test speakers: 41
10+
shuffle: True
11+
random_chunk: True
12+
13+
###########################################################
14+
# FEATURE EXTRACTION SETTING #
15+
###########################################################
16+
# currently, we only support fbank
17+
sr: 16000 # sample rate
18+
n_mels: 80
19+
window_size: 400 #25ms, sample rate 16000, 25 * 16000 / 1000 = 400
20+
hop_size: 160 #10ms, sample rate 16000, 10 * 16000 / 1000 = 160
21+
22+
###########################################################
23+
# MODEL SETTING #
24+
###########################################################
25+
# currently, we only support ecapa-tdnn in the ecapa_tdnn.yaml
26+
# if we want to use another model, please choose another configuration yaml file
27+
model:
28+
input_size: 80
29+
# "channels": [512, 512, 512, 512, 1536],
30+
channels: [1024, 1024, 1024, 1024, 3072]
31+
kernel_sizes: [5, 3, 3, 3, 1]
32+
dilations: [1, 2, 3, 4, 1]
33+
attention_channels: 128
34+
lin_neurons: 192
35+
36+
###########################################
37+
# Training #
38+
###########################################
39+
seed: 1986 # following the speechbrain configuration
40+
epochs: 10
41+
save_interval: 1
42+
log_interval: 1
43+
learning_rate: 1e-8
44+
45+
46+
###########################################
47+
# Testing #
48+
###########################################
49+
global_embedding_norm: True
50+
embedding_mean_norm: True
51+
embedding_std_norm: False
52+

0 commit comments

Comments
 (0)