Skip to content

Commit 9a50647

Browse files
authored
Add XPU support for FastSpeech2 (#3514)
* Add XPU support for FastSpeech2 * optimize
1 parent 08599b7 commit 9a50647

File tree

6 files changed

+349
-3
lines changed

6 files changed

+349
-3
lines changed
Lines changed: 55 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,55 @@
1+
#!/bin/bash
2+
3+
train_output_path=$1
4+
5+
stage=0
6+
stop_stage=0
7+
8+
# pwgan
9+
if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
10+
python3 ${BIN_DIR}/../inference.py \
11+
--inference_dir=${train_output_path}/inference \
12+
--am=fastspeech2_csmsc \
13+
--voc=pwgan_csmsc \
14+
--text=${BIN_DIR}/../../assets/sentences.txt \
15+
--output_dir=${train_output_path}/pd_infer_out \
16+
--phones_dict=dump/phone_id_map.txt \
17+
--device xpu
18+
fi
19+
20+
# for more GAN Vocoders
21+
# multi band melgan
22+
if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
23+
python3 ${BIN_DIR}/../inference.py \
24+
--inference_dir=${train_output_path}/inference \
25+
--am=fastspeech2_csmsc \
26+
--voc=mb_melgan_csmsc \
27+
--text=${BIN_DIR}/../../assets/sentences.txt \
28+
--output_dir=${train_output_path}/pd_infer_out \
29+
--phones_dict=dump/phone_id_map.txt \
30+
--device xpu
31+
fi
32+
33+
# hifigan
34+
if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
35+
python3 ${BIN_DIR}/../inference.py \
36+
--inference_dir=${train_output_path}/inference \
37+
--am=fastspeech2_csmsc \
38+
--voc=hifigan_csmsc \
39+
--text=${BIN_DIR}/../../assets/sentences.txt \
40+
--output_dir=${train_output_path}/pd_infer_out \
41+
--phones_dict=dump/phone_id_map.txt \
42+
--device xpu
43+
fi
44+
45+
# wavernn
46+
if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then
47+
python3 ${BIN_DIR}/../inference.py \
48+
--inference_dir=${train_output_path}/inference \
49+
--am=fastspeech2_csmsc \
50+
--voc=wavernn_csmsc \
51+
--text=${BIN_DIR}/../../assets/sentences.txt \
52+
--output_dir=${train_output_path}/pd_infer_out \
53+
--phones_dict=dump/phone_id_map.txt \
54+
--device xpu
55+
fi
Lines changed: 119 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,119 @@
1+
#!/bin/bash
2+
3+
config_path=$1
4+
train_output_path=$2
5+
ckpt_name=$3
6+
7+
stage=0
8+
stop_stage=0
9+
10+
# pwgan
11+
if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
12+
FLAGS_allocator_strategy=naive_best_fit \
13+
python3 ${BIN_DIR}/../synthesize_e2e.py \
14+
--am=fastspeech2_csmsc \
15+
--am_config=${config_path} \
16+
--am_ckpt=${train_output_path}/checkpoints/${ckpt_name} \
17+
--am_stat=dump/train/speech_stats.npy \
18+
--voc=pwgan_csmsc \
19+
--voc_config=pwg_baker_ckpt_0.4/pwg_default.yaml \
20+
--voc_ckpt=pwg_baker_ckpt_0.4/pwg_snapshot_iter_400000.pdz \
21+
--voc_stat=pwg_baker_ckpt_0.4/pwg_stats.npy \
22+
--lang=zh \
23+
--text=${BIN_DIR}/../../assets/sentences.txt \
24+
--output_dir=${train_output_path}/test_e2e \
25+
--phones_dict=dump/phone_id_map.txt \
26+
--inference_dir=${train_output_path}/inference \
27+
--ngpu=0 \
28+
--nxpu=1
29+
fi
30+
31+
# for more GAN Vocoders
32+
# multi band melgan
33+
if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
34+
FLAGS_allocator_strategy=naive_best_fit \
35+
python3 ${BIN_DIR}/../synthesize_e2e.py \
36+
--am=fastspeech2_csmsc \
37+
--am_config=${config_path} \
38+
--am_ckpt=${train_output_path}/checkpoints/${ckpt_name} \
39+
--am_stat=dump/train/speech_stats.npy \
40+
--voc=mb_melgan_csmsc \
41+
--voc_config=mb_melgan_csmsc_ckpt_0.1.1/default.yaml \
42+
--voc_ckpt=mb_melgan_csmsc_ckpt_0.1.1/snapshot_iter_1000000.pdz\
43+
--voc_stat=mb_melgan_csmsc_ckpt_0.1.1/feats_stats.npy \
44+
--lang=zh \
45+
--text=${BIN_DIR}/../../assets/sentences.txt \
46+
--output_dir=${train_output_path}/test_e2e \
47+
--phones_dict=dump/phone_id_map.txt \
48+
--inference_dir=${train_output_path}/inference \
49+
--ngpu=0 \
50+
--nxpu=1
51+
fi
52+
53+
# the pretrained models have not been released yet
54+
# style melgan
55+
# style melgan's dygraph-to-static-graph conversion is not ready yet
56+
if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
57+
FLAGS_allocator_strategy=naive_best_fit \
58+
python3 ${BIN_DIR}/../synthesize_e2e.py \
59+
--am=fastspeech2_csmsc \
60+
--am_config=${config_path} \
61+
--am_ckpt=${train_output_path}/checkpoints/${ckpt_name} \
62+
--am_stat=dump/train/speech_stats.npy \
63+
--voc=style_melgan_csmsc \
64+
--voc_config=style_melgan_csmsc_ckpt_0.1.1/default.yaml \
65+
--voc_ckpt=style_melgan_csmsc_ckpt_0.1.1/snapshot_iter_1500000.pdz \
66+
--voc_stat=style_melgan_csmsc_ckpt_0.1.1/feats_stats.npy \
67+
--lang=zh \
68+
--text=${BIN_DIR}/../../assets/sentences.txt \
69+
--output_dir=${train_output_path}/test_e2e \
70+
--phones_dict=dump/phone_id_map.txt \
71+
--ngpu=0 \
72+
--nxpu=1
73+
# --inference_dir=${train_output_path}/inference
74+
fi
75+
76+
# hifigan
77+
if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then
78+
echo "in hifigan syn_e2e"
79+
FLAGS_allocator_strategy=naive_best_fit \
80+
python3 ${BIN_DIR}/../synthesize_e2e.py \
81+
--am=fastspeech2_csmsc \
82+
--am_config=${config_path} \
83+
--am_ckpt=${train_output_path}/checkpoints/${ckpt_name} \
84+
--am_stat=dump/train/speech_stats.npy \
85+
--voc=hifigan_csmsc \
86+
--voc_config=hifigan_csmsc_ckpt_0.1.1/default.yaml \
87+
--voc_ckpt=hifigan_csmsc_ckpt_0.1.1/snapshot_iter_2500000.pdz \
88+
--voc_stat=hifigan_csmsc_ckpt_0.1.1/feats_stats.npy \
89+
--lang=zh \
90+
--text=${BIN_DIR}/../../assets/sentences.txt \
91+
--output_dir=${train_output_path}/test_e2e \
92+
--phones_dict=dump/phone_id_map.txt \
93+
--inference_dir=${train_output_path}/inference \
94+
--ngpu=0 \
95+
--nxpu=1
96+
fi
97+
98+
99+
# wavernn
100+
if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then
101+
echo "in wavernn syn_e2e"
102+
FLAGS_allocator_strategy=naive_best_fit \
103+
python3 ${BIN_DIR}/../synthesize_e2e.py \
104+
--am=fastspeech2_csmsc \
105+
--am_config=${config_path} \
106+
--am_ckpt=${train_output_path}/checkpoints/${ckpt_name} \
107+
--am_stat=dump/train/speech_stats.npy \
108+
--voc=wavernn_csmsc \
109+
--voc_config=wavernn_csmsc_ckpt_0.2.0/default.yaml \
110+
--voc_ckpt=wavernn_csmsc_ckpt_0.2.0/snapshot_iter_400000.pdz \
111+
--voc_stat=wavernn_csmsc_ckpt_0.2.0/feats_stats.npy \
112+
--lang=zh \
113+
--text=${BIN_DIR}/../../assets/sentences.txt \
114+
--output_dir=${train_output_path}/test_e2e \
115+
--phones_dict=dump/phone_id_map.txt \
116+
--inference_dir=${train_output_path}/inference \
117+
--ngpu=0 \
118+
--nxpu=1
119+
fi
Lines changed: 105 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,105 @@
1+
#!/bin/bash
2+
3+
config_path=$1
4+
train_output_path=$2
5+
ckpt_name=$3
6+
stage=0
7+
stop_stage=0
8+
9+
# pwgan
10+
if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
11+
FLAGS_allocator_strategy=naive_best_fit \
12+
python3 ${BIN_DIR}/../synthesize.py \
13+
--am=fastspeech2_csmsc \
14+
--am_config=${config_path} \
15+
--am_ckpt=${train_output_path}/checkpoints/${ckpt_name} \
16+
--am_stat=dump/train/speech_stats.npy \
17+
--voc=pwgan_csmsc \
18+
--voc_config=pwg_baker_ckpt_0.4/pwg_default.yaml \
19+
--voc_ckpt=pwg_baker_ckpt_0.4/pwg_snapshot_iter_400000.pdz \
20+
--voc_stat=pwg_baker_ckpt_0.4/pwg_stats.npy \
21+
--test_metadata=dump/test/norm/metadata.jsonl \
22+
--output_dir=${train_output_path}/test \
23+
--phones_dict=dump/phone_id_map.txt \
24+
--ngpu=0 \
25+
--nxpu=1
26+
fi
27+
28+
# for more GAN Vocoders
29+
# multi band melgan
30+
if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
31+
FLAGS_allocator_strategy=naive_best_fit \
32+
python3 ${BIN_DIR}/../synthesize.py \
33+
--am=fastspeech2_csmsc \
34+
--am_config=${config_path} \
35+
--am_ckpt=${train_output_path}/checkpoints/${ckpt_name} \
36+
--am_stat=dump/train/speech_stats.npy \
37+
--voc=mb_melgan_csmsc \
38+
--voc_config=mb_melgan_csmsc_ckpt_0.1.1/default.yaml \
39+
--voc_ckpt=mb_melgan_csmsc_ckpt_0.1.1/snapshot_iter_1000000.pdz\
40+
--voc_stat=mb_melgan_csmsc_ckpt_0.1.1/feats_stats.npy \
41+
--test_metadata=dump/test/norm/metadata.jsonl \
42+
--output_dir=${train_output_path}/test \
43+
--phones_dict=dump/phone_id_map.txt \
44+
--ngpu=0 \
45+
--nxpu=1
46+
fi
47+
48+
# style melgan
49+
if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
50+
FLAGS_allocator_strategy=naive_best_fit \
51+
python3 ${BIN_DIR}/../synthesize.py \
52+
--am=fastspeech2_csmsc \
53+
--am_config=${config_path} \
54+
--am_ckpt=${train_output_path}/checkpoints/${ckpt_name} \
55+
--am_stat=dump/train/speech_stats.npy \
56+
--voc=style_melgan_csmsc \
57+
--voc_config=style_melgan_csmsc_ckpt_0.1.1/default.yaml \
58+
--voc_ckpt=style_melgan_csmsc_ckpt_0.1.1/snapshot_iter_1500000.pdz \
59+
--voc_stat=style_melgan_csmsc_ckpt_0.1.1/feats_stats.npy \
60+
--test_metadata=dump/test/norm/metadata.jsonl \
61+
--output_dir=${train_output_path}/test \
62+
--phones_dict=dump/phone_id_map.txt \
63+
--ngpu=0 \
64+
--nxpu=1
65+
fi
66+
67+
# hifigan
68+
if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then
69+
echo "in hifigan syn"
70+
FLAGS_allocator_strategy=naive_best_fit \
71+
python3 ${BIN_DIR}/../synthesize.py \
72+
--am=fastspeech2_csmsc \
73+
--am_config=${config_path} \
74+
--am_ckpt=${train_output_path}/checkpoints/${ckpt_name} \
75+
--am_stat=dump/train/speech_stats.npy \
76+
--voc=hifigan_csmsc \
77+
--voc_config=hifigan_csmsc_ckpt_0.1.1/default.yaml \
78+
--voc_ckpt=hifigan_csmsc_ckpt_0.1.1/snapshot_iter_2500000.pdz \
79+
--voc_stat=hifigan_csmsc_ckpt_0.1.1/feats_stats.npy \
80+
--test_metadata=dump/test/norm/metadata.jsonl \
81+
--output_dir=${train_output_path}/test \
82+
--phones_dict=dump/phone_id_map.txt \
83+
--ngpu=0 \
84+
--nxpu=1
85+
fi
86+
87+
# wavernn
88+
if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then
89+
echo "in wavernn syn"
90+
FLAGS_allocator_strategy=naive_best_fit \
91+
python3 ${BIN_DIR}/../synthesize.py \
92+
--am=fastspeech2_csmsc \
93+
--am_config=${config_path} \
94+
--am_ckpt=${train_output_path}/checkpoints/${ckpt_name} \
95+
--am_stat=dump/train/speech_stats.npy \
96+
--voc=wavernn_csmsc \
97+
--voc_config=wavernn_csmsc_ckpt_0.2.0/default.yaml \
98+
--voc_ckpt=wavernn_csmsc_ckpt_0.2.0/snapshot_iter_400000.pdz \
99+
--voc_stat=wavernn_csmsc_ckpt_0.2.0/feats_stats.npy \
100+
--test_metadata=dump/test/norm/metadata.jsonl \
101+
--output_dir=${train_output_path}/test \
102+
--phones_dict=dump/phone_id_map.txt \
103+
--ngpu=0 \
104+
--nxpu=1
105+
fi
Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,13 @@
1+
#!/bin/bash
2+
3+
config_path=$1
4+
train_output_path=$2
5+
6+
python3 ${BIN_DIR}/train.py \
7+
--train-metadata=dump/train/norm/metadata.jsonl \
8+
--dev-metadata=dump/dev/norm/metadata.jsonl \
9+
--config=${config_path} \
10+
--output-dir=${train_output_path} \
11+
--ngpu=0 \
12+
--nxpu=1 \
13+
--phones-dict=dump/phone_id_map.txt

examples/csmsc/tts3/run_xpu.sh

Lines changed: 42 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,42 @@
1+
#!/bin/bash
2+
3+
set -e
4+
source path.sh
5+
6+
xpus=0,1
7+
stage=0
8+
stop_stage=100
9+
10+
conf_path=conf/default.yaml
11+
train_output_path=exp/default
12+
ckpt_name=snapshot_iter_153.pdz
13+
14+
# with the following command, you can choose the stage range you want to run
15+
# such as `./run_xpu.sh --stage 0 --stop-stage 0`
16+
# these options cannot be mixed with positional arguments (`$1`, `$2`, ...)
17+
source ${MAIN_ROOT}/utils/parse_options.sh || exit 1
18+
19+
if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
20+
# prepare data
21+
./local/preprocess.sh ${conf_path} || exit -1
22+
fi
23+
24+
if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
25+
# train the model; all `ckpt` files are placed under the `train_output_path/checkpoints/` dir
26+
FLAGS_selected_xpus=${xpus} ./local/train_xpu.sh ${conf_path} ${train_output_path} || exit -1
27+
fi
28+
29+
if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
30+
# synthesize, vocoder is pwgan by default
31+
FLAGS_selected_xpus=${xpus} ./local/synthesize_xpu.sh ${conf_path} ${train_output_path} ${ckpt_name} || exit -1
32+
fi
33+
34+
if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then
35+
# synthesize_e2e, vocoder is pwgan by default
36+
FLAGS_selected_xpus=${xpus} ./local/synthesize_e2e_xpu.sh ${conf_path} ${train_output_path} ${ckpt_name} || exit -1
37+
fi
38+
39+
if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then
40+
# inference with static model, vocoder is pwgan by default
41+
FLAGS_selected_xpus=${xpus} ./local/inference_xpu.sh ${train_output_path} || exit -1
42+
fi

paddlespeech/t2s/exps/fastspeech2/train.py

Lines changed: 15 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -44,10 +44,17 @@
4444
def train_sp(args, config):
4545
# decides device type and whether to run in parallel
4646
# setup running environment correctly
47-
if (not paddle.is_compiled_with_cuda()) or args.ngpu == 0:
47+
if args.ngpu > 0 and paddle.is_compiled_with_cuda():
48+
paddle.set_device("gpu")
49+
elif args.nxpu > 0 and paddle.is_compiled_with_xpu():
50+
paddle.set_device("xpu")
51+
elif args.ngpu == 0 and args.nxpu == 0:
4852
paddle.set_device("cpu")
4953
else:
50-
paddle.set_device("gpu")
54+
raise ValueError(
55+
"Please make sure that the paddle you installed matches the device type you set, "
56+
"and that ngpu and nxpu cannot be negative at the same time.")
57+
5158
world_size = paddle.distributed.get_world_size()
5259
if world_size > 1:
5360
paddle.distributed.init_parallel_env()
@@ -183,7 +190,12 @@ def main():
183190
parser.add_argument("--dev-metadata", type=str, help="dev data.")
184191
parser.add_argument("--output-dir", type=str, help="output dir.")
185192
parser.add_argument(
186-
"--ngpu", type=int, default=1, help="if ngpu=0, use cpu.")
193+
"--ngpu", type=int, default=1, help="if ngpu=0, use cpu or xpu.")
194+
parser.add_argument(
195+
"--nxpu",
196+
type=int,
197+
default=0,
198+
help="if ngpu=0 and nxpu > 0, use xpu. if ngpu=0 and nxpu=0, use cpu.")
187199
parser.add_argument(
188200
"--phones-dict", type=str, default=None, help="phone vocabulary file.")
189201
parser.add_argument(

0 commit comments

Comments
 (0)