
Commit e2b588d

[examples] run all examples with torchrun (#2021)
1 parent cafe1ce commit e2b588d

21 files changed: +146 -485 lines changed

examples/aishell/NST/run_nst.sh

Lines changed: 9 additions & 31 deletions

@@ -23,14 +23,16 @@
 # Use this to control how many gpu you use, It's 1-gpu training if you specify
 # just 1gpu, otherwise it's is multiple gpu training based on DDP in pytorch
 export CUDA_VISIBLE_DEVICES="0,1,2,3,4,5,6,7"
-# The NCCL_SOCKET_IFNAME variable specifies which IP interface to use for nccl
-# communication. More details can be found in
-# https://docs.nvidia.com/deeplearning/nccl/user-guide/docs/env.html
-# export NCCL_SOCKET_IFNAME=ens4f1
-export NCCL_DEBUG=INFO
+
 stage=1 # start from 0 if you need to start from data preparation
 stop_stage=8
 
+# You should change the following two parameters for multiple machine training,
+# see https://pytorch.org/docs/stable/elastic/run.html
+HOST_NODE_ADDR="localhost:0"
+num_nodes=1
+
+
 # here are extra parameters used in NST
 cer_out_dir=""
 dir=""
@@ -61,15 +63,6 @@ cer_hypo_dir="wenet_cer_hypo"
 cer_label_dir="wenet_cer_label"
 pseudo_data_ratio=0.75
 
-# The num of machines(nodes) for multi-machine training, 1 is for one machine.
-# NFS is required if num_nodes > 1.
-
-num_nodes=1
-
-# The rank of each node or machine, which ranges from 0 to `num_nodes - 1`.
-# You should set the node_rank=0 on the first machine, set the node_rank=1
-# on the second machine, and so on.
-node_rank=0
 dict=data/dict/lang_char.txt
 
 # data_type can be `raw` or `shard`. Typically, raw is used for small dataset,
@@ -119,9 +112,6 @@ if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
 num_gpus=$(echo $CUDA_VISIBLE_DEVICES | awk -F "," '{print NF}')
 # Use "nccl" if it works, otherwise use "gloo"
 dist_backend="gloo"
-world_size=`expr $num_gpus \* $num_nodes`
-echo "total gpus is: $world_size"
-
 # the global_cmvn file need to be calculated by combining both supervised/unsupervised datasets,
 # and it should be positioned at data/${train_set}/global_cmvn .
 cmvn_opts=
@@ -132,15 +122,8 @@ if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
 # and output dimension, and $dir/train.yaml will be used for inference
 # and export.
 echo "checkpoint is " ${checkpoint}
-for ((i = 0; i < $num_gpus; ++i)); do
-{
-gpu_id=$(echo $CUDA_VISIBLE_DEVICES | cut -d',' -f$[$i+1])
-echo "gpu number $i "
-# Rank of each gpu/process used for knowing whether it is
-# the master of a worker.
-
-rank=`expr $node_rank \* $num_gpus + $i`
-python wenet/bin/train.py --gpu $gpu_id \
+torchrun --nnodes=$num_nodes --nproc_per_node=$num_gpus --rdzv_endpoint=$HOST_NODE_ADDR \
+python wenet/bin/train.py \
 --config $train_config \
 --data_type $data_type \
 --symbol_table $dict \
@@ -149,15 +132,10 @@ if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
 ${checkpoint:+--checkpoint $checkpoint} \
 --model_dir $dir \
 --ddp.init_method $init_method \
---ddp.world_size $world_size \
---ddp.rank $rank \
 --ddp.dist_backend $dist_backend \
 --num_workers 1 \
 $cmvn_opts \
 --pin_memory
-} &
-done
-wait
 fi
 
 # In stage 2, we get the averaged final checkpoint and calculate the test and dev accuracy

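A note for readers, not part of the commit: torchrun starts one worker process per GPU and exports RANK, LOCAL_RANK and WORLD_SIZE to each of them, which is why the explicit --gpu, --ddp.rank and --ddp.world_size arguments disappear from the hunks above. A minimal self-contained sketch that only echoes those variables (no WeNet code involved; --standalone lets torchrun pick a free local rendezvous port, and --no_python lets it run plain bash):

torchrun --standalone --nproc_per_node=2 --no_python \
  bash -c 'echo "rank=$RANK local_rank=$LOCAL_RANK world_size=$WORLD_SIZE"'
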
examples/aishell/paraformer/run.sh

Lines changed: 5 additions & 26 deletions

@@ -6,22 +6,14 @@
 # Use this to control how many gpu you use, It's 1-gpu training if you specify
 # just 1gpu, otherwise it's is multiple gpu training based on DDP in pytorch
 export CUDA_VISIBLE_DEVICES="0,1,2,3"
-# The NCCL_SOCKET_IFNAME variable specifies which IP interface to use for nccl
-# communication. More details can be found in
-# https://docs.nvidia.com/deeplearning/nccl/user-guide/docs/env.html
-# export NCCL_SOCKET_IFNAME=ens4f1
-export NCCL_DEBUG=INFO
 stage=0 # start from 0 if you need to start from data preparation
 stop_stage=5
 
-# The num of machines(nodes) for multi-machine training, 1 is for one machine.
-# NFS is required if num_nodes > 1.
+# You should change the following two parameters for multiple machine training,
+# see https://pytorch.org/docs/stable/elastic/run.html
+HOST_NODE_ADDR="localhost:0"
 num_nodes=1
 
-# The rank of each node or machine, which ranges from 0 to `num_nodes - 1`.
-# You should set the node_rank=0 on the first machine, set the node_rank=1
-# on the second machine, and so on.
-node_rank=0
 # The aishell dataset location, please change this to your own path
 # make sure of using absolute path. DO-NOT-USE relative path!
 data=/export/data/asr-data/OpenSLR/33/
@@ -120,39 +112,26 @@ if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then
 num_gpus=$(echo $CUDA_VISIBLE_DEVICES | awk -F "," '{print NF}')
 # Use "nccl" if it works, otherwise use "gloo"
 dist_backend="gloo"
-world_size=`expr $num_gpus \* $num_nodes`
-echo "total gpus is: $world_size"
 cmvn_opts=
 $cmvn && cp data/${train_set}/global_cmvn $dir
 $cmvn && cmvn_opts="--cmvn ${dir}/global_cmvn"
 
 # train.py rewrite $train_config to $dir/train.yaml with model input
 # and output dimension, and $dir/train.yaml will be used for inference
 # and export.
-for ((i = 0; i < $num_gpus; ++i)); do
-{
-gpu_id=$(echo $CUDA_VISIBLE_DEVICES | cut -d',' -f$[$i+1])
-# Rank of each gpu/process used for knowing whether it is
-# the master of a worker.
-rank=`expr $node_rank \* $num_gpus + $i`
-python3 wenet/bin/train.py --gpu $gpu_id \
+torchrun --nnodes=$num_nodes --nproc_per_node=$num_gpus --rdzv_endpoint=$HOST_NODE_ADDR \
+python3 wenet/bin/train.py \
 --config $train_config \
 --data_type $data_type \
 --symbol_table $dict \
 --train_data data/$train_set/data.list \
 --cv_data data/test/data.list \
 ${checkpoint:+--checkpoint $checkpoint} \
 --model_dir $dir \
---ddp.init_method $init_method \
---ddp.world_size $world_size \
---ddp.rank $rank \
 --ddp.dist_backend $dist_backend \
 --num_workers 8 \
 $cmvn_opts \
 --pin_memory
-} &
-done
-wait
 fi
 
 if [ ${stage} -le 5 ] && [ ${stop_stage} -ge 5 ]; then

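For context on the deleted bookkeeping (again an illustration, not the commit's code): the old loop launched one python process per GPU and computed the global rank as node_rank * num_gpus + i. The pure-bash snippet below prints that arithmetic for a hypothetical 2-node, 4-GPU-per-node setup; each printed value is exactly the RANK that torchrun now assigns to the corresponding worker.

num_gpus=4
for node_rank in 0 1; do
  for ((i = 0; i < num_gpus; ++i)); do
    echo "node ${node_rank}, local gpu ${i} -> global rank $((node_rank * num_gpus + i))"
  done
done
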
examples/aishell/rnnt/run.sh

Lines changed: 5 additions & 20 deletions

@@ -12,14 +12,11 @@ export CUDA_VISIBLE_DEVICES="0,1,2,3,4,5,6,7"
 stage=0 # start from 0 if you need to start from data preparation
 stop_stage=5
 
-# The num of machines(nodes) for multi-machine training, 1 is for one machine.
-# NFS is required if num_nodes > 1.
+# You should change the following two parameters for multiple machine training,
+# see https://pytorch.org/docs/stable/elastic/run.html
+HOST_NODE_ADDR="localhost:0"
 num_nodes=1
 
-# The rank of each node or machine, which ranges from 0 to `num_nodes - 1`.
-# You should set the node_rank=0 on the first machine, set the node_rank=1
-# on the second machine, and so on.
-node_rank=0
 # The aishell dataset location, please change this to your own path
 # make sure of using absolute path. DO-NOT-USE relatvie path!
 data=/export/data/asr-data/OpenSLR/33/
@@ -112,22 +109,15 @@ if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then
 num_gpus=$(echo $CUDA_VISIBLE_DEVICES | awk -F "," '{print NF}')
 # Use "nccl" if it works, otherwise use "gloo"
 dist_backend="gloo"
-world_size=`expr $num_gpus \* $num_nodes`
-echo "total gpus is: $world_size"
 cmvn_opts=
 $cmvn && cp data/${train_set}/global_cmvn $dir
 $cmvn && cmvn_opts="--cmvn ${dir}/global_cmvn"
 
 # train.py rewrite $train_config to $dir/train.yaml with model input
 # and output dimension, and $dir/train.yaml will be used for inference
 # and export.
-for ((i = 0; i < $num_gpus; ++i)); do
-{
-gpu_id=$(echo $CUDA_VISIBLE_DEVICES | cut -d',' -f$[$i+1])
-# Rank of each gpu/process used for knowing whether it is
-# the master of a worker.
-rank=`expr $node_rank \* $num_gpus + $i`
-python wenet/bin/train.py --gpu $gpu_id \
+torchrun --nnodes=$num_nodes --nproc_per_node=$num_gpus --rdzv_endpoint=$HOST_NODE_ADDR \
+python wenet/bin/train.py \
 --config $train_config \
 --data_type $data_type \
 --symbol_table $dict \
@@ -136,15 +126,10 @@ if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then
 ${checkpoint:+--checkpoint $checkpoint} \
 --model_dir $dir \
 --ddp.init_method $init_method \
---ddp.world_size $world_size \
---ddp.rank $rank \
 --ddp.dist_backend $dist_backend \
 --num_workers 1 \
 $cmvn_opts \
 --pin_memory
-} &
-done
-wait
 fi
 
 if [ ${stage} -le 5 ] && [ ${stop_stage} -ge 5 ]; then

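When the two new knobs are used for real multi-machine training, the same launch command is run on every machine, with HOST_NODE_ADDR pointing at one rendezvous host that all machines can reach. A hedged sketch along the lines of https://pytorch.org/docs/stable/elastic/run.html; node0.example.com:29400 is a placeholder address, the c10d rendezvous backend and --rdzv_id come from that page rather than from this commit, and the payload is just hostname so the example stays self-contained:

HOST_NODE_ADDR="node0.example.com:29400"   # placeholder rendezvous host:port
num_nodes=2
num_gpus=4
# run this identical command on every machine
torchrun --nnodes=$num_nodes --nproc_per_node=$num_gpus \
  --rdzv_id=wenet_demo --rdzv_backend=c10d --rdzv_endpoint=$HOST_NODE_ADDR \
  --no_python hostname
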
examples/aishell2/rnnt/run.sh

Lines changed: 17 additions & 38 deletions

@@ -11,14 +11,10 @@ export CUDA_VISIBLE_DEVICES="0,1,2,3"
 
 stage=0 # start from 0 if you need to start from data preparation
 stop_stage=5
-# The num of nodes or machines used for multi-machine training
-# Default 1 for single machine/node
-# NFS will be needed if you want run multi-machine training
+# You should change the following two parameters for multiple machine training,
+# see https://pytorch.org/docs/stable/elastic/run.html
+HOST_NODE_ADDR="localhost:0"
 num_nodes=1
-# The rank of each node or machine, range from 0 to num_nodes -1
-# The first node/machine sets node_rank 0, the second one sets node_rank 1
-# the third one set node_rank 2, and so on. Default 0
-node_rank=0
 
 # modify this to your AISHELL-2 data path
 # Note: the evaluation data (dev & test) is available at AISHELL.
@@ -110,43 +106,26 @@ if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then
 num_gpus=$(echo $CUDA_VISIBLE_DEVICES | awk -F "," '{print NF}')
 # Use "nccl" if it works, otherwise use "gloo"
 dist_backend="gloo"
-#dist_backend="nccl"
-# The total number of processes/gpus, so that the master knows
-# how many workers to wait for.
-# More details about ddp can be found in
-# https://pytorch.org/tutorials/intermediate/dist_tuto.html
-world_size=`expr $num_gpus \* $num_nodes`
-echo "total gpus is: $world_size"
 cmvn_opts=
 $cmvn && cp data/${train_set}/global_cmvn $dir
 $cmvn && cmvn_opts="--cmvn ${dir}/global_cmvn"
 # train.py will write $train_config to $dir/train.yaml with model input
 # and output dimension, train.yaml will be used for inference or model
 # export later
-for ((i = 0; i < $num_gpus; ++i)); do
-{
-gpu_id=$(echo $CUDA_VISIBLE_DEVICES | cut -d',' -f$[$i+1])
-# Rank of each gpu/process used for knowing whether it is
-# the master of a worker.
-rank=`expr $node_rank \* $num_gpus + $i`
-python wenet/bin/train.py --gpu $gpu_id \
---config $train_config \
---data_type raw \
---symbol_table $dict \
---train_data data/$train_set/data.list \
---cv_data data/dev/data.list \
-${checkpoint:+--checkpoint $checkpoint} \
---model_dir $dir \
---ddp.init_method $init_method \
---ddp.world_size $world_size \
---ddp.rank $rank \
---ddp.dist_backend $dist_backend \
---num_workers 4 \
-$cmvn_opts \
-2>&1 | tee -a $dir/train.log || exit 1;
-} &
-done
-wait
+torchrun --nnodes=$num_nodes --nproc_per_node=$num_gpus --rdzv_endpoint=$HOST_NODE_ADDR \
+python wenet/bin/train.py \
+--config $train_config \
+--data_type raw \
+--symbol_table $dict \
+--train_data data/$train_set/data.list \
+--cv_data data/dev/data.list \
+${checkpoint:+--checkpoint $checkpoint} \
+--model_dir $dir \
+--ddp.init_method $init_method \
+--ddp.dist_backend $dist_backend \
+--num_workers 4 \
+$cmvn_opts \
+2>&1 | tee -a $dir/train.log || exit 1;
 fi
 
 if [ ${stage} -le 5 ] && [ ${stop_stage} -ge 5 ]; then

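One practical consequence visible in this file: the trailing 2>&1 | tee -a $dir/train.log now captures the output of every rank interleaved in a single file, because all workers run under one torchrun launcher. If per-rank files are preferred, torchrun can redirect worker output itself; the sketch below assumes the installed PyTorch provides the --log_dir/--redirects/--tee options of torch.distributed.run, and uses an echo payload as a stand-in for the real training command:

torchrun --standalone --nproc_per_node=2 \
  --log_dir ./torchrun_logs --redirects 3 --tee 3 \
  --no_python bash -c 'echo "hello from rank $RANK"'
# each worker's stdout/stderr ends up in a per-rank subdirectory under ./torchrun_logs
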
examples/aishell2/s0/run.sh

Lines changed: 17 additions & 41 deletions

@@ -6,21 +6,13 @@
 # Use this to control how many gpu you use, It's 1-gpu training if you specify
 # just 1gpu, otherwise it's is multiple gpu training based on DDP in pytorch
 export CUDA_VISIBLE_DEVICES="0,1,2,3,4,5,6,7"
-# The NCCL_SOCKET_IFNAME variable specifies which IP interface to use for nccl
-# communication. More details can be found in
-# https://docs.nvidia.com/deeplearning/nccl/user-guide/docs/env.html
-# export NCCL_SOCKET_IFNAME=ens4f1
-export NCCL_DEBUG=INFO
 stage=0 # start from 0 if you need to start from data preparation
 stop_stage=6
-# The num of nodes or machines used for multi-machine training
-# Default 1 for single machine/node
-# NFS will be needed if you want run multi-machine training
+
+# You should change the following two parameters for multiple machine training,
+# see https://pytorch.org/docs/stable/elastic/run.html
+HOST_NODE_ADDR="localhost:0"
 num_nodes=1
-# The rank of each node or machine, range from 0 to num_nodes -1
-# The first node/machine sets node_rank 0, the second one sets node_rank 1
-# the third one set node_rank 2, and so on. Default 0
-node_rank=0
 
 # modify this to your AISHELL-2 data path
 # Note: the evaluation data (dev & test) is available at AISHELL.
@@ -106,41 +98,25 @@ if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then
 num_gpus=$(echo $CUDA_VISIBLE_DEVICES | awk -F "," '{print NF}')
 # Use "nccl" if it works, otherwise use "gloo"
 dist_backend="gloo"
-# The total number of processes/gpus, so that the master knows
-# how many workers to wait for.
-# More details about ddp can be found in
-# https://pytorch.org/tutorials/intermediate/dist_tuto.html
-world_size=`expr $num_gpus \* $num_nodes`
-echo "total gpus is: $world_size"
 cmvn_opts=
 $cmvn && cp data/${train_set}/global_cmvn $dir
 $cmvn && cmvn_opts="--cmvn ${dir}/global_cmvn"
 # train.py will write $train_config to $dir/train.yaml with model input
 # and output dimension, train.yaml will be used for inference or model
 # export later
-for ((i = 0; i < $num_gpus; ++i)); do
-{
-gpu_id=$(echo $CUDA_VISIBLE_DEVICES | cut -d',' -f$[$i+1])
-# Rank of each gpu/process used for knowing whether it is
-# the master of a worker.
-rank=`expr $node_rank \* $num_gpus + $i`
-python wenet/bin/train.py --gpu $gpu_id \
---config $train_config \
---data_type raw \
---symbol_table $dict \
---train_data data/$train_set/data.list \
---cv_data data/dev/data.list \
-${checkpoint:+--checkpoint $checkpoint} \
---model_dir $dir \
---ddp.init_method $init_method \
---ddp.world_size $world_size \
---ddp.rank $rank \
---ddp.dist_backend $dist_backend \
---num_workers 2 \
-$cmvn_opts
-} &
-done
-wait
+torchrun --nnodes=$num_nodes --nproc_per_node=$num_gpus --rdzv_endpoint=$HOST_NODE_ADDR \
+python wenet/bin/train.py \
+--config $train_config \
+--data_type raw \
+--symbol_table $dict \
+--train_data data/$train_set/data.list \
+--cv_data data/dev/data.list \
+${checkpoint:+--checkpoint $checkpoint} \
+--model_dir $dir \
+--ddp.init_method $init_method \
+--ddp.dist_backend $dist_backend \
+--num_workers 2 \
+$cmvn_opts
 fi
 
 if [ ${stage} -le 5 ] && [ ${stop_stage} -ge 5 ]; then

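Several of the scripts above also drop the NCCL_DEBUG / NCCL_SOCKET_IFNAME exports in their first hunk. Nothing in the torchrun migration appears to forbid them; they can still be exported ad hoc when debugging multi-GPU communication, since environment variables set in the shell propagate to every torchrun worker. A small hedged sketch (ens4f1 is just the interface name the deleted comment used as an example):

export NCCL_DEBUG=INFO               # verbose NCCL logging for this shell only
# export NCCL_SOCKET_IFNAME=ens4f1   # pin NCCL traffic to a specific interface
torchrun --standalone --nproc_per_node=2 --no_python \
  bash -c 'echo "rank $RANK sees NCCL_DEBUG=$NCCL_DEBUG"'
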