|
6 | 6 | # Use this to control how many gpus you use. It's 1-gpu training if you specify
7 | 7 | # just 1 gpu, otherwise it's multi-gpu training based on DDP in PyTorch.
8 | 8 | export CUDA_VISIBLE_DEVICES="0,1,2,3,4,5,6,7" |
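For example, a single-GPU run just lists one device id here (device 0 below is only an illustration):

    export CUDA_VISIBLE_DEVICES="0"   # one visible device -> 1-gpu training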
9 | | -# The NCCL_SOCKET_IFNAME variable specifies which IP interface to use for nccl |
10 | | -# communication. More details can be found in |
11 | | -# https://docs.nvidia.com/deeplearning/nccl/user-guide/docs/env.html |
12 | | -# export NCCL_SOCKET_IFNAME=ens4f1 |
13 | | -export NCCL_DEBUG=INFO |
14 | 9 | stage=0 # start from 0 if you need to start from data preparation |
15 | 10 | stop_stage=6 |
16 | | -# The num of nodes or machines used for multi-machine training |
17 | | -# Default 1 for single machine/node |
18 | | -# NFS will be needed if you want run multi-machine training |
| 11 | + |
| 12 | +# You should change the following two parameters for multi-machine training,
| 13 | +# see https://pytorch.org/docs/stable/elastic/run.html |
| 14 | +HOST_NODE_ADDR="localhost:0" |
19 | 15 | num_nodes=1 |
20 | | -# The rank of each node or machine, range from 0 to num_nodes -1 |
21 | | -# The first node/machine sets node_rank 0, the second one sets node_rank 1 |
22 | | -# the third one set node_rank 2, and so on. Default 0 |
23 | | -node_rank=0 |
24 | 16 |
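A minimal sketch of what the two new parameters might look like for two-machine training, using a placeholder address for the first node (the workspace still has to be shared across machines, e.g. via NFS):

    num_nodes=2                        # total number of machines
    HOST_NODE_ADDR="10.0.0.1:29400"    # placeholder <first-node-address>:<free-port> for the torchrun rendezvous

Set both values identically on every machine, then start the training stage on each of them; torchrun's rendezvous assigns the node ranks, so no node_rank has to be set by hand anymore.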
|
25 | 17 | # modify this to your AISHELL-2 data path |
26 | 18 | # Note: the evaluation data (dev & test) is available at AISHELL. |
@@ -106,41 +98,25 @@ if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then |
106 | 98 | num_gpus=$(echo $CUDA_VISIBLE_DEVICES | awk -F "," '{print NF}') |
107 | 99 | # Use "nccl" if it works, otherwise use "gloo" |
108 | 100 | dist_backend="gloo" |
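Two quick sanity checks for this block (illustrative commands, not part of the recipe): the awk call simply counts comma-separated device ids, and torch.distributed can report whether the installed PyTorch was built with NCCL before switching dist_backend away from gloo.

    echo "0,1,2,3,4,5,6,7" | awk -F "," '{print NF}'                               # prints 8 -> eight worker processes
    python -c "import torch.distributed as dist; print(dist.is_nccl_available())"  # True -> dist_backend="nccl" should work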
109 | | - # The total number of processes/gpus, so that the master knows |
110 | | - # how many workers to wait for. |
111 | | - # More details about ddp can be found in |
112 | | - # https://pytorch.org/tutorials/intermediate/dist_tuto.html |
113 | | - world_size=`expr $num_gpus \* $num_nodes` |
114 | | - echo "total gpus is: $world_size" |
115 | 101 | cmvn_opts= |
116 | 102 | $cmvn && cp data/${train_set}/global_cmvn $dir |
117 | 103 | $cmvn && cmvn_opts="--cmvn ${dir}/global_cmvn" |
118 | 104 | # train.py will write $train_config to $dir/train.yaml with model input |
119 | 105 | # and output dimension, train.yaml will be used for inference or model |
120 | 106 | # export later |
121 | | - for ((i = 0; i < $num_gpus; ++i)); do |
122 | | - { |
123 | | - gpu_id=$(echo $CUDA_VISIBLE_DEVICES | cut -d',' -f$[$i+1]) |
124 | | - # Rank of each gpu/process used for knowing whether it is |
125 | | - # the master of a worker. |
126 | | - rank=`expr $node_rank \* $num_gpus + $i` |
127 | | - python wenet/bin/train.py --gpu $gpu_id \ |
128 | | - --config $train_config \ |
129 | | - --data_type raw \ |
130 | | - --symbol_table $dict \ |
131 | | - --train_data data/$train_set/data.list \ |
132 | | - --cv_data data/dev/data.list \ |
133 | | - ${checkpoint:+--checkpoint $checkpoint} \ |
134 | | - --model_dir $dir \ |
135 | | - --ddp.init_method $init_method \ |
136 | | - --ddp.world_size $world_size \ |
137 | | - --ddp.rank $rank \ |
138 | | - --ddp.dist_backend $dist_backend \ |
139 | | - --num_workers 2 \ |
140 | | - $cmvn_opts |
141 | | - } & |
142 | | - done |
143 | | - wait |
| 107 | + torchrun --nnodes=$num_nodes --nproc_per_node=$num_gpus --rdzv_endpoint=$HOST_NODE_ADDR \
| 108 | + wenet/bin/train.py \
| 109 | + --config $train_config \ |
| 110 | + --data_type raw \ |
| 111 | + --symbol_table $dict \ |
| 112 | + --train_data data/$train_set/data.list \ |
| 113 | + --cv_data data/dev/data.list \ |
| 114 | + ${checkpoint:+--checkpoint $checkpoint} \ |
| 115 | + --model_dir $dir \ |
| 116 | + --ddp.init_method $init_method \ |
| 117 | + --ddp.dist_backend $dist_backend \ |
| 118 | + --num_workers 2 \ |
| 119 | + $cmvn_opts |
144 | 120 | fi |
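The removed per-GPU loop computed world_size = num_gpus * num_nodes and rank = node_rank * num_gpus + i by hand; torchrun now derives the same quantities from --nnodes/--nproc_per_node and its rendezvous, and exposes them to each worker it spawns as environment variables (WORLD_SIZE, RANK, LOCAL_RANK); how wenet/bin/train.py consumes them is outside this diff. A worked example of that bookkeeping, with example numbers only:

    num_nodes=2; num_gpus=8; node_rank=1; i=3    # 2 machines, 8 gpus each, 4th gpu on the 2nd machine
    echo "world_size=$((num_gpus * num_nodes)) rank=$((node_rank * num_gpus + i)) local_rank=$i"
    # -> world_size=16 rank=11 local_rank=3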
145 | 121 |
|
146 | 122 | if [ ${stage} -le 5 ] && [ ${stop_stage} -ge 5 ]; then |
|