Skip to content

Commit a657cc3

Browse files
authored
Merge pull request #2478 from Zth9730/allocator_strategy
[ASR] Change memory allocator strategy to fix gpu training hang
2 parents 764fa0a + 404708c commit a657cc3

File tree

9 files changed

+36
-0
lines changed

9 files changed

+36
-0
lines changed

examples/aishell/asr0/local/train.sh

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -26,6 +26,10 @@ if [ ${seed} != 0 ]; then
2626
export FLAGS_cudnn_deterministic=True
2727
fi
2828

29+
# default memory allocator strategy may cause gpu training to hang
30+
# because no OOM error is raised when memory is exhausted
31+
export FLAGS_allocator_strategy=naive_best_fit
32+
2933
if [ ${ngpu} == 0 ]; then
3034
python3 -u ${BIN_DIR}/train.py \
3135
--ngpu ${ngpu} \

examples/aishell/asr1/local/train.sh

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -35,6 +35,10 @@ echo ${ips_config}
3535

3636
mkdir -p exp
3737

38+
# default memory allocator strategy may cause gpu training to hang
39+
# because no OOM error is raised when memory is exhausted
40+
export FLAGS_allocator_strategy=naive_best_fit
41+
3842
if [ ${ngpu} == 0 ]; then
3943
python3 -u ${BIN_DIR}/train.py \
4044
--ngpu ${ngpu} \

examples/librispeech/asr0/local/train.sh

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -26,6 +26,10 @@ if [ ${seed} != 0 ]; then
2626
export FLAGS_cudnn_deterministic=True
2727
fi
2828

29+
# default memory allocator strategy may cause gpu training to hang
30+
# because no OOM error is raised when memory is exhausted
31+
export FLAGS_allocator_strategy=naive_best_fit
32+
2933
if [ ${ngpu} == 0 ]; then
3034
python3 -u ${BIN_DIR}/train.py \
3135
--ngpu ${ngpu} \

examples/librispeech/asr1/local/train.sh

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -29,6 +29,10 @@ fi
2929
# export FLAGS_cudnn_exhaustive_search=true
3030
# export FLAGS_conv_workspace_size_limit=4000
3131

32+
# default memory allocator strategy may cause gpu training to hang
33+
# because no OOM error is raised when memory is exhausted
34+
export FLAGS_allocator_strategy=naive_best_fit
35+
3236
if [ ${ngpu} == 0 ]; then
3337
python3 -u ${BIN_DIR}/train.py \
3438
--ngpu ${ngpu} \

examples/librispeech/asr2/local/train.sh

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -26,6 +26,10 @@ if [ ${seed} != 0 ]; then
2626
export FLAGS_cudnn_deterministic=True
2727
fi
2828

29+
# default memory allocator strategy may cause gpu training to hang
30+
# because no OOM error is raised when memory is exhausted
31+
export FLAGS_allocator_strategy=naive_best_fit
32+
2933
if [ ${ngpu} == 0 ]; then
3034
python3 -u ${BIN_DIR}/train.py \
3135
--ngpu ${ngpu} \

examples/timit/asr1/local/train.sh

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,10 @@ if [ ${seed} != 0 ]; then
1919
export FLAGS_cudnn_deterministic=True
2020
fi
2121

22+
# default memory allocator strategy may cause gpu training to hang
23+
# because no OOM error is raised when memory is exhausted
24+
export FLAGS_allocator_strategy=naive_best_fit
25+
2226
if [ ${ngpu} == 0 ]; then
2327
python3 -u ${BIN_DIR}/train.py \
2428
--ngpu ${ngpu} \

examples/tiny/asr0/local/train.sh

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -32,6 +32,10 @@ fi
3232

3333
mkdir -p exp
3434

35+
# default memory allocator strategy may cause gpu training to hang
36+
# because no OOM error is raised when memory is exhausted
37+
export FLAGS_allocator_strategy=naive_best_fit
38+
3539
if [ ${ngpu} == 0 ]; then
3640
python3 -u ${BIN_DIR}/train.py \
3741
--ngpu ${ngpu} \

examples/tiny/asr1/local/train.sh

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -34,6 +34,10 @@ fi
3434

3535
mkdir -p exp
3636

37+
# default memory allocator strategy may cause gpu training to hang
38+
# because no OOM error is raised when memory is exhausted
39+
export FLAGS_allocator_strategy=naive_best_fit
40+
3741
if [ ${ngpu} == 0 ]; then
3842
python3 -u ${BIN_DIR}/train.py \
3943
--ngpu ${ngpu} \

examples/wenetspeech/asr1/local/train.sh

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -35,6 +35,10 @@ echo ${ips_config}
3535

3636
mkdir -p exp
3737

38+
# default memory allocator strategy may cause gpu training to hang
39+
# because no OOM error is raised when memory is exhausted
40+
export FLAGS_allocator_strategy=naive_best_fit
41+
3842
if [ ${ngpu} == 0 ]; then
3943
python3 -u ${BIN_DIR}/train.py \
4044
--ngpu ${ngpu} \

0 commit comments

Comments
 (0)