File tree Expand file tree Collapse file tree 9 files changed +36
-0
lines changed Expand file tree Collapse file tree 9 files changed +36
-0
lines changed Original file line number Diff line number Diff line change @@ -26,6 +26,10 @@ if [ ${seed} != 0 ]; then
26
26
export FLAGS_cudnn_deterministic=True
27
27
fi
28
28
29
+ # The default memory allocator strategy may cause GPU training to hang,
30
+ # because no OOM error is raised when memory is exhausted.
31
+ export FLAGS_allocator_strategy=naive_best_fit
32
+
29
33
if [ ${ngpu} == 0 ]; then
30
34
python3 -u ${BIN_DIR} /train.py \
31
35
--ngpu ${ngpu} \
Original file line number Diff line number Diff line change @@ -35,6 +35,10 @@ echo ${ips_config}
35
35
36
36
mkdir -p exp
37
37
38
+ # The default memory allocator strategy may cause GPU training to hang,
39
+ # because no OOM error is raised when memory is exhausted.
40
+ export FLAGS_allocator_strategy=naive_best_fit
41
+
38
42
if [ ${ngpu} == 0 ]; then
39
43
python3 -u ${BIN_DIR} /train.py \
40
44
--ngpu ${ngpu} \
Original file line number Diff line number Diff line change @@ -26,6 +26,10 @@ if [ ${seed} != 0 ]; then
26
26
export FLAGS_cudnn_deterministic=True
27
27
fi
28
28
29
+ # The default memory allocator strategy may cause GPU training to hang,
30
+ # because no OOM error is raised when memory is exhausted.
31
+ export FLAGS_allocator_strategy=naive_best_fit
32
+
29
33
if [ ${ngpu} == 0 ]; then
30
34
python3 -u ${BIN_DIR} /train.py \
31
35
--ngpu ${ngpu} \
Original file line number Diff line number Diff line change 29
29
# export FLAGS_cudnn_exhaustive_search=true
30
30
# export FLAGS_conv_workspace_size_limit=4000
31
31
32
+ # The default memory allocator strategy may cause GPU training to hang,
33
+ # because no OOM error is raised when memory is exhausted.
34
+ export FLAGS_allocator_strategy=naive_best_fit
35
+
32
36
if [ ${ngpu} == 0 ]; then
33
37
python3 -u ${BIN_DIR} /train.py \
34
38
--ngpu ${ngpu} \
Original file line number Diff line number Diff line change @@ -26,6 +26,10 @@ if [ ${seed} != 0 ]; then
26
26
export FLAGS_cudnn_deterministic=True
27
27
fi
28
28
29
+ # The default memory allocator strategy may cause GPU training to hang,
30
+ # because no OOM error is raised when memory is exhausted.
31
+ export FLAGS_allocator_strategy=naive_best_fit
32
+
29
33
if [ ${ngpu} == 0 ]; then
30
34
python3 -u ${BIN_DIR} /train.py \
31
35
--ngpu ${ngpu} \
Original file line number Diff line number Diff line change @@ -19,6 +19,10 @@ if [ ${seed} != 0 ]; then
19
19
export FLAGS_cudnn_deterministic=True
20
20
fi
21
21
22
+ # The default memory allocator strategy may cause GPU training to hang,
23
+ # because no OOM error is raised when memory is exhausted.
24
+ export FLAGS_allocator_strategy=naive_best_fit
25
+
22
26
if [ ${ngpu} == 0 ]; then
23
27
python3 -u ${BIN_DIR} /train.py \
24
28
--ngpu ${ngpu} \
Original file line number Diff line number Diff line change 32
32
33
33
mkdir -p exp
34
34
35
+ # The default memory allocator strategy may cause GPU training to hang,
36
+ # because no OOM error is raised when memory is exhausted.
37
+ export FLAGS_allocator_strategy=naive_best_fit
38
+
35
39
if [ ${ngpu} == 0 ]; then
36
40
python3 -u ${BIN_DIR} /train.py \
37
41
--ngpu ${ngpu} \
Original file line number Diff line number Diff line change 34
34
35
35
mkdir -p exp
36
36
37
+ # The default memory allocator strategy may cause GPU training to hang,
38
+ # because no OOM error is raised when memory is exhausted.
39
+ export FLAGS_allocator_strategy=naive_best_fit
40
+
37
41
if [ ${ngpu} == 0 ]; then
38
42
python3 -u ${BIN_DIR} /train.py \
39
43
--ngpu ${ngpu} \
Original file line number Diff line number Diff line change @@ -35,6 +35,10 @@ echo ${ips_config}
35
35
36
36
mkdir -p exp
37
37
38
+ # The default memory allocator strategy may cause GPU training to hang,
39
+ # because no OOM error is raised when memory is exhausted.
40
+ export FLAGS_allocator_strategy=naive_best_fit
41
+
38
42
if [ ${ngpu} == 0 ]; then
39
43
python3 -u ${BIN_DIR} /train.py \
40
44
--ngpu ${ngpu} \
You can’t perform that action at this time.
0 commit comments