
Commit 748e69d

Polish the benchmark scripts and add test for N4C32. (#7105)
* Polish the benchmark scripts and add a test for N4C32.
* Update the setting of pipeline_parallel_config.
* Add the missing comma and rename all the benchmark filenames.
1 parent 48596fe commit 748e69d

9 files changed: +193, -42 lines changed
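For context, every config script touched by this commit follows the same pattern: it accumulates space-separated key=value pairs in a `param` string and then prefixes the runner invocation with that string via `bash -c`, so each pair becomes a shell variable that run_benchmark.sh later reads with `${var:-default}` expansions. A minimal sketch of that mechanism (illustrative values only, not one of the actual configs):

    # Build the key=value list exactly as the benchmark configs below do.
    param="model_name_or_path=facebook/llama-13b "
    param+="device_num=N1C8 "
    param+="global_batch_size=32 "

    # "VAR=value command" makes each pair visible in the command's environment,
    # so run_benchmark.sh can pick them up via ${VAR:-default}.
    bash -c "${param} bash ./test_tipc/dygraph/hybrid_parallelism/llama/benchmark_common/run_benchmark.sh"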
Lines changed: 2 additions & 3 deletions
@@ -15,16 +15,15 @@
 
 param="model_name_or_path=facebook/llama-13b "
 param+="per_device_train_batch_size=2 "
-param+="tensor_parallel_degree=2 "
 param+="data_parallel_degree=1 "
+param+="tensor_parallel_degree=2 "
 param+="pipeline_parallel_degree=4 "
 param+="virtual_pp_degree=5 "
 param+="sequence_parallel=0 "
 param+="sharding_parallel_degree=1 "
-param+="save_steps=200 "
 param+="sharding=stage1 "
 param+="recompute=1 "
-param+="run_mode=DP1-MP2-PP4-mbs2-acc16-recompute "
+param+="run_mode=MP2-PP4-vpp5-mbs2-acc16-recompute "
 param+="device_num=N1C8 "
 param+="global_batch_size=32 "
 param+="model_item=facebook-llama-13b_seqlen2048_pretrain "
Lines changed: 39 additions & 0 deletions (new file)
@@ -0,0 +1,39 @@
+# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+param="model_name_or_path=facebook/llama-13b "
+param+="per_device_train_batch_size=1 "
+param+="data_parallel_degree=1 "
+param+="tensor_parallel_degree=4 "
+param+="pipeline_parallel_degree=2 "
+param+="virtual_pp_degree=1 "
+param+="sequence_parallel=0 "
+param+="sharding_parallel_degree=1 "
+param+="sharding=stage1 "
+param+="recompute=0 "
+param+="run_mode=MP4-PP2-mbs1-acc32 "
+param+="device_num=N1C8 "
+param+="global_batch_size=32 "
+param+="model_item=facebook-llama-13b_seqlen2048_pretrain "
+param+="max_steps=150 "
+param+="gradient_accumulation_steps=32 "
+param+="pp_recompute_interval=1 "
+param+="tensor_parallel_config=enable_mp_async_allreduce,enable_mp_skip_c_identity,enable_mp_fused_linear_param_grad_add "
+param+="recompute_use_reentrant=true "
+
+cd ./tests
+bash ./test_tipc/dygraph/hybrid_parallelism/llama/benchmark_common/prepare.sh
+
+bash -c "${param} bash ./test_tipc/dygraph/hybrid_parallelism/llama/benchmark_common/run_benchmark.sh"
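As a quick sanity check on the new N1C8 config above (this assumes the usual hybrid-parallel accounting, where card count is the product of the parallel degrees and global batch is per-device batch times gradient accumulation times data/sharding replicas; it is not a PaddleNLP API):

    # dp * tp * pp * sharding should cover the 8 cards of device_num=N1C8,
    # and mbs * acc * dp * sharding should equal global_batch_size=32.
    dp=1; tp=4; pp=2; sd=1; mbs=1; acc=32
    echo "cards=$((dp * tp * pp * sd)) global_batch=$((mbs * acc * dp * sd))"   # cards=8 global_batch=32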
Lines changed: 3 additions & 4 deletions
@@ -15,16 +15,15 @@
 
 param="model_name_or_path=facebook/llama-13b "
 param+="per_device_train_batch_size=1 "
-param+="tensor_parallel_degree=1 "
 param+="data_parallel_degree=1 "
+param+="tensor_parallel_degree=1 "
 param+="pipeline_parallel_degree=8 "
 param+="virtual_pp_degree=5 "
 param+="sequence_parallel=0 "
 param+="sharding_parallel_degree=1 "
-param+="save_steps=200 "
 param+="sharding=stage1 "
 param+="recompute=1 "
-param+="run_mode=DP1-MP1-PP8-mbs1-acc32-recompute "
+param+="run_mode=PP8-vpp5-mbs1-acc32-recompute "
 param+="device_num=N1C8 "
 param+="global_batch_size=32 "
 param+="model_item=facebook-llama-13b_seqlen2048_pretrain "
@@ -37,4 +36,4 @@ param+="recompute_use_reentrant=true "
 cd ./tests
 bash ./test_tipc/dygraph/hybrid_parallelism/llama/benchmark_common/prepare.sh
 
-bash -c "${param} bash ./test_tipc/dygraph/hybrid_parallelism/llama/benchmark_common/run_benchmark.sh"
+bash -c "${param} bash ./test_tipc/dygraph/hybrid_parallelism/llama/benchmark_common/run_benchmark.sh"
Lines changed: 3 additions & 4 deletions
@@ -14,16 +14,15 @@
 
 param="model_name_or_path=facebook/llama-13b "
 param+="per_device_train_batch_size=1 "
-param+="tensor_parallel_degree=2 "
 param+="data_parallel_degree=1 "
+param+="tensor_parallel_degree=2 "
 param+="pipeline_parallel_degree=4 "
 param+="virtual_pp_degree=1 "
 param+="sequence_parallel=0 "
 param+="sharding_parallel_degree=2 "
-param+="save_steps=200 "
 param+="sharding=stage1 "
 param+="recompute=0 "
-param+="run_mode=DP1-MP2-PP4-VPP1-mbs1-acc32-recompute "
+param+="run_mode=MP2-PP4-SD2-stage1-mbs1-acc32 "
 param+="device_num=N2C16 "
 param+="global_batch_size=64 "
 param+="model_item=facebook-llama-13b_seqlen2048_pretrain "
@@ -36,4 +35,4 @@ param+="recompute_use_reentrant=true "
 cd ./tests
 bash ./test_tipc/dygraph/hybrid_parallelism/llama/benchmark_common/prepare.sh
 
-bash -c "${param} bash ./test_tipc/dygraph/hybrid_parallelism/llama/benchmark_common/run_benchmark.sh"
+bash -c "${param} bash ./test_tipc/dygraph/hybrid_parallelism/llama/benchmark_common/run_benchmark.sh"
Lines changed: 3 additions & 4 deletions
@@ -14,16 +14,15 @@
 
 param="model_name_or_path=facebook/llama-13b "
 param+="per_device_train_batch_size=1 "
-param+="tensor_parallel_degree=2 "
 param+="data_parallel_degree=1 "
+param+="tensor_parallel_degree=2 "
 param+="pipeline_parallel_degree=4 "
 param+="virtual_pp_degree=2 "
 param+="sequence_parallel=0 "
 param+="sharding_parallel_degree=2 "
-param+="save_steps=200 "
 param+="sharding=stage1 "
 param+="recompute=0 "
-param+="run_mode=DP1-MP2-PP4-VPP2-mbs1-acc32-recompute "
+param+="run_mode=MP2-PP4-SD2-stage1-vpp2-mbs1-acc32 "
 param+="device_num=N2C16 "
 param+="global_batch_size=64 "
 param+="model_item=facebook-llama-13b_seqlen2048_pretrain "
@@ -36,4 +35,4 @@ param+="recompute_use_reentrant=true "
 cd ./tests
 bash ./test_tipc/dygraph/hybrid_parallelism/llama/benchmark_common/prepare.sh
 
-bash -c "${param} bash ./test_tipc/dygraph/hybrid_parallelism/llama/benchmark_common/run_benchmark.sh"
+bash -c "${param} bash ./test_tipc/dygraph/hybrid_parallelism/llama/benchmark_common/run_benchmark.sh"
Lines changed: 38 additions & 0 deletions (new file)
@@ -0,0 +1,38 @@
+# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+param="model_name_or_path=facebook/llama-13b "
+param+="per_device_train_batch_size=1 "
+param+="data_parallel_degree=1 "
+param+="tensor_parallel_degree=1 "
+param+="pipeline_parallel_degree=8 "
+param+="virtual_pp_degree=1 "
+param+="sequence_parallel=0 "
+param+="sharding_parallel_degree=2 "
+param+="sharding=stage1 "
+param+="recompute=0 "
+param+="run_mode=PP8-SD2-stage1-mbs1-acc32 "
+param+="device_num=N2C16 "
+param+="global_batch_size=64 "
+param+="model_item=facebook-llama-13b_seqlen2048_pretrain "
+param+="max_steps=150 "
+param+="gradient_accumulation_steps=32 "
+param+="pp_recompute_interval=1 "
+param+="tensor_parallel_config=enable_mp_async_allreduce,enable_mp_skip_c_identity,enable_mp_fused_linear_param_grad_add "
+param+="recompute_use_reentrant=true "
+
+cd ./tests
+bash ./test_tipc/dygraph/hybrid_parallelism/llama/benchmark_common/prepare.sh
+
+bash -c "${param} bash ./test_tipc/dygraph/hybrid_parallelism/llama/benchmark_common/run_benchmark.sh"
Lines changed: 39 additions & 0 deletions (new file)
@@ -0,0 +1,39 @@
+# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+param="model_name_or_path=facebook/llama-13b "
+param+="per_device_train_batch_size=1 "
+param+="data_parallel_degree=1 "
+param+="tensor_parallel_degree=2 "
+param+="pipeline_parallel_degree=4 "
+param+="virtual_pp_degree=1 "
+param+="sequence_parallel=0 "
+param+="sharding_parallel_degree=4 "
+param+="sharding=stage1 "
+param+="recompute=0 "
+param+="run_mode=MP2-PP4-SD4-stage1-mbs1-acc32 "
+param+="device_num=N4C32 "
+param+="global_batch_size=128 "
+param+="model_item=facebook-llama-13b_seqlen2048_pretrain "
+param+="max_steps=150 "
+param+="gradient_accumulation_steps=32 "
+param+="pp_recompute_interval=1 "
+param+="tensor_parallel_config=enable_mp_async_allreduce,enable_mp_skip_c_identity,enable_mp_fused_linear_param_grad_add "
+param+="pipeline_parallel_config=enable_sharding_comm_overlap "
+param+="recompute_use_reentrant=true "
+
+cd ./tests
+bash ./test_tipc/dygraph/hybrid_parallelism/llama/benchmark_common/prepare.sh
+
+bash -c "${param} bash ./test_tipc/dygraph/hybrid_parallelism/llama/benchmark_common/run_benchmark.sh"
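The same accounting (an assumption, as above) applies to this new four-node config, which is the headline addition of the commit: 2 (MP) x 4 (PP) x 4 (sharding) covers the 32 cards of N4C32, and 1 x 32 x 4 gives the global batch of 128. It is also the only config that sets pipeline_parallel_config=enable_sharding_comm_overlap, which run_benchmark.sh now forwards conditionally (see the last diff below).

    # Assumed accounting check for the N4C32 run (not a PaddleNLP API):
    dp=1; tp=2; pp=4; sd=4; mbs=1; acc=32
    echo "cards=$((dp * tp * pp * sd)) global_batch=$((mbs * acc * dp * sd))"   # cards=32 global_batch=128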
Lines changed: 38 additions & 0 deletions (new file)
@@ -0,0 +1,38 @@
+# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+param="model_name_or_path=facebook/llama-13b "
+param+="per_device_train_batch_size=1 "
+param+="data_parallel_degree=1 "
+param+="tensor_parallel_degree=1 "
+param+="pipeline_parallel_degree=8 "
+param+="virtual_pp_degree=1 "
+param+="sequence_parallel=0 "
+param+="sharding_parallel_degree=4 "
+param+="sharding=stage1 "
+param+="recompute=0 "
+param+="run_mode=PP8-SD4-stage1-mbs1-acc32 "
+param+="device_num=N4C32 "
+param+="global_batch_size=128 "
+param+="model_item=facebook-llama-13b_seqlen2048_pretrain "
+param+="max_steps=150 "
+param+="gradient_accumulation_steps=32 "
+param+="pp_recompute_interval=1 "
+param+="tensor_parallel_config=enable_mp_async_allreduce,enable_mp_skip_c_identity,enable_mp_fused_linear_param_grad_add "
+param+="recompute_use_reentrant=true "
+
+cd ./tests
+bash ./test_tipc/dygraph/hybrid_parallelism/llama/benchmark_common/prepare.sh
+
+bash -c "${param} bash ./test_tipc/dygraph/hybrid_parallelism/llama/benchmark_common/run_benchmark.sh"

tests/test_tipc/dygraph/hybrid_parallelism/llama/benchmark_common/run_benchmark.sh

Lines changed: 28 additions & 27 deletions
@@ -15,7 +15,7 @@
 # limitations under the License.
 
 # Test training benchmark for a model.
-# Usage:bash benchmark/run_benchmark.sh ${model_name_or_path} ${per_device_train_batch_size} ${tensor_parallel_degree} ${pipeline_parallel_degree} ${virtual_pp_degree} ${sequence_parallel} ${sharding_parallel_degree} ${save_steps} ${sharding} ${recompute} ${run_mode} ${device_num}
+# Usage:bash benchmark/run_benchmark.sh ${model_name_or_path} ${per_device_train_batch_size} ${tensor_parallel_degree} ${pipeline_parallel_degree} ${virtual_pp_degree} ${sequence_parallel} ${sharding_parallel_degree} ${sharding} ${recompute} ${run_mode} ${device_num}
 function _set_params(){
     model_name_or_path=${model_name_or_path:-"facebook/llama-13b"}
     per_device_train_batch_size=${per_device_train_batch_size:-1}
@@ -25,7 +25,6 @@ function _set_params(){
     virtual_pp_degree=${virtual_pp_degree:-2}
     sequence_parallel=${sequence_parallel:-0}
     sharding_parallel_degree=${sharding_parallel_degree:-2}
-    save_steps=${save_steps:-200}
     sharding=${sharding:-"stage1"}
     recompute=${recompute:-1}
     run_mode=${run_mode:-"DP1-MP2-PP4-mbs1-acc32-recompute"}
@@ -36,6 +35,7 @@ function _set_params(){
     gradient_accumulation_steps=${gradient_accumulation_steps:-32}
     pp_recompute_interval=${pp_recompute_interval:-1}
     tensor_parallel_config=${tensor_parallel_config:-"enable_mp_async_allreduce,enable_mp_skip_c_identity,enable_mp_fused_linear_param_grad_add"}
+    pipeline_parallel_config=${pipeline_parallel_config:-""}
     recompute_use_reentrant=${recompute_use_reentrant:-"true"}
 
     base_batch_size=${global_batch_size}
@@ -92,6 +92,12 @@ function _train(){
         use_fp16_cmd="--use_amp true"
     fi
 
+    if [ "${pipeline_parallel_config}" != "" ]; then
+        pipeline_parallel_config_args="--pipeline_parallel_config ${pipeline_parallel_config}"
+    else
+        pipeline_parallel_config_args=""
+    fi
+
     use_pure_fp16=False
     train_cmd="--model_type llama \
            --model_name_or_path ${model_name_or_path} \
@@ -130,7 +136,7 @@ function _train(){
            --enable_linear_fused_grad_add true \
            --fuse_attention_qkv true \
            --fuse_attention_ffn true \
-           --tensor_parallel_config ${tensor_parallel_config} \
+           --tensor_parallel_config ${tensor_parallel_config} ${pipeline_parallel_config_args} \
            --recompute ${recompute} \
            --recompute_use_reentrant ${recompute_use_reentrant} \
            --data_cache ./data_cache"
@@ -142,33 +148,28 @@ function _train(){
         PADDLE_RANK_OPTION=""
     fi
     # The launch commands below are generic; normally no changes are needed.
-    if [ "N1C2" = ${device_num} ]; then
-        # sharding case
-        echo "run run_mode: DP1-MP1-PP1 device_num: N1C2"
+    case ${device_num} in
+    N1C1) echo "Run with: device_num=${device_num}, run_mode=${run_mode}"
+        train_cmd="python -m paddle.distributed.launch --log_dir=./mylog --gpus=0 ${PADDLE_RANK_OPTION}\
+            run_pretrain.py ${train_cmd}"
+        workerlog_id=0
+        ;;
+    N1C2) echo "Run with: device_num=${device_num}, run_mode=${run_mode}"
         train_cmd="python -m paddle.distributed.launch --log_dir=./mylog --gpus=0,1 ${PADDLE_RANK_OPTION}\
             run_pretrain.py ${train_cmd}"
         workerlog_id=0
-    else
-        # hybrid_parallelism case
-        case ${run_mode} in
-        DP1-MP1-PP1) echo "run run_mode: DP1-MP1-PP1"
-            train_cmd="python -m paddle.distributed.launch --log_dir=./mylog --gpus=0 ${PADDLE_RANK_OPTION}\
-                run_pretrain.py ${train_cmd}"
-            workerlog_id=0
-            ;;
-        DP1-MP1-PP4|DP1-MP4-PP1) echo "run run_mode: ${run_mode}"
-            train_cmd="python -m paddle.distributed.launch --log_dir=./mylog --gpus=0,1,2,3 ${PADDLE_RANK_OPTION}\
-                run_pretrain.py ${train_cmd}"
-            workerlog_id=0
-            ;;
-        DP1-MP2-PP4-mbs2-acc16-recompute|DP1-MP1-PP8-mbs1-acc32-recompute|DP1-MP2-PP4-VPP2-mbs1-acc32-recompute|DP1-MP2-PP4-VPP1-mbs1-acc32-recompute) echo "run run_mode: ${run_mode}"
-            train_cmd="python -m paddle.distributed.launch --log_dir=./mylog --gpus=0,1,2,3,4,5,6,7 ${PADDLE_RANK_OPTION}\
-                run_pretrain.py ${train_cmd}"
-            workerlog_id=0
-            ;;
-        *) echo "choose run_mode "; exit 1;
-        esac
-    fi
+        ;;
+    N1C4) echo "Run with: device_num=${device_num}, run_mode=${run_mode}"
+        train_cmd="python -m paddle.distributed.launch --log_dir=./mylog --gpus=0,1,2,3 ${PADDLE_RANK_OPTION}\
+            run_pretrain.py ${train_cmd}"
+        workerlog_id=0
+        ;;
+    *) echo "Run with: device_num=${device_num}, run_mode=${run_mode}"
+        train_cmd="python -m paddle.distributed.launch --log_dir=./mylog --gpus=0,1,2,3,4,5,6,7 ${PADDLE_RANK_OPTION}\
+            run_pretrain.py ${train_cmd}"
+        workerlog_id=0
+        ;;
+    esac
     cd ../llm/llama
     echo "train_cmd: ${train_cmd} log_file: ${log_file}"
     python -c "import paddlenlp"
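Two behavioral changes in this script are worth spelling out. First, launch selection now keys off device_num instead of an enumerated list of run_mode strings, so the renamed run_mode values in the configs above no longer need to be registered in the case statement; anything other than N1C1, N1C2, or N1C4 falls through to the 8-GPUs-per-node default. Second, pipeline_parallel_config is passed to run_pretrain.py only when it is non-empty. A usage sketch of how the N4C32 config reaches that branch (values and paths taken from the diffs above):

    # Setting the new option is just another key=value pair in the config script:
    param="device_num=N4C32 "
    param+="run_mode=MP2-PP4-SD4-stage1-mbs1-acc32 "
    param+="pipeline_parallel_config=enable_sharding_comm_overlap "
    bash -c "${param} bash ./test_tipc/dygraph/hybrid_parallelism/llama/benchmark_common/run_benchmark.sh"

    # Inside run_benchmark.sh this expands to
    #   --tensor_parallel_config ... --pipeline_parallel_config enable_sharding_comm_overlap
    # while configs that leave pipeline_parallel_config unset add no extra flag.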
