
Commit 227ec81

add benchmark baichuan2 scripts (#8683)
* remove tsinghua pypi
* modify gpt dataset addr for benchmark
* fix run_benchmark for llama2_70b in auto_parallel
* fix autotuner benchmark error and fix llama2 dy2st benchmark
* fix benchmark dir because of PR#8627
* Update run_benchmark.sh
* fix benchmark dir because of PR#8627
* add qwen N4C32
* fix etcd
* fix qwen N4C32 for per_device_type
* fix benchmark dir and add CUDA_DEVICE_MAX_CONNECTIONS to qwen
* add benchmark baichuan2 scripts
* fix run time
* update baichun2_13b config
1 parent 5e53db1 commit 227ec81

File tree

6 files changed: +313 -0 lines
Lines changed: 25 additions & 0 deletions
@@ -0,0 +1,25 @@
# Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

param="model_item=baichuan-inc-Baichun2-13b_pretrain "
param+="run_mode=DP1_MP2_PP2_VPP1_Sharding8_Stage1 "
param+="device_num=N4C32 "
param+="global_batch_size=32 "
param+="nnodes=4 "
param+="model_type=baichun2_13b "

cd ./tests
bash ./test_tipc/dygraph/hybrid_parallelism/baichun2/benchmark_common/prepare.sh

bash -c "${param} bash ./test_tipc/dygraph/hybrid_parallelism/baichun2/benchmark_common/run_benchmark.sh"
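The launch line above relies on a standard shell rule: key=value assignments prefixed to a command become environment variables of that command, which is how run_benchmark.sh later picks up model_item, run_mode, device_num, and the rest through its ${var:-default} expansions. A minimal sketch of the pattern, using hypothetical values rather than anything from this commit:

#!/usr/bin/env bash
# Illustration only: the key=value prefix held in ${param} is exported into the
# child process's environment, so the inner command can read the variables directly.
param="model_item=demo_item "
param+="device_num=N4C32 "
# Prints "demo_item N4C32"; the escaped inner expansions happen in the child shell.
bash -c "${param} bash -c 'echo \${model_item} \${device_num}'"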
Lines changed: 25 additions & 0 deletions
@@ -0,0 +1,25 @@
# Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

param="model_item=baichuan-inc-Baichun2-7b_pretrain "
param+="run_mode=DP1_MP2_PP1_VPP1_Sharding16_Stage1 "
param+="device_num=N4C32 "
param+="global_batch_size=32 "
param+="nnodes=4 "
param+="model_type=baichun2_7b "

cd ./tests
bash ./test_tipc/dygraph/hybrid_parallelism/baichun2/benchmark_common/prepare.sh

bash -c "${param} bash ./test_tipc/dygraph/hybrid_parallelism/baichun2/benchmark_common/run_benchmark.sh"
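Both N4C32 launch scripts target the same 32 GPUs but split them differently: the 13B run uses MP2 x PP2 x Sharding8, while the 7B run drops pipeline parallelism and widens sharding to MP2 x PP1 x Sharding16; in both cases the product is 32 and global_batch_size stays at 32.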
Lines changed: 46 additions & 0 deletions
@@ -0,0 +1,46 @@
{
    "model_name_or_path": "baichuan-inc/Baichuan2-13B-Base",
    "tokenizer_name_or_path": "baichuan-inc/Baichuan2-13B-Base",
    "input_dir": "./data",
    "output_dir": "./output/baichun2-13b_pretrain_ckpts",
    "split": "949,50,1",
    "max_seq_length": 4096,
    "gradient_accumulation_steps": 4,
    "tensor_parallel_degree": 2,
    "pipeline_parallel_degree": 2,
    "virtual_pp_degree": 1,
    "sequence_parallel": 1,
    "sharding_parallel_degree": 8,
    "sharding": "stage1",
    "pipeline_parallel_config": "enable_delay_scale_loss enable_sharding_comm_overlap enable_release_grads",
    "tensor_parallel_config": "enable_delay_scale_loss enable_mp_async_allreduce enable_mp_skip_c_identity enable_mp_fused_linear_param_grad_add",
    "per_device_train_batch_size": 1,
    "use_flash_attention": true,
    "use_fused_rms_norm": true,
    "fuse_attention_qkv": true,
    "use_fused_rope": true,
    "fuse_attention_ffn": true,
    "enable_linear_fused_grad_add": true,
    "bf16": true,
    "fp16_opt_level": "O2",
    "scale_loss": 1024,
    "learning_rate": 1e-05,
    "min_learning_rate": 5e-06,
    "max_steps": 100,
    "save_steps": 5000,
    "weight_decay": 0.01,
    "warmup_ratio": 0.01,
    "max_grad_norm": 1.0,
    "logging_steps": 1,
    "dataloader_num_workers": 1,
    "eval_steps": 1000,
    "disable_tqdm": true,
    "continue_training": 0,
    "recompute": false,
    "recompute_granularity": "full_attn",
    "do_train": true,
    "pp_recompute_interval": 1,
    "device": "gpu",
    "amp_master_grad": true,
    "sharding_parallel_config": "split_param enable_stage1_overlap"
}
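As a quick consistency check (an illustration added here, not part of the commit), the 13B config's parallel degrees should account for all 32 GPUs implied by device_num=N4C32, and the per-device batch size, gradient accumulation, and sharding degree (assumed to act as the data-parallel dimension under stage1 sharding) should reproduce the global_batch_size=32 set by the launch script:

#!/usr/bin/env bash
# Hypothetical sanity check of the 13B pretrain config; the numbers are copied from the JSON above.
tp=2          # tensor_parallel_degree
pp=2          # pipeline_parallel_degree
sharding=8    # sharding_parallel_degree (data-parallel ways under stage1 sharding)
per_dev_bs=1  # per_device_train_batch_size
accum=4       # gradient_accumulation_steps
echo "GPUs covered:      $(( tp * pp * sharding ))"             # expect 32 (N4C32)
echo "global batch size: $(( per_dev_bs * accum * sharding ))"  # expect 32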
Lines changed: 46 additions & 0 deletions
@@ -0,0 +1,46 @@
{
    "model_name_or_path": "baichuan-inc/Baichuan2-7B-Base",
    "tokenizer_name_or_path": "baichuan-inc/Baichuan2-7B-Base",
    "input_dir": "./data",
    "output_dir": "./output/baichun2-7b_pretrain_ckpts",
    "split": "949,50,1",
    "max_seq_length": 4096,
    "gradient_accumulation_steps": 1,
    "tensor_parallel_degree": 2,
    "pipeline_parallel_degree": 1,
    "virtual_pp_degree": 1,
    "sequence_parallel": 1,
    "sharding_parallel_degree": 16,
    "sharding": "stage1",
    "pipeline_parallel_config": "enable_delay_scale_loss enable_sharding_comm_overlap enable_release_grads",
    "tensor_parallel_config": "enable_delay_scale_loss enable_mp_async_allreduce enable_mp_skip_c_identity enable_mp_fused_linear_param_grad_add",
    "per_device_train_batch_size": 2,
    "use_flash_attention": true,
    "use_fused_rms_norm": true,
    "fuse_attention_qkv": true,
    "use_fused_rope": true,
    "fuse_attention_ffn": true,
    "enable_linear_fused_grad_add": true,
    "bf16": true,
    "fp16_opt_level": "O2",
    "scale_loss": 1024,
    "learning_rate": 1e-05,
    "min_learning_rate": 5e-06,
    "max_steps": 500,
    "save_steps": 5000,
    "weight_decay": 0.01,
    "warmup_ratio": 0.01,
    "max_grad_norm": 1.0,
    "logging_steps": 1,
    "dataloader_num_workers": 1,
    "eval_steps": 1000,
    "disable_tqdm": true,
    "continue_training": 0,
    "recompute": false,
    "recompute_granularity": "full_attn",
    "do_train": true,
    "pp_recompute_interval": 1,
    "device": "gpu",
    "amp_master_grad": true,
    "sharding_parallel_config": "split_param enable_stage1_overlap"
}
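The same accounting holds for the 7B config, again assuming the sharding degree serves as the data-parallel dimension: 2 (TP) x 1 (PP) x 16 (sharding) = 32 GPUs, and 2 (per-device batch) x 1 (accumulation step) x 16 = 32, matching global_batch_size=32 in the corresponding launch script.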
Lines changed: 37 additions & 0 deletions
@@ -0,0 +1,37 @@
# Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

python -m pip install -r ../requirements.txt
python -m pip install -r ../requirements-dev.txt

# install fused_ln custom ops
cd ../legacy/model_zoo/gpt-3/external_ops/
python setup.py install
cd -

python -m pip install tiktoken

# install tool_helpers
cd ../llm/
mkdir data
cd data
rm -rf *
# download data
wget https://bj.bcebos.com/paddlenlp/models/transformers/llama/data/llama_openwebtext_100k_ids.npy
wget https://bj.bcebos.com/paddlenlp/models/transformers/llama/data/llama_openwebtext_100k_idx.npz
cd -

# mv autoconfig
rm -rf auto_config_*
cp -r ../tests/test_tipc/dygraph/hybrid_parallelism/baichun2/auto_config_* ./
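After prepare.sh has run, a quick hand check of the working directory can confirm that the pieces run_benchmark.sh depends on are in place. The directory names below are inferred from the auto_config_${MODEL_TYPE} pattern in run_benchmark.sh and the model_type values in the launch scripts; treat this as an optional sketch rather than part of the commit:

#!/usr/bin/env bash
# Assumed post-prepare layout check, run from the llm/ directory.
ls data/llama_openwebtext_100k_ids.npy data/llama_openwebtext_100k_idx.npz \
  || echo "pretraining data missing; re-run the wget steps in prepare.sh"
ls -d auto_config_baichun2_7b auto_config_baichun2_13b \
  || echo "auto_config_* dirs missing; re-run the cp step in prepare.sh"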
Lines changed: 134 additions & 0 deletions
@@ -0,0 +1,134 @@
#!/usr/bin/env bash

# Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# Test training benchmark for a model.
# Usage: bash benchmark/run_benchmark.sh ${model_name_or_path} ${per_device_train_batch_size} ${tensor_parallel_degree} ${pipeline_parallel_degree} ${virtual_pp_degree} ${sequence_parallel} ${sharding_parallel_degree} ${sharding} ${recompute} ${run_mode} ${device_num}
function _set_params(){
    model_item=${model_item:-"baichuan-inc-Baichun2-7b_pretrain"}
    run_mode=${run_mode:-"MP2-PP1"}
    device_num=${device_num:-"N1C8"}
    global_batch_size=${global_batch_size:-64}
    fp_item="bf16"
    MODEL_TYPE=${model_type:-"baichun2_7b"}

    ip_lists=($(echo $TRAINER_INSTANCES | tr ',' ' '))
    master_ip=${ip_lists[0]}
    nnodes=${nnodes:-1}

    base_batch_size=${global_batch_size}
    profiling=${PROFILING:-"false"}       # (required) profiling switch, off by default, passed in as a global variable
    model_repo="PaddleNLP"                # (required) name of the model suite
    speed_unit="tokens/s"                 # (required) unit of the speed metric
    skip_steps=10                         # (required) for log parsing: number of initial unstable steps to skip
    keyword="interval_tokens_per_second_per_device:"  # (required) for log parsing: keyword marking the lines that carry performance data
    convergence_key="loss:"               # (optional) for log parsing: keyword marking the lines that carry convergence data, e.g. convergence_key="loss:"
    model_mode=5                          # compute ips and its unit: average only the steps after skip_steps and keep the unit as tokens/s

    # The commands below are generic and normally need no changes.
    model_name=${model_item}_bs${global_batch_size}_${fp_item}_${run_mode}  # (required) do not change this format; it is aligned with competitor model naming
    device=${CUDA_VISIBLE_DEVICES//,/ }
    arr=(${device})
    num_gpu_devices=${#arr[*]}
    run_log_path=${TRAIN_LOG_DIR:-$(pwd)}            # (required) TRAIN_LOG_DIR is set as a global variable by the benchmark framework
    profiling_log_path=${PROFILING_LOG_DIR:-$(pwd)}  # (required) PROFILING_LOG_DIR is set as a global variable by the benchmark framework
    speed_log_path=${LOG_PATH_INDEX_DIR:-$(pwd)}
    train_log_file=${run_log_path}/${model_repo}_${model_name}_${device_num}_log
    mkdir -p $(dirname ${train_log_file})

    profiling_log_file=${profiling_log_path}/${model_repo}_${model_name}_${device_num}_profiling
    mkdir -p $(dirname ${profiling_log_file})

    speed_log_file=${speed_log_path}/${model_repo}_${model_name}_${device_num}_speed
    mkdir -p $(dirname ${speed_log_file})

    OUTPUT_PATH=${run_log_path}/output
}

function _train(){
    batch_size=${per_device_train_batch_size}  # if the model runs multiple GPUs in a single process, compute the multi-GPU batch size here

    if [ -d $OUTPUT_PATH ]; then
        rm -rf $OUTPUT_PATH
    fi
    mkdir $OUTPUT_PATH

    echo "current CUDA_VISIBLE_DEVICES=${CUDA_VISIBLE_DEVICES}, model_name=${model_name}, device_num=${device_num}, is profiling=${profiling}"

    if [ ${profiling} == "true" ];then
        add_options="--profiler_options=\"batch_range=[10,20];state=GPU;tracer_option=Default;profile_path=model.profile\""
        log_file=${profiling_log_file}
    else
        add_options=""
        log_file=${train_log_file}
    fi

    if [ ${PADDLE_TRAINER_ID} ]; then
        PADDLE_RANK_OPTION=" --rank ${PADDLE_TRAINER_ID}"
    else
        PADDLE_RANK_OPTION=""
    fi

    distributed_args="--master $master_ip:36677 --nnodes $nnodes ${PADDLE_RANK_OPTION} --run_mode=collective"

    echo "==========System Env============="
    env
    echo "================================="

    # The commands below are generic and normally need no changes.
    case ${device_num} in
    N1C8) echo "Run with: device_num=${device_num}, run_mode=${run_mode}"
        train_cmd="python -u -m paddle.distributed.launch --gpus=0,1,2,3,4,5,6,7 \
            --nnodes 1 --nproc_per_node 8 \
            --log_dir mylog run_pretrain.py \
            ./auto_config_${MODEL_TYPE}/pretrain-${MODEL_TYPE}-config.json"
        ;;
    N4C32) echo "Run with: device_num=${device_num} run_mode=${run_mode}"
        train_cmd="python -u -m paddle.distributed.launch --gpus=0,1,2,3,4,5,6,7 \
            ${distributed_args} --log_dir mylog run_pretrain.py \
            ./auto_config_${MODEL_TYPE}/pretrain-${MODEL_TYPE}-config.json"
        ;;
    *) echo "Run with: device_num=${device_num}, run_mode=${run_mode}"
        train_cmd="python -u -m paddle.distributed.launch --gpus=0,1,2,3,4,5,6,7 \
            ${distributed_args} --log_dir mylog run_pretrain.py \
            ./auto_config_${MODEL_TYPE}/pretrain-${MODEL_TYPE}-config.json"
        ;;
    esac
    cd ../llm
    rm -rf mylog && rm -rf checkpoints

    echo "train_cmd: ${train_cmd} log_file: ${log_file}"
    timeout 40m ${train_cmd} > ${log_file} 2>&1

    if [ $? -ne 0 ];then
        echo -e "${model_name}, FAIL"
    else
        echo -e "${model_name}, SUCCESS"
    fi

    #kill -9 `ps -ef|grep 'python'|awk '{print $2}'`
    if [ ${device_num} != "N1C1" -a -d mylog ]; then
        case_path=$PWD && cd - && mkdir -p mylog  # PaddleNLP/tests/mylog
        cp -r ${case_path}/mylog/workerlog.* ./mylog/
    fi
}

export FLAGS_selected_gpus="0,1,2,3,4,5,6,7"
export PYTHONPATH=$(dirname "$PWD"):$PYTHONPATH

source ${BENCHMARK_ROOT}/scripts/run_model.sh  # parses benchmark-compliant logs with the analysis.py script; if you only want to produce a training log without full integration you can comment this out, but it must be enabled when submitting
_set_params $@
#_train  # uncomment to only produce the training log without parsing
_run  # defined in run_model.sh; it calls _train when executed. If you only want the training log without full integration you can comment this out, but it must be enabled when submitting
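run_benchmark.sh is normally driven by the N4C32 launch scripts above together with the benchmark harness, which exports the environment that _set_params reads. A hypothetical sketch of that environment (the paths and IPs below are placeholders, and ${BENCHMARK_ROOT}/scripts/run_model.sh must exist because the script sources it and relies on its _run to call _train):

#!/usr/bin/env bash
# Placeholder environment assumed by run_benchmark.sh; values here are illustrative only.
export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7                    # parsed in _set_params to count GPUs
export TRAINER_INSTANCES=10.0.0.1,10.0.0.2,10.0.0.3,10.0.0.4   # first entry becomes master_ip
export TRAIN_LOG_DIR=$PWD/train_logs
export PROFILING_LOG_DIR=$PWD/profiling_logs
export LOG_PATH_INDEX_DIR=$PWD/speed_logs
export BENCHMARK_ROOT=/path/to/benchmark                       # must provide scripts/run_model.sh
cd ./tests
# Then invoke run_benchmark.sh the way the launch scripts do, e.g. for the 13B case:
model_item=baichuan-inc-Baichun2-13b_pretrain run_mode=DP1_MP2_PP2_VPP1_Sharding8_Stage1 \
  device_num=N4C32 global_batch_size=32 nnodes=4 model_type=baichun2_13b \
  bash ./test_tipc/dygraph/hybrid_parallelism/baichun2/benchmark_common/run_benchmark.sh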
