
Commit af33517

add pir llama dp ci test.
1 parent c6f4159 commit af33517

File tree

1 file changed: +64 -0 lines changed

scripts/distribute/ci_case_auto.sh

Lines changed: 64 additions & 0 deletions
@@ -62,6 +62,8 @@ function llama_case_list_auto() {
     llama_dygraph_auto_bs8_fp16_DP2-MP2-PP2
     llama_dy2st_auto_bs4_bf16_DP1-MP1-PP4-SD2
 
+    llama_dy2st_pir_auto_bs1_fp32_DP2-MP1-PP1
+
     llama_static_auto_recompute_bs8_fp32_DP1-MP1-PP1
     llama_static_auto_recompute_bs16_fp32_DP2-MP1-PP1
     llama_static_auto_recompute_bs16_fp32_DP2-MP2-PP1
@@ -1751,6 +1753,68 @@ function llama_dy2st_auto_bs4_bf16_DP1-MP1-PP4-SD2() {
     echo "=========== $FUNCNAME run end ==========="
 }
 
+function llama_dy2st_pir_auto_bs1_fp32_DP2-MP1-PP1() {
+    echo "=========== $FUNCNAME run begin ==========="
+    export PYTHONPATH=$root_path/:$PYTHONPATH
+    export FLAGS_call_stack_level=3
+    export NVIDIA_TF32_OVERRIDE=0
+
+    task_name="llama_dy2st_pir_auto_bs1_fp32_dp2mp1pp1"
+    case_out_dir="output/$task_name"
+    case_log_dir="output/$task_name""_log"
+    rm -rf $case_out_dir
+    rm -rf $case_log_dir
+
+    python -u -m paddle.distributed.launch --gpus "0,1" --log_dir $case_log_dir run_pretrain_auto.py \
+        --model_type "llama" \
+        --model_name_or_path "facebook/llama-7b" \
+        --tokenizer_name_or_path "facebook/llama-7b" \
+        --input_dir "./data" \
+        --output_dir $case_out_dir \
+        --split 949,50,1 \
+        --max_seq_length 2048 \
+        --per_device_train_batch_size 1 \
+        --per_device_eval_batch_size 0 \
+        --gradient_accumulation_steps 1 \
+        --use_flash_attention 0 \
+        --use_fused_rms_norm 0 \
+        --fp16 0 \
+        --fp16_opt_level "O2" \
+        --scale_loss 1024 \
+        --pipeline_parallel_degree 1 \
+        --tensor_parallel_degree 1 \
+        --sharding_parallel_degree 1 \
+        --learning_rate 0.0001 \
+        --min_learning_rate 0.00001 \
+        --max_steps 10 \
+        --save_steps 5000000 \
+        --weight_decay 0.01 \
+        --warmup_ratio 0.01 \
+        --logging_steps 1 \
+        --dataloader_num_workers 1 \
+        --sharding "" \
+        --eval_steps 1000000 \
+        --disable_tqdm true \
+        --continue_training 0 \
+        --recompute 0 \
+        --do_train \
+        --do_eval 0 \
+        --device "gpu" \
+        --data_impl "mmap" \
+        --enable_auto_parallel 1 \
+        --max_grad_norm 1.0 \
+        --to_static $to_static \
+        --num_hidden_layers 2 \
+        --hidden_size 1024 \
+        --intermediate_size 4096 \
+        >>${log_path}/$FUNCNAME 2>&1
+    loss=`cat $case_log_dir/workerlog.0 | grep 'global_step: 10' | awk -F 'loss: ' '{print $2}' | awk -F ',' '{print $1}'`
+    ips=-1
+    mem=-1
+    echo "result: loss=$loss ips=$ips mem=$mem"
+    echo "=========== $FUNCNAME run end ==========="
+}
+
 function llm_gpt_dygraph_auto_bs8_fp32_DP2() {
     echo "=========== $FUNCNAME run begin ==========="
     export PYTHONPATH=$root_path/:$PYTHONPATH
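Note that the commit only defines the case and registers it in llama_case_list_auto; the CI driver is what actually invokes it. For a rough idea of what the function expects when exercised by hand, here is a minimal sketch. The variable names ($root_path, $log_path, $to_static) are the ones the function reads in the diff; their values, the log directory, and the assumptions that sourcing the script only defines functions and that the working directory contains run_pretrain_auto.py and ./data are all illustrative, not taken from this commit.

# Minimal local-run sketch, not the CI driver's invocation path.
# Assumptions: PaddleNLP checkout, two visible GPUs, run_pretrain_auto.py and
# ./data in the current directory (as for the other llama auto-parallel cases).
export root_path=$PWD            # used by the case to extend PYTHONPATH
export log_path=$PWD/ci_logs     # the case appends its output to ${log_path}/$FUNCNAME
export to_static=1               # forwarded by the case via --to_static $to_static
mkdir -p "$log_path"

source "$root_path/scripts/distribute/ci_case_auto.sh"   # defines the case functions
llama_dy2st_pir_auto_bs1_fp32_DP2-MP1-PP1                 # launch run + loss extraction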

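The case's only checked signal is the loss it scrapes from workerlog.0: it greps the step-10 training line and cuts out the value that follows "loss: ". A quick illustration of that grep/awk pipeline on a made-up log line; the line format below is an assumption inferred from the field separators, not copied from a real run.

# Hypothetical workerlog.0 line; only the "loss: <value>," shape matters here.
line='global_step: 10, loss: 9.285423, lr: 3e-05, ips: 1234.5'
echo "$line" | grep 'global_step: 10' | awk -F 'loss: ' '{print $2}' | awk -F ',' '{print $1}'
# prints 9.285423, which the case then echoes as "result: loss=9.285423 ips=-1 mem=-1"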