@@ -62,6 +62,8 @@ function llama_case_list_auto() {
     llama_dygraph_auto_bs8_fp16_DP2-MP2-PP2
     llama_dy2st_auto_bs4_bf16_DP1-MP1-PP4-SD2
 
+    llama_dy2st_pir_auto_bs1_fp32_DP2-MP1-PP1
+
     llama_static_auto_recompute_bs8_fp32_DP1-MP1-PP1
     llama_static_auto_recompute_bs16_fp32_DP2-MP1-PP1
     llama_static_auto_recompute_bs16_fp32_DP2-MP2-PP1
@@ -1751,6 +1753,72 @@ function llama_dy2st_auto_bs4_bf16_DP1-MP1-PP4-SD2() {
     echo "=========== $FUNCNAME run end ==========="
 }
 
+function llama_dy2st_pir_auto_bs1_fp32_DP2-MP1-PP1() {
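+    # CI case: LLaMA pretrain via dynamic-to-static (dy2st) under PIR (Paddle Intermediate Representation), bs1, fp32, DP2-MP1-PP1.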
+    echo "=========== $FUNCNAME run begin ==========="
+    export PYTHONPATH=$root_path/:$PYTHONPATH
+    export FLAGS_call_stack_level=3
+    export NVIDIA_TF32_OVERRIDE=0
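+    # NVIDIA_TF32_OVERRIDE=0 disables TF32 math so the fp32 loss checked below is reproducible across GPU generations.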
+
+    task_name="llama_dy2st_pir_auto_bs1_fp32_dp2mp1pp1"
+    case_out_dir="output/$task_name"
+    case_log_dir="output/$task_name""_log"
+    rm -rf $case_out_dir
+    rm -rf $case_log_dir
+
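+    # Launch on 2 GPUs as pure data parallel (DP2; MP/PP/sharding degrees are all 1); the tiny 2-layer model keeps this case fast.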
+    python -u -m paddle.distributed.launch --gpus "0,1" --log_dir $case_log_dir run_pretrain_auto.py \
+        --model_type "llama" \
+        --model_name_or_path "facebook/llama-7b" \
+        --tokenizer_name_or_path "facebook/llama-7b" \
+        --input_dir "./data" \
+        --output_dir $case_out_dir \
+        --split 949,50,1 \
+        --max_seq_length 2048 \
+        --per_device_train_batch_size 1 \
+        --per_device_eval_batch_size 0 \
+        --gradient_accumulation_steps 1 \
+        --use_flash_attention 0 \
+        --use_fused_rms_norm 0 \
+        --fp16 0 \
+        --fp16_opt_level "O2" \
+        --scale_loss 1024 \
+        --pipeline_parallel_degree 1 \
+        --tensor_parallel_degree 1 \
+        --sharding_parallel_degree 1 \
+        --learning_rate 0.0001 \
+        --min_learning_rate 0.00001 \
+        --max_steps 10 \
+        --save_steps 5000000 \
+        --weight_decay 0.01 \
+        --warmup_ratio 0.01 \
+        --logging_steps 1 \
+        --dataloader_num_workers 1 \
+        --sharding "" \
+        --eval_steps 1000000 \
+        --disable_tqdm true \
+        --continue_training 0 \
+        --recompute 0 \
+        --do_train \
+        --do_eval 0 \
+        --device "gpu" \
+        --data_impl "mmap" \
+        --enable_auto_parallel 1 \
+        --max_grad_norm 1.0 \
+        --to_static 1 \
+        --num_hidden_layers 2 \
+        --hidden_size 1024 \
+        --intermediate_size 4096 \
+        >> ${log_path}/$FUNCNAME 2>&1
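+    # Extract the loss logged at global_step 10 from worker 0's log; ips/mem are not measured for this case.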
+    loss=`cat $case_log_dir/workerlog.0 | grep 'global_step: 10' | awk -F 'loss: ' '{print $2}' | awk -F ',' '{print $1}'`
+    ips=-1
+    mem=-1
+    echo "result: loss=$loss ips=$ips mem=$mem"
+    echo "=========== $FUNCNAME run end ==========="
+}
+
 function llm_gpt_dygraph_auto_bs8_fp32_DP2() {
     echo "=========== $FUNCNAME run begin ==========="
     export PYTHONPATH=$root_path/:$PYTHONPATH