@@ -668,78 +668,82 @@ function llama_align_dygraph_dy2st_pir_auto_bs2_bf16_DP2-MP2-PP1-SP() {
     case_log_dir="output/$task_name""_log"

     for to_static in "0" "1"; do
-        rm -rf $case_out_dir
-        rm -rf $case_log_dir
-        python -u -m paddle.distributed.launch \
-            --gpus "0,1,2,3" \
-            --log_dir $case_log_dir \
-            run_pretrain_auto.py \
-            --model_type "llama" \
-            --model_name_or_path "facebook/llama-7b" \
-            --tokenizer_name_or_path "facebook/llama-7b" \
-            --input_dir "./data" \
-            --output_dir $case_out_dir \
-            --split 949,50,1 \
-            --weight_decay 0.01 \
-            --warmup_ratio 0.01 \
-            --max_grad_norm 0.0 \
-            --learning_rate 3e-05 \
-            --min_learning_rate 3e-06 \
-            --max_steps 10 \
-            --logging_steps 10 \
-            --eval_steps 1000 \
-            --save_steps 50000 \
-            --continue_training 0 \
-            --do_train true \
-            --do_eval false \
-            --do_predict false \
-            --disable_tqdm true \
-            --skip_profile_timer true \
-            --save_total_limit 2 \
-            --device gpu \
-            --disable_tqdm true \
-            --dataloader_num_workers 1 \
-            --enable_auto_parallel 1 \
-            --per_device_train_batch_size 1 \
-            --gradient_accumulation_steps 1 \
-            --per_device_eval_batch_size 2 \
-            --recompute false \
-            --bf16 1 \
-            --fp16_opt_level "O2" \
-            --amp_custom_black_list "reduce_sum" "c_softmax_with_cross_entropy" \
-            --amp_custom_white_list "lookup_table" "lookup_table_v2" \
-            --amp_master_grad 1 \
-            --fuse_attention_ffn false \
-            --fuse_attention_qkv false \
-            --fuse_sequence_parallel_allreduce false \
-            --use_flash_attention 0 \
-            --use_fused_rope false \
-            --use_fused_rms_norm 0 \
-            --max_seq_length 4096 \
-            --sep_parallel_degree 1 \
-            --sequence_parallel true \
-            --pipeline_parallel_degree 1 \
-            --sharding_parallel_degree 1 \
-            --tensor_parallel_degree 2 \
-            --virtual_pp_degree 1 \
-            --sharding "" \
-            --to_static ${to_static} \
-            --num_hidden_layers 4 \
-            >>${log_path}/$FUNCNAME 2>&1
-        loss=`cat $case_log_dir/workerlog.0 | grep 'global_step: 10' | awk -F 'loss: ' '{print $2}' | awk -F ',' '{print $1}'`
-        loss_md5=`cat $case_log_dir/workerlog.0 | grep 'global_step: 10' | awk -F 'loss_md5: ' '{print $2}' | awk -F ',' '{print $1}'`
-        ips=-1
-        mem=-1
-        echo "result: to_static=$to_static loss=$loss ips=$ips mem=$mem"
-        loss_base=9.16783295
-        loss_md5_base=8ea72495fba4e1b9ba004b4431e27218
-        if [ $IS_A100 -ne 0 ]; then
-            loss_base=9.38009949
-        fi
-        ips_base=-1
-        mem_base=-1
-        check_result $FUNCNAME ${loss_base} ${loss} ${ips_base} ${ips} ${mem_base} ${mem}
-        # check_md5_result $FUNCNAME ${loss_md5_base} ${loss_md5}
+        for use_recompute in "1" "0"; do
+            rm -rf $case_out_dir
+            rm -rf $case_log_dir
+            python -u -m paddle.distributed.launch \
+                --gpus "0,1,2,3" \
+                --log_dir $case_log_dir \
+                run_pretrain_auto.py \
+                --model_type "llama" \
+                --model_name_or_path "facebook/llama-7b" \
+                --tokenizer_name_or_path "facebook/llama-7b" \
+                --input_dir "./data" \
+                --output_dir $case_out_dir \
+                --split 949,50,1 \
+                --weight_decay 0.01 \
+                --warmup_ratio 0.01 \
+                --max_grad_norm 0.0 \
+                --learning_rate 3e-05 \
+                --min_learning_rate 3e-06 \
+                --max_steps 10 \
+                --logging_steps 10 \
+                --eval_steps 1000 \
+                --save_steps 50000 \
+                --continue_training 0 \
+                --do_train true \
+                --do_eval false \
+                --do_predict false \
+                --disable_tqdm true \
+                --skip_profile_timer true \
+                --save_total_limit 2 \
+                --device gpu \
+                --disable_tqdm true \
+                --dataloader_num_workers 1 \
+                --enable_auto_parallel 1 \
+                --per_device_train_batch_size 1 \
+                --gradient_accumulation_steps 1 \
+                --per_device_eval_batch_size 2 \
+                --recompute ${use_recompute} \
+                --bf16 1 \
+                --fp16_opt_level "O2" \
+                --amp_custom_black_list "reduce_sum" "c_softmax_with_cross_entropy" \
+                --amp_custom_white_list "lookup_table" "lookup_table_v2" \
+                --amp_master_grad 1 \
+                --fuse_attention_ffn false \
+                --fuse_attention_qkv false \
+                --fuse_sequence_parallel_allreduce false \
+                --use_flash_attention 0 \
+                --use_fused_rope false \
+                --use_fused_rms_norm 0 \
+                --max_seq_length 4096 \
+                --sep_parallel_degree 1 \
+                --sequence_parallel true \
+                --pipeline_parallel_degree 1 \
+                --sharding_parallel_degree 1 \
+                --tensor_parallel_degree 2 \
+                --virtual_pp_degree 1 \
+                --sharding "" \
+                --to_static ${to_static} \
+                --num_hidden_layers 4 \
+                >>${log_path}/$FUNCNAME 2>&1
+            loss=`cat $case_log_dir/workerlog.0 | grep 'global_step: 10' | awk -F 'loss: ' '{print $2}' | awk -F ',' '{print $1}'`
+            loss_md5=`cat $case_log_dir/workerlog.0 | grep 'global_step: 10' | awk -F 'loss_md5: ' '{print $2}' | awk -F ',' '{print $1}'`
+            ips=`cat $case_log_dir/workerlog.0 | grep 'global_step: 10' | awk -F 'interval_tokens_per_second_per_device: ' '{print $2}' | awk -F ',' '{print $1}'`
+            mem=`cat $case_log_dir/workerlog.0 | grep 'global_step: 10' | awk -F 'max_memory_reserved: ' '{print $2}' | awk -F ',' '{print $1}'`
+            echo "result: to_static=$to_static loss=$loss ips=$ips mem=$mem"
+            loss_base=9.16783295
+            loss_md5_base=8ea72495fba4e1b9ba004b4431e27218
+            if [ $IS_A100 -ne 0 ]; then
+                loss_base=9.38009949
+            fi
+            ips=-1
+            mem=-1
+            ips_base=-1
+            mem_base=-1
+            check_result $FUNCNAME ${loss_base} ${loss} ${ips_base} ${ips} ${mem_base} ${mem}
+            # check_md5_result $FUNCNAME ${loss_md5_base} ${loss_md5}
+        done
     done
     echo "=========== $FUNCNAME run end ==========="
 }
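For reference, the added `ips` and `mem` lines reuse the grep/awk pattern the script already applies to `loss`: split the step-10 worker log line on the field name, then cut at the next comma. A minimal sketch of that parsing, run against a hypothetical sample line (only the field names are taken from the awk separators above; the exact log format emitted by run_pretrain_auto.py is assumed here):

    # Hypothetical step-10 log line; field names mirror the awk -F separators used in the diff.
    line='global_step: 10, loss: 9.16783295, interval_tokens_per_second_per_device: 1234.5, max_memory_reserved: 10240, ...'
    echo "$line" > /tmp/workerlog.0
    # Same extraction as the new ips/mem lines: split on the field name, keep text up to the next comma.
    ips=`cat /tmp/workerlog.0 | grep 'global_step: 10' | awk -F 'interval_tokens_per_second_per_device: ' '{print $2}' | awk -F ',' '{print $1}'`
    mem=`cat /tmp/workerlog.0 | grep 'global_step: 10' | awk -F 'max_memory_reserved: ' '{print $2}' | awk -F ',' '{print $1}'`
    echo "ips=$ips mem=$mem"   # prints: ips=1234.5 mem=10240

Note that the committed code echoes the parsed `ips`/`mem` values but then resets both to -1 before calling check_result, so only the loss baseline is actually enforced.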