@@ -94,7 +94,6 @@ function llama_case_list_auto() {
     llama_dy2st_auto_bs4_bf16_DP1-MP1-PP4-SD2-VPP3_split_bw
     llama_dy2st_auto_bs4_bf16_DP1-MP1-PP4-SD2
     llama_align_dygraph_dy2st_auto_bs2_bf16_DP2-MP1-PP1
-    llama_pir_auto_recompute_DP2_MP2_PP2
     llama_pir_auto_fuse_ffn_attention_qkv_MP2
     llama_convert_hybrid_ckpt_to_auto_parallel_bs2_fp32_DP2-MP1-PP1
     llama_align_dygraph_dy2st_pir_auto_bs2_bf16_DP2-MP2-PP1-SP
@@ -668,180 +667,83 @@ function llama_align_dygraph_dy2st_pir_auto_bs2_bf16_DP2-MP2-PP1-SP() {
     case_log_dir="output/$task_name""_log"

     for to_static in "0" "1"; do
-        rm -rf $case_out_dir
-        rm -rf $case_log_dir
-        python -u -m paddle.distributed.launch \
-            --gpus "0,1,2,3" \
-            --log_dir $case_log_dir \
-            run_pretrain_auto.py \
-            --model_type "llama" \
-            --model_name_or_path "facebook/llama-7b" \
-            --tokenizer_name_or_path "facebook/llama-7b" \
-            --input_dir "./data" \
-            --output_dir $case_out_dir \
-            --split 949,50,1 \
-            --weight_decay 0.01 \
-            --warmup_ratio 0.01 \
-            --max_grad_norm 0.0 \
-            --learning_rate 3e-05 \
-            --min_learning_rate 3e-06 \
-            --max_steps 10 \
-            --logging_steps 10 \
-            --eval_steps 1000 \
-            --save_steps 50000 \
-            --continue_training 0 \
-            --do_train true \
-            --do_eval false \
-            --do_predict false \
-            --disable_tqdm true \
-            --skip_profile_timer true \
-            --save_total_limit 2 \
-            --device gpu \
-            --disable_tqdm true \
-            --dataloader_num_workers 1 \
-            --enable_auto_parallel 1 \
-            --per_device_train_batch_size 1 \
-            --gradient_accumulation_steps 1 \
-            --per_device_eval_batch_size 2 \
-            --recompute false \
-            --bf16 1 \
-            --fp16_opt_level "O2" \
-            --amp_custom_black_list "reduce_sum" "c_softmax_with_cross_entropy" \
-            --amp_custom_white_list "lookup_table" "lookup_table_v2" \
-            --amp_master_grad 1 \
-            --fuse_attention_ffn false \
-            --fuse_attention_qkv false \
-            --fuse_sequence_parallel_allreduce false \
-            --use_flash_attention 0 \
-            --use_fused_rope false \
-            --use_fused_rms_norm 0 \
-            --max_seq_length 4096 \
-            --sep_parallel_degree 1 \
-            --sequence_parallel true \
-            --pipeline_parallel_degree 1 \
-            --sharding_parallel_degree 1 \
-            --tensor_parallel_degree 2 \
-            --virtual_pp_degree 1 \
-            --sharding "" \
-            --to_static ${to_static} \
-            --num_hidden_layers 4 \
-            >> ${log_path}/$FUNCNAME 2>&1
-        loss=`cat $case_log_dir/workerlog.0 | grep 'global_step: 10' | awk -F 'loss: ' '{print $2}' | awk -F ',' '{print $1}'`
-        loss_md5=`cat $case_log_dir/workerlog.0 | grep 'global_step: 10' | awk -F 'loss_md5: ' '{print $2}' | awk -F ',' '{print $1}'`
-        ips=-1
-        mem=-1
-        echo "result: to_static=$to_static loss=$loss ips=$ips mem=$mem"
-        loss_base=9.16783295
-        loss_md5_base=8ea72495fba4e1b9ba004b4431e27218
-        if [ $IS_A100 -ne 0 ]; then
-            loss_base=9.38009949
-        fi
-        ips_base=-1
-        mem_base=-1
-        check_result $FUNCNAME ${loss_base} ${loss} ${ips_base} ${ips} ${mem_base} ${mem}
-        # check_md5_result $FUNCNAME ${loss_md5_base} ${loss_md5}
-    done
-    echo "=========== $FUNCNAME run end ==========="
-}
-
-function llama_pir_auto_recompute_DP2_MP2_PP2(){
-    echo "=========== $FUNCNAME run begin ==========="
-    export PYTHONPATH=$root_path/:$PYTHONPATH
-    export PYTHONPATH=/paddle/Paddle/build_gpu/python/:$PYTHONPATH
-    export FLAGS_call_stack_level=3
-    export FLAGS_enable_pir_api=1
-    export FLAGS_dynamic_static_unified_comm=1
-    export FLAGS_enable_auto_parallel_align_mode=1
-
-    export NVIDIA_TF32_OVERRIDE=0
-    export FLAGS_cudnn_deterministic=1
-    export FLAGS_embedding_deterministic=1
-
-    task_name="llama_pir_auto_recompute_DP2_MP2_PP2"
-    case_out_dir="output/$task_name"
-    case_log_dir="output/$task_name""_log"
-
-    loss1=0
-    loss2=0
-
-    for use_recompute in "0" "1"; do
-        rm -rf $case_out_dir
-        rm -rf $case_log_dir
-        python -u -m paddle.distributed.launch \
-            --gpus "0,1,2,3,4,5,6,7" \
-            --log_dir $case_log_dir \
-            run_pretrain_auto.py \
-            --model_type "llama" \
-            --model_name_or_path "facebook/llama-7b" \
-            --tokenizer_name_or_path "facebook/llama-7b" \
-            --input_dir "./data" \
-            --output_dir $case_out_dir \
-            --split 949,50,1 \
-            --to_static true \
-            --pipeline_parallel_degree 2 \
-            --tensor_parallel_degree 2 \
-            --virtual_pp_degree 1 \
-            --pipeline_schedule_mode "VPP" \
-            --weight_decay 0.01 \
-            --warmup_ratio 0.01 \
-            --max_grad_norm 0.0 \
-            --learning_rate 3e-05 \
-            --min_learning_rate 3e-06 \
-            --max_steps 10 \
-            --logging_steps 10 \
-            --eval_steps 10000 \
-            --save_steps 1000 \
-            --continue_training 0 \
-            --do_train true \
-            --do_eval false \
-            --do_predict false \
-            --disable_tqdm true \
-            --save_total_limit 2 \
-            --device gpu \
-            --dataloader_num_workers 4 \
-            --distributed_dataloader 0 \
-            --enable_auto_parallel 1 \
-            --per_device_train_batch_size 1 \
-            --gradient_accumulation_steps 1 \
-            --per_device_eval_batch_size 1 \
-            --recompute ${use_recompute} \
-            --recompute_use_reentrant true \
-            --recompute_granularity full \
-            --pp_recompute_interval 0 \
-            --bf16 true \
-            --fp16_opt_level "O2" \
-            --amp_master_grad true \
-            --fuse_attention_ffn true \
-            --fuse_attention_qkv true \
-            --use_flash_attention false \
-            --use_fused_rope true \
-            --use_fused_rms_norm false \
-            --max_seq_length 4096 \
-            --sequence_parallel false \
-            --sharding "stage1" \
-            --data_parallel_config "enable_allreduce_avg_in_gradinent_scale gradient_sync_after_accumulate" \
-            --sharding_parallel_config "enable_stage1_overlap" \
-            --tensor_parallel_config "enable_mp_async_allreduce" \
-            --pipeline_parallel_config "enable_send_recv_overlap" \
-            --auto_parallel_resume_form_hybrid_parallel true \
-            --num_hidden_layers 4 \
-            >> ${log_path}/$FUNCNAME 2>&1
+        for use_recompute in "1" "0"; do
+        rm -rf $case_out_dir
+        rm -rf $case_log_dir
+        python -u -m paddle.distributed.launch \
+            --gpus "0,1,2,3" \
+            --log_dir $case_log_dir \
+            run_pretrain_auto.py \
+            --model_type "llama" \
+            --model_name_or_path "facebook/llama-7b" \
+            --tokenizer_name_or_path "facebook/llama-7b" \
+            --input_dir "./data" \
+            --output_dir $case_out_dir \
+            --split 949,50,1 \
+            --weight_decay 0.01 \
+            --warmup_ratio 0.01 \
+            --max_grad_norm 0.0 \
+            --learning_rate 3e-05 \
+            --min_learning_rate 3e-06 \
+            --max_steps 10 \
+            --logging_steps 10 \
+            --eval_steps 1000 \
+            --save_steps 50000 \
+            --continue_training 0 \
+            --do_train true \
+            --do_eval false \
+            --do_predict false \
+            --disable_tqdm true \
+            --skip_profile_timer true \
+            --save_total_limit 2 \
+            --device gpu \
+            --disable_tqdm true \
+            --dataloader_num_workers 1 \
+            --enable_auto_parallel 1 \
+            --per_device_train_batch_size 1 \
+            --gradient_accumulation_steps 1 \
+            --per_device_eval_batch_size 2 \
+            --recompute ${use_recompute} \
+            --bf16 1 \
+            --fp16_opt_level "O2" \
+            --amp_custom_black_list "reduce_sum" "c_softmax_with_cross_entropy" \
+            --amp_custom_white_list "lookup_table" "lookup_table_v2" \
+            --amp_master_grad 1 \
+            --fuse_attention_ffn false \
+            --fuse_attention_qkv false \
+            --fuse_sequence_parallel_allreduce false \
+            --use_flash_attention 0 \
+            --use_fused_rope false \
+            --use_fused_rms_norm 0 \
+            --max_seq_length 4096 \
+            --sep_parallel_degree 1 \
+            --sequence_parallel true \
+            --pipeline_parallel_degree 1 \
+            --sharding_parallel_degree 1 \
+            --tensor_parallel_degree 2 \
+            --virtual_pp_degree 1 \
+            --sharding "" \
+            --to_static ${to_static} \
+            --num_hidden_layers 4 \
+            >> ${log_path}/$FUNCNAME 2>&1
         loss=`cat $case_log_dir/workerlog.0 | grep 'global_step: 10' | awk -F 'loss: ' '{print $2}' | awk -F ',' '{print $1}'`
         loss_md5=`cat $case_log_dir/workerlog.0 | grep 'global_step: 10' | awk -F 'loss_md5: ' '{print $2}' | awk -F ',' '{print $1}'`
         ips=`cat $case_log_dir/workerlog.0 | grep 'global_step: 10' | awk -F 'interval_tokens_per_second_per_device: ' '{print $2}' | awk -F ',' '{print $1}'`
         mem=`cat $case_log_dir/workerlog.0 | grep 'global_step: 10' | awk -F 'max_memory_reserved: ' '{print $2}' | awk -F ',' '{print $1}'`
-        echo "result: loss=$loss loss_md5=$loss_md5 ips=$ips mem=$mem"
-        if [ $use_recompute -eq 0 ]; then
-            loss1=($loss)
-        else
-            loss2=($loss)
+        echo "result: to_static=$to_static loss=$loss ips=$ips mem=$mem"
+        loss_base=9.16783295
+        loss_md5_base=8ea72495fba4e1b9ba004b4431e27218
+        if [ $IS_A100 -ne 0 ]; then
+            loss_base=9.38009949
         fi
+        ips=-1
+        mem=-1
+        ips_base=-1
+        mem_base=-1
+        check_result $FUNCNAME ${loss_base} ${loss} ${ips_base} ${ips} ${mem_base} ${mem}
+        # check_md5_result $FUNCNAME ${loss_md5_base} ${loss_md5}
+        done
     done
-    ips=-1
-    mem=-1
-    ips_base=-1
-    mem_base=-1
-    check_result $FUNCNAME ${loss1} ${loss2} ${ips_base} ${ips} ${mem_base} ${mem}
     echo "=========== $FUNCNAME run end ==========="
 }

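Both the removed case and the surviving one grade a run the same way: grep the step-10 summary line out of workerlog.0 and cut the wanted metric out with awk before handing it to check_result. Below is a minimal standalone sketch of that parsing idiom; the parse_metric helper, the /tmp path, and the sample log line are illustrative only, not part of the CI script.

#!/bin/bash
# Sketch of the log-parsing idiom used above. parse_metric and the sample
# log line are made up for illustration; the real script reads the
# workerlog.0 produced by paddle.distributed.launch.
parse_metric() {
    local file=$1 key=$2
    # Isolate the step-10 summary line, split on "<key>: ", then keep
    # everything before the first comma -- the same pipeline as the CI script.
    grep 'global_step: 10' "$file" | awk -F "${key}: " '{print $2}' | awk -F ',' '{print $1}'
}

# Example usage with a fabricated log line:
echo "global_step: 10, loss: 9.16783295, interval_tokens_per_second_per_device: 1234.5, max_memory_reserved: 2048," > /tmp/workerlog.0
loss=$(parse_metric /tmp/workerlog.0 'loss')
ips=$(parse_metric /tmp/workerlog.0 'interval_tokens_per_second_per_device')
echo "loss=${loss} ips=${ips}"    # prints: loss=9.16783295 ips=1234.5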