Commit dcea04c

merge test
1 parent d203f2a commit dcea04c

1 file changed: +71 -169 lines changed

scripts/distribute/ci_case_auto.sh

Lines changed: 71 additions & 169 deletions
@@ -94,7 +94,6 @@ function llama_case_list_auto() {
     llama_dy2st_auto_bs4_bf16_DP1-MP1-PP4-SD2-VPP3_split_bw
     llama_dy2st_auto_bs4_bf16_DP1-MP1-PP4-SD2
     llama_align_dygraph_dy2st_auto_bs2_bf16_DP2-MP1-PP1
-    llama_pir_auto_recompute_DP2_MP2_PP2
     llama_pir_auto_fuse_ffn_attention_qkv_MP2
     llama_convert_hybrid_ckpt_to_auto_parallel_bs2_fp32_DP2-MP1-PP1
     llama_align_dygraph_dy2st_pir_auto_bs2_bf16_DP2-MP2-PP1-SP
@@ -668,180 +667,83 @@ function llama_align_dygraph_dy2st_pir_auto_bs2_bf16_DP2-MP2-PP1-SP() {
     case_log_dir="output/$task_name""_log"

     for to_static in "0" "1"; do
-        rm -rf $case_out_dir
-        rm -rf $case_log_dir
-        python -u -m paddle.distributed.launch \
-            --gpus "0,1,2,3" \
-            --log_dir $case_log_dir \
-            run_pretrain_auto.py \
-            --model_type "llama" \
-            --model_name_or_path "facebook/llama-7b" \
-            --tokenizer_name_or_path "facebook/llama-7b" \
-            --input_dir "./data" \
-            --output_dir $case_out_dir \
-            --split 949,50,1 \
-            --weight_decay 0.01 \
-            --warmup_ratio 0.01 \
-            --max_grad_norm 0.0 \
-            --learning_rate 3e-05 \
-            --min_learning_rate 3e-06 \
-            --max_steps 10 \
-            --logging_steps 10 \
-            --eval_steps 1000 \
-            --save_steps 50000 \
-            --continue_training 0 \
-            --do_train true \
-            --do_eval false \
-            --do_predict false \
-            --disable_tqdm true \
-            --skip_profile_timer true \
-            --save_total_limit 2 \
-            --device gpu \
-            --disable_tqdm true \
-            --dataloader_num_workers 1 \
-            --enable_auto_parallel 1 \
-            --per_device_train_batch_size 1 \
-            --gradient_accumulation_steps 1 \
-            --per_device_eval_batch_size 2 \
-            --recompute false \
-            --bf16 1\
-            --fp16_opt_level "O2" \
-            --amp_custom_black_list "reduce_sum" "c_softmax_with_cross_entropy" \
-            --amp_custom_white_list "lookup_table" "lookup_table_v2" \
-            --amp_master_grad 1 \
-            --fuse_attention_ffn false \
-            --fuse_attention_qkv false \
-            --fuse_sequence_parallel_allreduce false \
-            --use_flash_attention 0 \
-            --use_fused_rope false \
-            --use_fused_rms_norm 0 \
-            --max_seq_length 4096 \
-            --sep_parallel_degree 1 \
-            --sequence_parallel true \
-            --pipeline_parallel_degree 1 \
-            --sharding_parallel_degree 1 \
-            --tensor_parallel_degree 2 \
-            --virtual_pp_degree 1 \
-            --sharding "" \
-            --to_static ${to_static} \
-            --num_hidden_layers 4 \
-            >>${log_path}/$FUNCNAME 2>&1
-        loss=`cat $case_log_dir/workerlog.0 | grep 'global_step: 10' | awk -F 'loss: ' '{print $2}' | awk -F ',' '{print $1}'`
-        loss_md5=`cat $case_log_dir/workerlog.0 | grep 'global_step: 10' | awk -F 'loss_md5: ' '{print $2}' | awk -F ',' '{print $1}'`
-        ips=-1
-        mem=-1
-        echo "result: to_static=$to_static loss=$loss ips=$ips mem=$mem"
-        loss_base=9.16783295
-        loss_md5_base=8ea72495fba4e1b9ba004b4431e27218
-        if [ $IS_A100 -ne 0 ];then
-            loss_base=9.38009949
-        fi
-        ips_base=-1
-        mem_base=-1
-        check_result $FUNCNAME ${loss_base} ${loss} ${ips_base} ${ips} ${mem_base} ${mem}
-        # check_md5_result $FUNCNAME ${loss_md5_base} ${loss_md5}
-    done
-    echo "=========== $FUNCNAME run end ==========="
-}
-
-function llama_pir_auto_recompute_DP2_MP2_PP2(){
-    echo "=========== $FUNCNAME run begin ==========="
-    export PYTHONPATH=$root_path/:$PYTHONPATH
-    export PYTHONPATH=/paddle/Paddle/build_gpu/python/:$PYTHONPATH
-    export FLAGS_call_stack_level=3
-    export FLAGS_enable_pir_api=1
-    export FLAGS_dynamic_static_unified_comm=1
-    export FLAGS_enable_auto_parallel_align_mode=1
-
-    export NVIDIA_TF32_OVERRIDE=0
-    export FLAGS_cudnn_deterministic=1
-    export FLAGS_embedding_deterministic=1
-
-    task_name="llama_pir_auto_recompute_DP2_MP2_PP2"
-    case_out_dir="output/$task_name"
-    case_log_dir="output/$task_name""_log"
-
-    loss1=0
-    loss2=0
-
-    for use_recompute in "0" "1"; do
-        rm -rf $case_out_dir
-        rm -rf $case_log_dir
-        python -u -m paddle.distributed.launch \
-            --gpus "0,1,2,3,4,5,6,7" \
-            --log_dir $case_log_dir \
-            run_pretrain_auto.py \
-            --model_type "llama" \
-            --model_name_or_path "facebook/llama-7b" \
-            --tokenizer_name_or_path "facebook/llama-7b" \
-            --input_dir "./data" \
-            --output_dir $case_out_dir \
-            --split 949,50,1 \
-            --to_static true \
-            --pipeline_parallel_degree 2 \
-            --tensor_parallel_degree 2 \
-            --virtual_pp_degree 1 \
-            --pipeline_schedule_mode "VPP" \
-            --weight_decay 0.01 \
-            --warmup_ratio 0.01 \
-            --max_grad_norm 0.0 \
-            --learning_rate 3e-05 \
-            --min_learning_rate 3e-06 \
-            --max_steps 10 \
-            --logging_steps 10 \
-            --eval_steps 10000 \
-            --save_steps 1000 \
-            --continue_training 0 \
-            --do_train true \
-            --do_eval false \
-            --do_predict false \
-            --disable_tqdm true \
-            --save_total_limit 2 \
-            --device gpu \
-            --dataloader_num_workers 4 \
-            --distributed_dataloader 0 \
-            --enable_auto_parallel 1 \
-            --per_device_train_batch_size 1 \
-            --gradient_accumulation_steps 1 \
-            --per_device_eval_batch_size 1 \
-            --recompute ${use_recompute} \
-            --recompute_use_reentrant true \
-            --recompute_granularity full \
-            --pp_recompute_interval 0 \
-            --bf16 true \
-            --fp16_opt_level "O2" \
-            --amp_master_grad true \
-            --fuse_attention_ffn true \
-            --fuse_attention_qkv true \
-            --use_flash_attention false \
-            --use_fused_rope true \
-            --use_fused_rms_norm false \
-            --max_seq_length 4096 \
-            --sequence_parallel false \
-            --sharding "stage1" \
-            --data_parallel_config "enable_allreduce_avg_in_gradinent_scale gradient_sync_after_accumulate " \
-            --sharding_parallel_config "enable_stage1_overlap" \
-            --tensor_parallel_config "enable_mp_async_allreduce" \
-            --pipeline_parallel_config "enable_send_recv_overlap" \
-            --auto_parallel_resume_form_hybrid_parallel true \
-            --num_hidden_layers 4 \
-            >>${log_path}/$FUNCNAME 2>&1
+        for use_recompute in "1" "0"; do
+            rm -rf $case_out_dir
+            rm -rf $case_log_dir
+            python -u -m paddle.distributed.launch \
+                --gpus "0,1,2,3" \
+                --log_dir $case_log_dir \
+                run_pretrain_auto.py \
+                --model_type "llama" \
+                --model_name_or_path "facebook/llama-7b" \
+                --tokenizer_name_or_path "facebook/llama-7b" \
+                --input_dir "./data" \
+                --output_dir $case_out_dir \
+                --split 949,50,1 \
+                --weight_decay 0.01 \
+                --warmup_ratio 0.01 \
+                --max_grad_norm 0.0 \
+                --learning_rate 3e-05 \
+                --min_learning_rate 3e-06 \
+                --max_steps 10 \
+                --logging_steps 10 \
+                --eval_steps 1000 \
+                --save_steps 50000 \
+                --continue_training 0 \
+                --do_train true \
+                --do_eval false \
+                --do_predict false \
+                --disable_tqdm true \
+                --skip_profile_timer true \
+                --save_total_limit 2 \
+                --device gpu \
+                --disable_tqdm true \
+                --dataloader_num_workers 1 \
+                --enable_auto_parallel 1 \
+                --per_device_train_batch_size 1 \
+                --gradient_accumulation_steps 1 \
+                --per_device_eval_batch_size 2 \
+                --recompute ${use_recompute} \
+                --bf16 1\
+                --fp16_opt_level "O2" \
+                --amp_custom_black_list "reduce_sum" "c_softmax_with_cross_entropy" \
+                --amp_custom_white_list "lookup_table" "lookup_table_v2" \
+                --amp_master_grad 1 \
+                --fuse_attention_ffn false \
+                --fuse_attention_qkv false \
+                --fuse_sequence_parallel_allreduce false \
+                --use_flash_attention 0 \
+                --use_fused_rope false \
+                --use_fused_rms_norm 0 \
+                --max_seq_length 4096 \
+                --sep_parallel_degree 1 \
+                --sequence_parallel true \
+                --pipeline_parallel_degree 1 \
+                --sharding_parallel_degree 1 \
+                --tensor_parallel_degree 2 \
+                --virtual_pp_degree 1 \
+                --sharding "" \
+                --to_static ${to_static} \
+                --num_hidden_layers 4 \
+                >>${log_path}/$FUNCNAME 2>&1
             loss=`cat $case_log_dir/workerlog.0 | grep 'global_step: 10' | awk -F 'loss: ' '{print $2}' | awk -F ',' '{print $1}'`
             loss_md5=`cat $case_log_dir/workerlog.0 | grep 'global_step: 10' | awk -F 'loss_md5: ' '{print $2}' | awk -F ',' '{print $1}'`
             ips=`cat $case_log_dir/workerlog.0 | grep 'global_step: 10' | awk -F 'interval_tokens_per_second_per_device: ' '{print $2}' | awk -F ',' '{print $1}'`
             mem=`cat $case_log_dir/workerlog.0 | grep 'global_step: 10' | awk -F 'max_memory_reserved: ' '{print $2}' | awk -F ',' '{print $1}'`
-        echo "result: loss=$loss loss_md5=$loss_md5 ips=$ips mem=$mem"
-        if [ $use_recompute -eq 0 ];then
-            loss1=($loss)
-        else
-            loss2=($loss)
+            echo "result: to_static=$to_static loss=$loss ips=$ips mem=$mem"
+            loss_base=9.16783295
+            loss_md5_base=8ea72495fba4e1b9ba004b4431e27218
+            if [ $IS_A100 -ne 0 ];then
+                loss_base=9.38009949
             fi
+            ips=-1
+            mem=-1
+            ips_base=-1
+            mem_base=-1
+            check_result $FUNCNAME ${loss_base} ${loss} ${ips_base} ${ips} ${mem_base} ${mem}
+            # check_md5_result $FUNCNAME ${loss_md5_base} ${loss_md5}
+        done
     done
-    ips=-1
-    mem=-1
-    ips_base=-1
-    mem_base=-1
-    check_result $FUNCNAME ${loss1} ${loss2} ${ips_base} ${ips} ${mem_base} ${mem}
     echo "=========== $FUNCNAME run end ==========="
 }

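For reference, both the deleted standalone case and the merged loop above pull their metrics out of workerlog.0 with the same grep/awk pipeline before calling check_result. Below is a minimal, self-contained sketch of that extraction run against a fabricated sample log line; the exact workerlog.0 format is an assumption here, and extract_metric is a hypothetical helper that is not part of ci_case_auto.sh:

#!/usr/bin/env bash
# Sketch only: a fabricated sample of a workerlog.0 line at global_step 10.
sample='global_step: 10, loss: 9.16783295, loss_md5: 8ea72495fba4e1b9ba004b4431e27218, interval_tokens_per_second_per_device: 1234.56, max_memory_reserved: 20480'

# Hypothetical helper mirroring the inline pipeline used in the CI script:
# split the matching line on "<key>: ", then cut the value off at the next comma.
extract_metric() {
    echo "$sample" | grep 'global_step: 10' | awk -F "$1: " '{print $2}' | awk -F ',' '{print $1}'
}

loss=$(extract_metric "loss")                                  # -> 9.16783295
ips=$(extract_metric "interval_tokens_per_second_per_device")  # -> 1234.56
mem=$(extract_metric "max_memory_reserved")                    # -> 20480
echo "result: loss=$loss ips=$ips mem=$mem"

In the script itself the pipeline is inlined once per metric, and the parsed loss is then compared against the hard-coded loss_base via check_result.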