@@ -1359,109 +1359,108 @@ function llama_align_dygraph_dy2st_pir_auto_grad_merge_bs2_fp32_DP1-MP1-PP1() {
 function llama_align_dy2st_fthenb_and_vpp_auto_bs2_fp32_DP1-MP1-PP4() {
     echo "=========== $FUNCNAME run begin ==========="
     # Only A100 support this case.
-    if [ $IS_A100 -eq 0 ]; then
-        return
-    fi
-    export FLAGS_call_stack_level=3
-    export NVIDIA_TF32_OVERRIDE=0
-    export FLAGS_max_inplace_grad_add=3
-
-    task_name="llama_align_dy2st_fthenb_and_vpp_auto_bs2_fp32_DP1_MP1_PP4"
-    case_out_dir="output/$task_name"
-    case_log_dir="output/$task_name""_log"
-    loss1=0
-    loss2=0
-    use_pir=1
+    if [ $IS_A100 -ne 0 ]; then
+        export FLAGS_call_stack_level=3
+        export NVIDIA_TF32_OVERRIDE=0
+        export FLAGS_max_inplace_grad_add=3

-    max_step=10
-    to_static=1
-
-    for pp_mode in "FThenB" "VPP"; do
-        export FLAGS_enable_pir_api=${use_pir}
-        export FLAGS_enable_pir_in_executor=${use_pir}
-        rm -rf $case_out_dir
-        rm -rf $case_log_dir
-        rm -rf ${log_path}/$FUNCNAME
-        if [ "$pp_mode" == "FThenB" ]; then
-            vpp_degree=1
-        else
-            vpp_degree=2
-        fi
+        task_name="llama_align_dy2st_fthenb_and_vpp_auto_bs2_fp32_DP1_MP1_PP4"
+        case_out_dir="output/$task_name"
+        case_log_dir="output/$task_name""_log"
+        loss1=0
+        loss2=0
+        use_pir=1
+
+        max_step=10
+        to_static=1
+
+        for pp_mode in "FThenB" "VPP"; do
+            export FLAGS_enable_pir_api=${use_pir}
+            export FLAGS_enable_pir_in_executor=${use_pir}
+            rm -rf $case_out_dir
+            rm -rf $case_log_dir
+            rm -rf ${log_path}/$FUNCNAME
+            if [ "$pp_mode" == "FThenB" ]; then
+                vpp_degree=1
+            else
+                vpp_degree=2
+            fi

-        python -u -m paddle.distributed.launch \
-            --gpus "0,1,2,3" \
-            --log_dir $case_log_dir \
-            run_pretrain_auto.py \
-            --model_type "llama" \
-            --model_name_or_path "facebook/llama-7b" \
-            --tokenizer_name_or_path "facebook/llama-7b" \
-            --input_dir "./data" \
-            --output_dir $case_out_dir \
-            --split 949,50,1 \
-            --weight_decay 0.01 \
-            --warmup_ratio 0.01 \
-            --warmup_steps 30 \
-            --max_grad_norm 0.0 \
-            --learning_rate 3e-05 \
-            --min_learning_rate 3e-06 \
-            --max_steps $max_step \
-            --logging_steps 1 \
-            --eval_steps 1000 \
-            --save_steps 50000 \
-            --continue_training 0 \
-            --do_train true \
-            --do_eval false \
-            --do_predict false \
-            --disable_tqdm true \
-            --skip_profile_timer true \
-            --save_total_limit 2 \
-            --device gpu \
-            --disable_tqdm true \
-            --dataloader_num_workers 1 \
-            --distributed_dataloader 0 \
-            --enable_auto_parallel 1 \
-            --per_device_train_batch_size 1 \
-            --gradient_accumulation_steps 4 \
-            --per_device_eval_batch_size 2 \
-            --recompute false \
-            --recompute_use_reentrant true \
-            --recompute_granularity full \
-            --fp16 0 \
-            --fp16_opt_level "O2" \
-            --fuse_attention_ffn true \
-            --fuse_attention_qkv true \
-            --fuse_sequence_parallel_allreduce false \
-            --use_flash_attention 0 \
-            --use_fused_rope false \
-            --use_fused_rms_norm 0 \
-            --max_seq_length 2048 \
-            --hidden_size 1024 \
-            --sep_parallel_degree 1 \
-            --sequence_parallel false \
-            --pipeline_parallel_degree 4 \
-            --sharding_parallel_degree 1 \
-            --tensor_parallel_degree 1 \
-            --sharding "" \
-            --to_static ${to_static} \
-            --num_hidden_layers 8 \
-            --data_parallel_config "gradient_sync_after_accumulate" \
-            --pipeline_schedule_mode $pp_mode \
-            --virtual_pp_degree $vpp_degree \
-            >> ${log_path}/$FUNCNAME 2>&1
-
-        loss=$(grep "global_step: 10," "$case_log_dir/workerlog.0" | grep -oP '(?<=loss: )\d+(\.\d+)?' | awk -F ',' '{print $1}')
-        if [ "$pp_mode" == "FThenB" ]; then
-            loss1=$loss
-        else
-            loss2=$loss
-        fi
-        echo "result: $pp_mode loss=$loss"
-    done
-    ips=-1
-    mem=-1
-    ips_base=-1
-    mem_base=-1
-    check_result $FUNCNAME ${loss1} ${loss2} ${ips_base} ${ips} ${mem_base} ${mem}
+            python -u -m paddle.distributed.launch \
+                --gpus "0,1,2,3" \
+                --log_dir $case_log_dir \
+                run_pretrain_auto.py \
+                --model_type "llama" \
+                --model_name_or_path "facebook/llama-7b" \
+                --tokenizer_name_or_path "facebook/llama-7b" \
+                --input_dir "./data" \
+                --output_dir $case_out_dir \
+                --split 949,50,1 \
+                --weight_decay 0.01 \
+                --warmup_ratio 0.01 \
+                --warmup_steps 30 \
+                --max_grad_norm 0.0 \
+                --learning_rate 3e-05 \
+                --min_learning_rate 3e-06 \
+                --max_steps $max_step \
+                --logging_steps 1 \
+                --eval_steps 1000 \
+                --save_steps 50000 \
+                --continue_training 0 \
+                --do_train true \
+                --do_eval false \
+                --do_predict false \
+                --disable_tqdm true \
+                --skip_profile_timer true \
+                --save_total_limit 2 \
+                --device gpu \
+                --disable_tqdm true \
+                --dataloader_num_workers 1 \
+                --distributed_dataloader 0 \
+                --enable_auto_parallel 1 \
+                --per_device_train_batch_size 1 \
+                --gradient_accumulation_steps 4 \
+                --per_device_eval_batch_size 2 \
+                --recompute false \
+                --recompute_use_reentrant true \
+                --recompute_granularity full \
+                --fp16 0 \
+                --fp16_opt_level "O2" \
+                --fuse_attention_ffn true \
+                --fuse_attention_qkv true \
+                --fuse_sequence_parallel_allreduce false \
+                --use_flash_attention 0 \
+                --use_fused_rope false \
+                --use_fused_rms_norm 0 \
+                --max_seq_length 2048 \
+                --hidden_size 1024 \
+                --sep_parallel_degree 1 \
+                --sequence_parallel false \
+                --pipeline_parallel_degree 4 \
+                --sharding_parallel_degree 1 \
+                --tensor_parallel_degree 1 \
+                --sharding "" \
+                --to_static ${to_static} \
+                --num_hidden_layers 8 \
+                --data_parallel_config "gradient_sync_after_accumulate" \
+                --pipeline_schedule_mode $pp_mode \
+                --virtual_pp_degree $vpp_degree \
+                >> ${log_path}/$FUNCNAME 2>&1
+
+            loss=$(grep "global_step: 10," "$case_log_dir/workerlog.0" | grep -oP '(?<=loss: )\d+(\.\d+)?' | awk -F ',' '{print $1}')
+            if [ "$pp_mode" == "FThenB" ]; then
+                loss1=$loss
+            else
+                loss2=$loss
+            fi
+            echo "result: $pp_mode loss=$loss"
+        done
+        ips=-1
+        mem=-1
+        ips_base=-1
+        mem_base=-1
+        check_result $FUNCNAME ${loss1} ${loss2} ${ips_base} ${ips} ${mem_base} ${mem}
+    fi
     echo "=========== $FUNCNAME run end ==========="
 }

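Net effect of the hunk: the early `return` taken on non-A100 machines is replaced by wrapping the whole body in the inverted check `if [ $IS_A100 -ne 0 ]`, so the function now has a single exit point and the trailing "run end" marker is printed even when the case is skipped. A minimal bash sketch of the two forms (standalone; the body echoes are placeholders and `IS_A100` is set elsewhere by the CI harness):

    IS_A100=0   # placeholder; the CI harness exports the real value

    guard_old() {
        echo "run begin"
        if [ $IS_A100 -eq 0 ]; then
            return              # skips everything below, including "run end"
        fi
        echo "body"             # placeholder for the real test body
        echo "run end"
    }

    guard_new() {
        echo "run begin"
        if [ $IS_A100 -ne 0 ]; then
            echo "body"         # placeholder for the real test body
        fi
        echo "run end"          # always printed, even when the body is skipped
    }

With `IS_A100=0`, `guard_old` prints only "run begin", while `guard_new` prints both the begin and end markers; with `IS_A100=1` the two behave identically.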
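For reference, both schedules' losses are read back from the step-10 trainer log line with a `grep` / `grep -oP` / `awk` pipeline before being handed to `check_result`. A minimal, self-contained bash sketch of that extraction (the sample `workerlog.0` line below is invented for illustration; real log content may differ):

    # Hypothetical log directory and log line, mimicking what workerlog.0 contains.
    case_log_dir=$(mktemp -d)
    echo "global_step: 10, loss: 9.2518, lr: 3e-05" > "$case_log_dir/workerlog.0"

    # 1) keep the line for global step 10,
    # 2) pull out the number following "loss: " (GNU grep -P lookbehind),
    # 3) drop anything after a comma, mirroring the awk step in the script.
    loss=$(grep "global_step: 10," "$case_log_dir/workerlog.0" \
        | grep -oP '(?<=loss: )\d+(\.\d+)?' \
        | awk -F ',' '{print $1}')

    echo "extracted loss: $loss"   # prints: extracted loss: 9.2518

In the test itself, the FThenB and VPP runs each produce one such value (`loss1` and `loss2`), and `check_result` presumably asserts that the two schedules reach the same loss.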