
Commit 876c3b1

fix
1 parent 75f5031 commit 876c3b1

File tree

1 file changed: 100 additions, 101 deletions

scripts/distribute/ci_case_auto.sh

Lines changed: 100 additions & 101 deletions
@@ -1359,109 +1359,108 @@ function llama_align_dygraph_dy2st_pir_auto_grad_merge_bs2_fp32_DP1-MP1-PP1() {
 function llama_align_dy2st_fthenb_and_vpp_auto_bs2_fp32_DP1-MP1-PP4() {
     echo "=========== $FUNCNAME run begin ==========="
     # Only A100 support this case.
-    if [ $IS_A100 -eq 0 ]; then
-        return
-    fi
-    export FLAGS_call_stack_level=3
-    export NVIDIA_TF32_OVERRIDE=0
-    export FLAGS_max_inplace_grad_add=3
-
-    task_name="llama_align_dy2st_fthenb_and_vpp_auto_bs2_fp32_DP1_MP1_PP4"
-    case_out_dir="output/$task_name"
-    case_log_dir="output/$task_name""_log"
-    loss1=0
-    loss2=0
-    use_pir=1
+    if [ $IS_A100 -ne 0 ]; then
+        export FLAGS_call_stack_level=3
+        export NVIDIA_TF32_OVERRIDE=0
+        export FLAGS_max_inplace_grad_add=3

-    max_step=10
-    to_static=1
-
-    for pp_mode in "1F1B" "VPP"; do
-        export FLAGS_enable_pir_api=${use_pir}
-        export FLAGS_enable_pir_in_executor=${use_pir}
-        rm -rf $case_out_dir
-        rm -rf $case_log_dir
-        rm -rf ${log_path}/$FUNCNAME
-        if [ "$pp_mode" == "FThenB" ]; then
-            vpp_degree=1
-        else
-            vpp_degree=2
-        fi
+        task_name="llama_align_dy2st_fthenb_and_vpp_auto_bs2_fp32_DP1_MP1_PP4"
+        case_out_dir="output/$task_name"
+        case_log_dir="output/$task_name""_log"
+        loss1=0
+        loss2=0
+        use_pir=1
+
+        max_step=10
+        to_static=1
+
+        for pp_mode in "1F1B" "VPP"; do
+            export FLAGS_enable_pir_api=${use_pir}
+            export FLAGS_enable_pir_in_executor=${use_pir}
+            rm -rf $case_out_dir
+            rm -rf $case_log_dir
+            rm -rf ${log_path}/$FUNCNAME
+            if [ "$pp_mode" == "FThenB" ]; then
+                vpp_degree=1
+            else
+                vpp_degree=2
+            fi

-        python -u -m paddle.distributed.launch \
-            --gpus "0,1,2,3" \
-            --log_dir $case_log_dir \
-            run_pretrain_auto.py \
-            --model_type "llama" \
-            --model_name_or_path "facebook/llama-7b" \
-            --tokenizer_name_or_path "facebook/llama-7b" \
-            --input_dir "./data" \
-            --output_dir $case_out_dir \
-            --split 949,50,1 \
-            --weight_decay 0.01 \
-            --warmup_ratio 0.01 \
-            --warmup_steps 30 \
-            --max_grad_norm 0.0 \
-            --learning_rate 3e-05 \
-            --min_learning_rate 3e-06 \
-            --max_steps $max_step \
-            --logging_steps 1 \
-            --eval_steps 1000 \
-            --save_steps 50000 \
-            --continue_training 0 \
-            --do_train true \
-            --do_eval false \
-            --do_predict false \
-            --disable_tqdm true \
-            --skip_profile_timer true \
-            --save_total_limit 2 \
-            --device gpu \
-            --disable_tqdm true \
-            --dataloader_num_workers 1 \
-            --distributed_dataloader 0 \
-            --enable_auto_parallel 1 \
-            --per_device_train_batch_size 1 \
-            --gradient_accumulation_steps 4 \
-            --per_device_eval_batch_size 2 \
-            --recompute false \
-            --recompute_use_reentrant true \
-            --recompute_granularity full \
-            --fp16 0 \
-            --fp16_opt_level "O2" \
-            --fuse_attention_ffn true \
-            --fuse_attention_qkv true \
-            --fuse_sequence_parallel_allreduce false \
-            --use_flash_attention 0 \
-            --use_fused_rope false \
-            --use_fused_rms_norm 0 \
-            --max_seq_length 2048 \
-            --hidden_size 1024 \
-            --sep_parallel_degree 1 \
-            --sequence_parallel false \
-            --pipeline_parallel_degree 4 \
-            --sharding_parallel_degree 1 \
-            --tensor_parallel_degree 1 \
-            --sharding "" \
-            --to_static ${to_static} \
-            --num_hidden_layers 8 \
-            --data_parallel_config "gradient_sync_after_accumulate" \
-            --pipeline_schedule_mode $pp_mode \
-            --virtual_pp_degree $vpp_degree \
-            >>${log_path}/$FUNCNAME 2>&1
-
-        loss=$(grep "global_step: 10," "$case_log_dir/workerlog.0" | grep -oP '(?<=loss: )\d+(\.\d+)?' | awk -F ',' '{print $1}')
-        if [ "$pp_mode" == "FThenB" ]; then
-            loss1=loss
-        else
-            loss2=loss
-        fi
-        echo "result: $pp_mode loss=$loss"
-    done
-    ips=-1
-    mem=-1
-    ips_base=-1
-    mem_base=-1
-    check_result $FUNCNAME ${loss1} ${loss2} ${ips_base} ${ips} ${mem_base} ${mem}
+            python -u -m paddle.distributed.launch \
+                --gpus "0,1,2,3" \
+                --log_dir $case_log_dir \
+                run_pretrain_auto.py \
+                --model_type "llama" \
+                --model_name_or_path "facebook/llama-7b" \
+                --tokenizer_name_or_path "facebook/llama-7b" \
+                --input_dir "./data" \
+                --output_dir $case_out_dir \
+                --split 949,50,1 \
+                --weight_decay 0.01 \
+                --warmup_ratio 0.01 \
+                --warmup_steps 30 \
+                --max_grad_norm 0.0 \
+                --learning_rate 3e-05 \
+                --min_learning_rate 3e-06 \
+                --max_steps $max_step \
+                --logging_steps 1 \
+                --eval_steps 1000 \
+                --save_steps 50000 \
+                --continue_training 0 \
+                --do_train true \
+                --do_eval false \
+                --do_predict false \
+                --disable_tqdm true \
+                --skip_profile_timer true \
+                --save_total_limit 2 \
+                --device gpu \
+                --disable_tqdm true \
+                --dataloader_num_workers 1 \
+                --distributed_dataloader 0 \
+                --enable_auto_parallel 1 \
+                --per_device_train_batch_size 1 \
+                --gradient_accumulation_steps 4 \
+                --per_device_eval_batch_size 2 \
+                --recompute false \
+                --recompute_use_reentrant true \
+                --recompute_granularity full \
+                --fp16 0 \
+                --fp16_opt_level "O2" \
+                --fuse_attention_ffn true \
+                --fuse_attention_qkv true \
+                --fuse_sequence_parallel_allreduce false \
+                --use_flash_attention 0 \
+                --use_fused_rope false \
+                --use_fused_rms_norm 0 \
+                --max_seq_length 2048 \
+                --hidden_size 1024 \
+                --sep_parallel_degree 1 \
+                --sequence_parallel false \
+                --pipeline_parallel_degree 4 \
+                --sharding_parallel_degree 1 \
+                --tensor_parallel_degree 1 \
+                --sharding "" \
+                --to_static ${to_static} \
+                --num_hidden_layers 8 \
+                --data_parallel_config "gradient_sync_after_accumulate" \
+                --pipeline_schedule_mode $pp_mode \
+                --virtual_pp_degree $vpp_degree \
+                >>${log_path}/$FUNCNAME 2>&1
+
+            loss=$(grep "global_step: 10," "$case_log_dir/workerlog.0" | grep -oP '(?<=loss: )\d+(\.\d+)?' | awk -F ',' '{print $1}')
+            if [ "$pp_mode" == "FThenB" ]; then
+                loss1=loss
+            else
+                loss2=loss
+            fi
+            echo "result: $pp_mode loss=$loss"
+        done
+        ips=-1
+        mem=-1
+        ips_base=-1
+        mem_base=-1
+        check_result $FUNCNAME ${loss1} ${loss2} ${ips_base} ${ips} ${mem_base} ${mem}
+    fi
     echo "=========== $FUNCNAME run end ==========="
 }

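What the commit changes, in brief: the old code returned early when the machine was not an A100, which also skipped the closing "run end" echo; the new code instead wraps the whole case body in `if [ $IS_A100 -ne 0 ]`, so both the begin and end markers always print while the case body still runs only on A100. A minimal, self-contained sketch of that control-flow pattern (the function names and placeholder body below are illustrative, not taken from the script):

#!/usr/bin/env bash
# Illustrative only: shows the guard restructuring applied in this commit.

# Before: an early return skips everything below it, including the final echo.
run_case_old() {
    echo "run begin"
    if [ "$IS_A100" -eq 0 ]; then
        return
    fi
    echo "(A100-only case body)"
    echo "run end"   # never reached on non-A100 machines
}

# After: the body is wrapped in the guard, so begin/end markers always print.
run_case_new() {
    echo "run begin"
    if [ "$IS_A100" -ne 0 ]; then
        echo "(A100-only case body)"
    fi
    echo "run end"   # always reached
}

IS_A100=0
run_case_old   # prints: run begin
run_case_new   # prints: run begin / run end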