set -x
set -e
unset CUDA_VISIBLE_DEVICES
unset PADDLE_TRAINER_ENDPOINTS
unset DISTRIBUTED_TRAINER_ENDPOINTS

to_static=1 # whether to enable dynamic-to-static (dy2st) training
task_name="llama2_7B_sd8_dy2st_pir_fp16"

# to_static=0 # whether to enable dynamic-to-static (dy2st) training
# task_name="llama2_7B_sd8_dy_fp16"

rm -rf output/$task_name/
rm -rf "output/${task_name}_log"

export PYTHONPATH=/root/paddlejob/workspace/env_run/wangmingdong/Paddle/build/python:../:$PYTHONPATH:legacy/model_zoo/gpt-3/external_ops/

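# Determinism / precision-alignment flags: these are meant to keep the dynamic
# and dy2st runs numerically comparable (deterministic cuDNN and embedding
# kernels, TF32 disabled, flash-attention pinned to v1). nsight-systems is
# added to PATH only for the optional nsys profiling command further below.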
export FLAGS_cudnn_deterministic=1
export NVIDIA_TF32_OVERRIDE=0
export FLAGS_embedding_deterministic=1
export FLAGS_flash_attn_version=v1
export PATH=/opt/nvidia/nsight-systems/2023.2.1/bin:$PATH

# export PYTHONPATH=/root/paddlejob/workspace/env_run/wangmingdong/Paddle/build/test:/root/paddlejob/workspace/env_run/wangmingdong/Paddle/build/python:../:$PYTHONPATH

# export CUDA_DEVICE_MAX_CONNECTIONS=1
export PARALLEL_CROSS_ENTROPY=true
# export GLOG_v=8
export FLAGS_enable_pir_api=1
export FLAGS_dynamic_static_unified_comm=True
# export FLAGS_log_memory_stats=1
# export FLAGS_call_stack_level=3
# nsys profile --stats=true -t cuda,nvtx,cublas,cudnn -o $task_name --capture-range=cudaProfilerApi --force-overwrite true \
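# 2-GPU launch of a 2-layer Llama-2 config trained for 10 steps in fp16 (O2)
# with auto parallel enabled and every explicit parallel degree set to 1,
# i.e. a short run intended for comparing the dynamic-graph and dy2st code
# paths (flip to_static above to switch between them).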
python -u -m paddle.distributed.launch \
    --gpus "0,1" \
    --log_dir "output/${task_name}_log" \
    ./auto_parallel/llama/run_pretrain_auto.py \
    --model_name_or_path "meta-llama/Llama-2-7b" \
    --tokenizer_name_or_path "meta-llama/Llama-2-7b" \
    --input_dir "./data" \
    --output_dir "./output" \
    --split 949,50,1 \
    --weight_decay 0.01 \
    --warmup_ratio 0.01 \
    --warmup_steps 30 \
    --max_grad_norm 0.0 \
    --learning_rate 3e-05 \
    --min_learning_rate 3e-06 \
    --max_steps 10 \
    --logging_steps 10 \
    --eval_steps 1000 \
    --save_steps 50000 \
    --continue_training 0 \
    --do_train true \
    --do_eval false \
    --do_predict false \
    --disable_tqdm true \
    --skip_profile_timer true \
    --save_total_limit 2 \
    --device gpu \
    --dataloader_num_workers 1 \
    --distributed_dataloader 0 \
    --enable_auto_parallel 1 \
    --per_device_train_batch_size 1 \
    --gradient_accumulation_steps 1 \
    --per_device_eval_batch_size 2 \
    --recompute false \
    --recompute_use_reentrant true \
    --recompute_granularity full \
    --pp_recompute_interval 0 \
    --fp16 1 \
    --fp16_opt_level "O2" \
    --amp_custom_black_list "reduce_sum" "c_softmax_with_cross_entropy" \
    --amp_custom_white_list "lookup_table" "lookup_table_v2" \
    --amp_master_grad true \
    --fuse_attention_ffn true \
    --fuse_attention_qkv false \
    --fuse_sequence_parallel_allreduce false \
    --use_flash_attention 0 \
    --use_fused_rope false \
    --use_fused_rms_norm 0 \
    --max_seq_length 4096 \
    --sep_parallel_degree 1 \
    --sequence_parallel false \
    --pipeline_parallel_degree 1 \
    --sharding_parallel_degree 1 \
    --tensor_parallel_degree 1 \
    --virtual_pp_degree 1 \
    --pipeline_schedule_mode "VPP" \
    --sharding "" \
    --to_static $to_static \
    --num_hidden_layers 2 \
    # --amp_custom_black_list "reduce_sum" "c_softmax_with_cross_entropy" \
    # --amp_custom_white_list "lookup_table" "lookup_table_v2" \
    # --nvprof_start 25 \
    # --nvprof_end 29 \
    # --sharding_parallel_config "enable_stage2_overlap" \
    # --tensor_parallel_config "enable_mp_async_allreduce" \
    # --data_parallel_config "enable_allreduce_avg_in_gradinent_scale gradient_sync_after_accumulate" \
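    # The commented flags above are optional extras (AMP op lists, a profiler
    # capture window via nvprof_start/nvprof_end that pairs with the nsys
    # command earlier, and sharding/TP/DP overlap configs); uncomment as needed.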