Skip to content

Commit a27e2cc

Browse files
committed
test
1 parent 6ea9dc0 commit a27e2cc

File tree

8 files changed

+349
-9
lines changed

8 files changed

+349
-9
lines changed

llm/run_profile.sh

Lines changed: 91 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -0,0 +1,91 @@
1+
set -x
2+
set -e
3+
unset CUDA_VISIBLE_DEVICES
4+
unset PADDLE_TRAINER_ENDPOINTS
5+
unset DISTRIBUTED_TRAINER_ENDPOINTS
6+
task_name="llama2_7B_sd8_dy2st_pir"
7+
8+
rm -rf output/$task_name/
9+
rm -rf "output/$task_name""_log"
10+
11+
export PYTHONPATH=/root/paddlejob/workspace/env_run/wangmingdong/Paddle/build/python:../:$PYTHONPATH:legacy/model_zoo/gpt-3/external_ops/
12+
13+
#export FLAGS_embedding_deterministic=1
14+
#export FLAGS_cudnn_deterministic=1
15+
16+
export PATH=/opt/nvidia/nsight-systems/2023.2.1/bin:$PATH
17+
18+
export CUDA_DEVICE_MAX_CONNECTIONS=1
19+
export PARALLEL_CROSS_ENTROPY=true
20+
21+
export FLAGS_enable_pir_api=1
22+
#export FLAGS_call_stack_level=3
23+
#export GLOG_v=7
24+
#nsys profile --stats=true -t cuda,nvtx,cublas,cudnn -o $task_name --capture-range=cudaProfilerApi --force-overwrite true \
25+
python -u -m paddle.distributed.launch \
26+
--gpus "0,1,2,3" \
27+
--log_dir "output/$task_name""_log" \
28+
./auto_parallel/llama/run_pretrain_auto.py \
29+
--model_name_or_path "meta-llama/Llama-2-7b" \
30+
--tokenizer_name_or_path "meta-llama/Llama-2-7b" \
31+
--input_dir "./data" \
32+
--output_dir "./output" \
33+
--split 949,50,1 \
34+
--weight_decay 0.01 \
35+
--warmup_ratio 0.01 \
36+
--warmup_steps 30 \
37+
--max_grad_norm 1.0 \
38+
--learning_rate 3e-05 \
39+
--min_learning_rate 3e-06 \
40+
--max_steps 30 \
41+
--logging_steps 10 \
42+
--eval_steps 1000 \
43+
--save_steps 50000 \
44+
--continue_training 0 \
45+
--do_train true \
46+
--do_eval false \
47+
--do_predict false \
48+
--disable_tqdm true \
49+
--skip_profile_timer true \
50+
--save_total_limit 2 \
51+
--device gpu \
52+
--disable_tqdm true \
53+
--dataloader_num_workers 1 \
54+
--distributed_dataloader 0 \
55+
--enable_auto_parallel 1 \
56+
--per_device_train_batch_size 1 \
57+
--gradient_accumulation_steps 1 \
58+
--per_device_eval_batch_size 2 \
59+
--recompute false \
60+
--recompute_use_reentrant true \
61+
--recompute_granularity full \
62+
--pp_recompute_interval 0 \
63+
--bf16 true \
64+
--fp16_opt_level "O2" \
65+
--amp_custom_black_list "reduce_sum" "c_softmax_with_cross_entropy" \
66+
--amp_custom_white_list "lookup_table" "lookup_table_v2" \
67+
--amp_master_grad true \
68+
--fuse_attention_ffn true \
69+
--fuse_attention_qkv true \
70+
--fused_linear_param_grad_add 1 \
71+
--fuse_sequence_parallel_allreduce false \
72+
--use_flash_attention 0 \
73+
--use_fused_rope true \
74+
--use_fused_rope false \
75+
--use_fused_rms_norm 0 \
76+
--max_seq_length 4096 \
77+
--sep_parallel_degree 1 \
78+
--sequence_parallel false \
79+
--pipeline_parallel_degree 1 \
80+
--sharding_parallel_degree 4 \
81+
--tensor_parallel_degree 1 \
82+
--virtual_pp_degree 1 \
83+
--pipeline_schedule_mode "VPP" \
84+
--sharding "stage2" \
85+
--data_parallel_config "enable_allreduce_avg_in_gradinent_scale gradient_sync_after_accumulate" \
86+
--sharding_parallel_config "enable_stage2_overlap" \
87+
--tensor_parallel_config "enable_mp_async_allreduce" \
88+
--to_static 1 \
89+
--num_hidden_layers 1 \
90+
#--nvprof_start 5 \
91+
#--nvprof_end 8 \

llm/run_profile_dy.sh

Lines changed: 23 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -0,0 +1,23 @@
1+
set -x
2+
set -e
3+
unset CUDA_VISIBLE_DEVICES
4+
unset PADDLE_TRAINER_ENDPOINTS
5+
unset DISTRIBUTED_TRAINER_ENDPOINTS
6+
7+
task_name="llama2_7B_sd8_dy_stage1"
8+
9+
rm -rf output/$task_name/
10+
rm -rf "output/$task_name""_log"
11+
12+
13+
export PYTHONPATH=/root/paddlejob/workspace/env_run/wangmingdong/Paddle/build/python:../:$PYTHONPATH:legacy/model_zoo/gpt-3/external_ops/
14+
15+
# export FLAGS_cudnn_deterministic=1
16+
# export NVIDIA_TF32_OVERRIDE=0
17+
# export FLAGS_embedding_deterministic=1
18+
# export FLAGS_flash_attn_version=v1
19+
#export GLOG_v=7
20+
#export FLAGS_log_memory_stats=1
21+
#export PATH=/opt/nvidia/nsight-systems/2023.2.1/bin:$PATH
22+
unset GLOG_v
23+
python -u -m paddle.distributed.launch --gpus=0,1,2,3,4,5,6,7 --run_mode=collective --log_dir ./dy2st_log2 ./run_pretrain.py ./llama_7b.json

llm/run_profile_pir.sh

Lines changed: 100 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -0,0 +1,100 @@
1+
set -x
2+
set -e
3+
unset CUDA_VISIBLE_DEVICES
4+
unset PADDLE_TRAINER_ENDPOINTS
5+
unset DISTRIBUTED_TRAINER_ENDPOINTS
6+
7+
to_static=1 # 是否开启动转静训练
8+
task_name="llama2_7B_sd8_dy2st_pir_fp16"
9+
10+
# to_static=0 # 是否开启动转静训练
11+
# task_name="llama2_7B_sd8_dy_fp16"
12+
13+
rm -rf output/$task_name/
14+
rm -rf "output/$task_name""_log"
15+
16+
export PYTHONPATH=/root/paddlejob/workspace/env_run/wangmingdong/Paddle/build/python:../:$PYTHONPATH:legacy/model_zoo/gpt-3/external_ops/
17+
18+
export FLAGS_cudnn_deterministic=1
19+
export NVIDIA_TF32_OVERRIDE=0
20+
export FLAGS_embedding_deterministic=1
21+
export FLAGS_flash_attn_version=v1
22+
export PATH=/opt/nvidia/nsight-systems/2023.2.1/bin:$PATH
23+
24+
#export PYTHONPATH=/root/paddlejob/workspace/env_run/wangmingdong/Paddle/build/test:/root/paddlejob/workspace/env_run/wangmingdong/Paddle/build/python:../:$PYTHONPATH
25+
26+
# export CUDA_DEVICE_MAX_CONNECTIONS=1
27+
export PARALLEL_CROSS_ENTROPY=true
28+
#export GLOG_v=8
29+
export FLAGS_enable_pir_api=1
30+
export FLAGS_dynamic_static_unified_comm=True
31+
#export FLAGS_log_memory_stats=1
32+
#export FLAGS_call_stack_level=3
33+
#nsys profile --stats=true -t cuda,nvtx,cublas,cudnn -o $task_name --capture-range=cudaProfilerApi --force-overwrite true \
34+
python -u -m paddle.distributed.launch \
35+
--gpus "0,1" \
36+
--log_dir "output/$task_name""_log" \
37+
./auto_parallel/llama/run_pretrain_auto.py \
38+
--model_name_or_path "meta-llama/Llama-2-7b" \
39+
--tokenizer_name_or_path "meta-llama/Llama-2-7b" \
40+
--input_dir "./data" \
41+
--output_dir "./output" \
42+
--split 949,50,1 \
43+
--weight_decay 0.01 \
44+
--warmup_ratio 0.01 \
45+
--warmup_steps 30 \
46+
--max_grad_norm 0.0 \
47+
--learning_rate 3e-05 \
48+
--min_learning_rate 3e-06 \
49+
--max_steps 10 \
50+
--logging_steps 10 \
51+
--eval_steps 1000 \
52+
--save_steps 50000 \
53+
--continue_training 0 \
54+
--do_train true \
55+
--do_eval false \
56+
--do_predict false \
57+
--disable_tqdm true \
58+
--skip_profile_timer true \
59+
--save_total_limit 2 \
60+
--device gpu \
61+
--disable_tqdm true \
62+
--dataloader_num_workers 1 \
63+
--distributed_dataloader 0 \
64+
--enable_auto_parallel 1 \
65+
--per_device_train_batch_size 1 \
66+
--gradient_accumulation_steps 1 \
67+
--per_device_eval_batch_size 2 \
68+
--recompute false \
69+
--recompute_use_reentrant true \
70+
--recompute_granularity full \
71+
--pp_recompute_interval 0 \
72+
--fp16 1\
73+
--fp16_opt_level "O2" \
74+
--amp_custom_black_list "reduce_sum" "c_softmax_with_cross_entropy" \
75+
--amp_custom_white_list "lookup_table" "lookup_table_v2" \
76+
--amp_master_grad true \
77+
--fuse_attention_ffn true \
78+
--fuse_attention_qkv false \
79+
--fuse_sequence_parallel_allreduce false \
80+
--use_flash_attention 0 \
81+
--use_fused_rope false \
82+
--use_fused_rms_norm 0 \
83+
--max_seq_length 4096 \
84+
--sep_parallel_degree 1 \
85+
--sequence_parallel false \
86+
--pipeline_parallel_degree 1 \
87+
--sharding_parallel_degree 1 \
88+
--tensor_parallel_degree 1 \
89+
--virtual_pp_degree 1 \
90+
--pipeline_schedule_mode "VPP" \
91+
--sharding "" \
92+
--to_static $to_static \
93+
--num_hidden_layers 2 \
94+
#--amp_custom_black_list "reduce_sum" "c_softmax_with_cross_entropy" \
95+
#--amp_custom_white_list "lookup_table" "lookup_table_v2" \
96+
# --nvprof_start 25 \
97+
# --nvprof_end 29 \
98+
#--sharding_parallel_config "enable_stage2_overlap" \
99+
#--tensor_parallel_config "enable_mp_async_allreduce" \
100+
#--data_parallel_config "enable_allreduce_avg_in_gradinent_scale gradient_sync_after_accumulate" \

llm/run_profile_st.sh

Lines changed: 89 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -0,0 +1,89 @@
1+
set -x
2+
set -e
3+
unset CUDA_VISIBLE_DEVICES
4+
unset PADDLE_TRAINER_ENDPOINTS
5+
unset DISTRIBUTED_TRAINER_ENDPOINTS
6+
task_name="llama2_7B_sd8_dy2st"
7+
8+
rm -rf output/$task_name/
9+
rm -rf "output/$task_name""_log"
10+
11+
export PYTHONPATH=/root/paddlejob/workspace/env_run/wangmingdong/Paddle/build/python:../:$PYTHONPATH:legacy/model_zoo/gpt-3/external_ops/
12+
13+
#export FLAGS_embedding_deterministic=1
14+
#export FLAGS_cudnn_deterministic=1
15+
16+
export PATH=/opt/nvidia/nsight-systems/2023.2.1/bin:$PATH
17+
18+
export CUDA_DEVICE_MAX_CONNECTIONS=1
19+
export PARALLEL_CROSS_ENTROPY=true
20+
#export GLOG_v=2
21+
#export FLAGS_enable_pir_api=1
22+
#export FLAGS_call_stack_level=3
23+
#nsys profile --stats=true -t cuda,nvtx,cublas,cudnn -o $task_name --capture-range=cudaProfilerApi --force-overwrite true \
24+
python -u -m paddle.distributed.launch \
25+
--gpus "0,1,2,3,4,5,6,7" \
26+
--log_dir "output/$task_name""_log" \
27+
./auto_parallel/llama/run_pretrain_auto.py \
28+
--model_name_or_path "meta-llama/Llama-2-7b" \
29+
--tokenizer_name_or_path "meta-llama/Llama-2-7b" \
30+
--input_dir "./data" \
31+
--output_dir "./output" \
32+
--split 949,50,1 \
33+
--weight_decay 0.01 \
34+
--warmup_ratio 0.01 \
35+
--warmup_steps 30 \
36+
--max_grad_norm 1.0 \
37+
--learning_rate 3e-05 \
38+
--min_learning_rate 3e-06 \
39+
--max_steps 30 \
40+
--logging_steps 10 \
41+
--eval_steps 1000 \
42+
--save_steps 50000 \
43+
--continue_training 0 \
44+
--do_train true \
45+
--do_eval false \
46+
--do_predict false \
47+
--disable_tqdm true \
48+
--skip_profile_timer true \
49+
--save_total_limit 2 \
50+
--device gpu \
51+
--disable_tqdm true \
52+
--dataloader_num_workers 1 \
53+
--distributed_dataloader 0 \
54+
--enable_auto_parallel 1 \
55+
--per_device_train_batch_size 1 \
56+
--gradient_accumulation_steps 1 \
57+
--per_device_eval_batch_size 2 \
58+
--recompute false \
59+
--recompute_use_reentrant true \
60+
--recompute_granularity full \
61+
--pp_recompute_interval 0 \
62+
--bf16 true \
63+
--fp16_opt_level "O2" \
64+
--amp_custom_black_list "reduce_sum" "c_softmax_with_cross_entropy" \
65+
--amp_custom_white_list "lookup_table" "lookup_table_v2" \
66+
--amp_master_grad true \
67+
--fuse_attention_ffn true \
68+
--fuse_attention_qkv true \
69+
--fused_linear_param_grad_add 1 \
70+
--fuse_sequence_parallel_allreduce false \
71+
--use_flash_attention 0 \
72+
--use_fused_rope 0 \
73+
--use_fused_rms_norm 0 \
74+
--max_seq_length 4096 \
75+
--sep_parallel_degree 1 \
76+
--sequence_parallel false \
77+
--pipeline_parallel_degree 1 \
78+
--sharding_parallel_degree 8 \
79+
--tensor_parallel_degree 1 \
80+
--virtual_pp_degree 1 \
81+
--pipeline_schedule_mode "VPP" \
82+
--sharding "stage1" \
83+
--data_parallel_config "enable_allreduce_avg_in_gradinent_scale gradient_sync_after_accumulate" \
84+
--sharding_parallel_config "enable_stage2_overlap" \
85+
--tensor_parallel_config "enable_mp_async_allreduce" \
86+
--to_static 1 \
87+
--num_hidden_layers 8 \
88+
#--nvprof_start 5 \
89+
#--nvprof_end 8 \

paddlenlp/trainer/auto_trainer.py

Lines changed: 27 additions & 8 deletions
Original file line number | Diff line number | Diff line change
@@ -15,6 +15,8 @@
1515
import os
1616
import random
1717
import time
18+
import sys
19+
import hashlib
1820
from typing import Any, Dict, Optional, Union
1921

2022
import numpy as np
@@ -129,12 +131,7 @@ def _wrap_for_auto(self, model, train_dataloader):
129131

130132
def _wrap_amp_model(self, args, model):
131133
logger.info("Using half precision")
132-
if args.to_static:
133-
return
134-
self.enable_autocast_context_manager = True
135-
self.do_grad_scaling = True if self.args.fp16 else False
136-
self.amp_dtype = "float16" if self.args.fp16 else "bfloat16"
137-
self.scaler = dist.shard_scaler(paddle.amp.GradScaler(init_loss_scaling=self.args.scale_loss))
134+
self.amp_dtype = "float16" if self.args.fp16 else "bfloat16"
138135
if self.args.fp16_opt_level == "O2":
139136
paddle.amp.decorate(
140137
models=model,
@@ -143,6 +140,11 @@ def _wrap_amp_model(self, args, model):
143140
master_grad=self.args.amp_master_grad,
144141
excluded_layers=QuantizationLinear,
145142
)
143+
if args.to_static:
144+
return
145+
self.enable_autocast_context_manager = True
146+
self.do_grad_scaling = True if self.args.fp16 else False
147+
self.scaler = dist.shard_scaler(paddle.amp.GradScaler(init_loss_scaling=self.args.scale_loss))
146148

147149
def _get_item_from_loss(self, loss):
148150
if isinstance(loss, paddle.Tensor):
@@ -306,7 +308,6 @@ def _inner_training_loop(
306308

307309
with _exec_mode_guard("dynamic"):
308310
tr_loss += tr_loss_step
309-
310311
disable_accumulation = self.args.pipeline_parallel_degree > 1 and self.args.to_static
311312
# disable_accumulation = self.args.to_static
312313

@@ -340,6 +341,20 @@ def _inner_training_loop(
340341
self._maybe_log_save_evaluate(tr_loss, model, epoch, ignore_keys_for_eval, inputs=inputs)
341342
self._print_timer()
342343
step_control = 0
344+
if self.args.nvprof_start < self.args.nvprof_end:
345+
# for end
346+
if self.state.global_step - 1 >= self.args.nvprof_start:
347+
paddle.base.core.nvprof_nvtx_pop()
348+
if self.state.global_step == self.args.nvprof_end:
349+
paddle.base.core.nvprof_stop()
350+
sys.exit()
351+
352+
# for begin
353+
if self.state.global_step == self.args.nvprof_start:
354+
paddle.base.core.nvprof_start()
355+
paddle.base.core.nvprof_enable_record_event()
356+
if self.state.global_step >= self.args.nvprof_start:
357+
paddle.base.core.nvprof_nvtx_push(str(self.state.global_step))
343358
else:
344359
self.control = self.callback_handler.on_substep_end(args, self.state, self.control)
345360
step_control += 1
@@ -479,11 +494,15 @@ def training_step(self, model: nn.Layer, inputs: Dict[str, Union[paddle.Tensor,
479494
model.train()
480495

481496
inputs = self._prepare_inputs(inputs)
482-
483497
if not self.args.to_static:
484498
loss = self.dynamic_traning(model, inputs)
499+
print("auto trainer dygraph md5sum:", loss._md5sum(), flush =True)
485500
else:
486501
loss = self.static_traning(model, inputs)
502+
numpy_array = np.array(loss)
503+
array_bytes = numpy_array.tobytes()
504+
loss_md5 = hashlib.md5(array_bytes).hexdigest()
505+
print("auto trainer static md5sum:", loss_md5, flush =True)
487506

488507
if isinstance(loss, paddle.Tensor):
489508
return loss.detach() if loss._is_initialized() else float(0.0)

0 commit comments

Comments
 (0)