
Commit 5e1eb64

Merge pull request #1492 from Zhikaiiii/feat/torchacc_add_qwen2
[TorchAcc] add script for qwen2 in torchacc
2 parents d7b48b6 + 805e521

8 files changed: +230 −1 lines changed
Lines changed: 39 additions & 0 deletions
@@ -0,0 +1,39 @@
# Experimental environment: 4 * A100
# 80GB GPU memory
# Note: TorchAcc is currently only available internally.
export USE_TORCHACC=1
# export TORCHACC_TRIM_GRAPH=1
export XLA_IR_SHAPE_CACHE_SIZE=1000000000
export XLA_ALLOCATOR_FRACTION=0.96
export XLA_EXPERIMENTAL=nonzero:masked_select

export XLA_PERSISTENT_CACHE_PATH=./output/compiled_cache/qwen2-72b-instruct-0724
export XLA_FLAGS="--xla_gpu_enable_cudnn_fmha=false --xla_gpu_enable_priority_fusion=false --xla_gpu_normalize_layouts=false --xla_gpu_memory_limit_slop_factor=400"
mkdir -p $XLA_PERSISTENT_CACHE_PATH

NPROC_PER_NODE=4 \
CUDA_VISIBLE_DEVICES=0,1,2,3 \
MASTER_PORT=27829 \
swift sft \
    --model_type qwen2-72b-instruct \
    --model_layer_cls_name Qwen2DecoderLayer \
    --dataset codefuse-python-en \
    --sft_type lora \
    --output_dir output \
    --num_train_epochs 1 \
    --max_length 2048 \
    --batch_size 6 \
    --use_flash_attn true \
    --gradient_accumulation_steps 1 \
    --gradient_checkpointing no \
    --tuner_backend 'peft' \
    --dataset_test_ratio 0 \
    --save_strategy no \
    --eval_steps 2000000 \
    --save_steps 2000000 \
    --logging_steps 100 \
    --acc_steps 100 \
    --preprocess_num_proc 1 \
    --metric_warmup_step 0.1 \
    --fsdp_num 4 \
    --report_to 'none'
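Because XLA_PERSISTENT_CACHE_PATH is set, later runs with the same shapes and XLA flags can reuse the compiled graphs instead of recompiling. As a quick sanity check after the first run (not part of this commit), confirm the cache directory was populated:

# The directory should contain compiled-graph artifacts after run one.
ls -lh ./output/compiled_cache/qwen2-72b-instruct-0724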
Lines changed: 29 additions & 0 deletions
@@ -0,0 +1,29 @@
{
  "compute_environment": "LOCAL_MACHINE",
  "debug": false,
  "distributed_type": "FSDP",
  "downcast_bf16": "no",
  "fsdp_config": {
    "fsdp_auto_wrap_policy": "TRANSFORMER_BASED_WRAP",
    "fsdp_backward_prefetch": "BACKWARD_PRE",
    "fsdp_cpu_ram_efficient_loading": true,
    "fsdp_forward_prefetch": false,
    "fsdp_offload_params": true,
    "fsdp_sharding_strategy": "FULL_SHARD",
    "fsdp_state_dict_type": "SHARDED_STATE_DICT",
    "fsdp_transformer_layer_cls_to_wrap": "Qwen2DecoderLayer",
    "fsdp_sync_module_states": true,
    "fsdp_use_orig_params": true
  },
  "machine_rank": 0,
  "main_training_function": "main",
  "mixed_precision": "bf16",
  "num_machines": 1,
  "num_processes": 4,
  "rdzv_backend": "static",
  "same_network": true,
  "tpu_env": [],
  "tpu_use_cluster": false,
  "tpu_use_sudo": false,
  "use_cpu": false
}
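The JSON above is a Hugging Face Accelerate config for the FSDP baseline. A minimal usage sketch, assuming the file is saved as fsdp_config.json and train.py stands in for the real entry point (both filenames are placeholders, not from this commit):

# Launch a 4-process local FSDP job using the config above.
accelerate launch --config_file fsdp_config.json train.py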
Lines changed: 31 additions & 0 deletions
@@ -0,0 +1,31 @@
# Experimental environment: 4 * A100
# 80GB GPU memory
# Note: TorchAcc is currently only available internally.

export USE_TORCH_XLA=0

# PYTHONPATH=../../.. \
# NPROC_PER_NODE=4 \

NPROC_PER_NODE=4 \
CUDA_VISIBLE_DEVICES=0,1,2,3 \
swift sft \
    --model_type qwen2-72b-instruct \
    --dataset codefuse-python-en \
    --sft_type lora \
    --dtype AUTO \
    --output_dir output \
    --num_train_epochs 1 \
    --max_length 1024 \
    --batch_size 4 \
    --use_flash_attn true \
    --gradient_accumulation_steps 1 \
    --dataset_test_ratio 0 \
    --save_strategy no \
    --eval_steps 2000000 \
    --save_steps 2000000 \
    --logging_steps 100 \
    --acc_steps 100 \
    --preprocess_num_proc 1 \
    --metric_warmup_step 0.1 \
    --deepspeed default-zero3
Lines changed: 27 additions & 0 deletions
@@ -0,0 +1,27 @@
# Experimental environment: 4 * A100
# 80GB GPU memory
# Note: TorchAcc is currently only available internally.

export USE_TORCH_XLA=0

NPROC_PER_NODE=2 \
CUDA_VISIBLE_DEVICES=0,1,2,3 \
swift sft \
    --model_type qwen2-72b-instruct \
    --dataset codefuse-python-en \
    --sft_type lora \
    --dtype AUTO \
    --output_dir output \
    --num_train_epochs 1 \
    --max_length 2048 \
    --batch_size 1 \
    --use_flash_attn true \
    --gradient_accumulation_steps 1 \
    --dataset_test_ratio 0 \
    --save_strategy no \
    --eval_steps 2000000 \
    --save_steps 2000000 \
    --logging_steps 100 \
    --acc_steps 100 \
    --preprocess_num_proc 1 \
    --metric_warmup_step 0.1
Lines changed: 38 additions & 0 deletions
@@ -0,0 +1,38 @@
# Experimental environment: 2 * A100
# 80GB GPU memory
# Note: TorchAcc is currently only available internally.

export USE_TORCHACC=1
export TORCHACC_TRIM_GRAPH=1
export XLA_IR_SHAPE_CACHE_SIZE=100000000
export XLA_ALLOCATOR_FRACTION=0.95
export XLA_EXPERIMENTAL=nonzero:masked_select

export XLA_PERSISTENT_CACHE_PATH=./output/compiled_cache/qwen2-7b-instruct
mkdir -p $XLA_PERSISTENT_CACHE_PATH

NPROC_PER_NODE=2 \
CUDA_VISIBLE_DEVICES=2,3 \
MASTER_PORT=21779 \
swift sft \
    --model_type qwen2-7b-instruct \
    --model_layer_cls_name Qwen2DecoderLayer \
    --dataset codefuse-python-en \
    --sft_type lora \
    --output_dir output \
    --num_train_epochs 1 \
    --max_length 2048 \
    --batch_size 16 \
    --use_flash_attn true \
    --gradient_accumulation_steps 1 \
    --gradient_checkpointing no \
    --tuner_backend 'peft' \
    --dataset_test_ratio 0 \
    --save_strategy no \
    --eval_steps 2000000 \
    --save_steps 2000000 \
    --logging_steps 100 \
    --acc_steps 100 \
    --preprocess_num_proc 1 \
    --metric_warmup_step 0.1 \
    --report_to 'none'
Lines changed: 37 additions & 0 deletions
@@ -0,0 +1,37 @@
# Experimental environment: 2 * A100
# 80GB GPU memory
# Note: TorchAcc is currently only available internally.
export USE_TORCHACC=1
export XLA_IR_SHAPE_CACHE_SIZE=100000000
export XLA_ALLOCATOR_FRACTION=0.95
export XLA_EXPERIMENTAL=nonzero:masked_select

export XLA_PERSISTENT_CACHE_PATH=./output/compiled_cache/qwen2-7b-instruct
mkdir -p $XLA_PERSISTENT_CACHE_PATH

NPROC_PER_NODE=2 \
CUDA_VISIBLE_DEVICES=0,1 \
MASTER_PORT=27829 \
swift sft \
    --model_type qwen2-7b-instruct \
    --model_layer_cls_name Qwen2DecoderLayer \
    --dataset codefuse-python-en \
    --sft_type lora \
    --output_dir output \
    --num_train_epochs 1 \
    --max_length 2048 \
    --batch_size 12 \
    --use_flash_attn true \
    --gradient_accumulation_steps 1 \
    --gradient_checkpointing no \
    --tuner_backend 'peft' \
    --dataset_test_ratio 0 \
    --save_strategy no \
    --eval_steps 2000000 \
    --save_steps 2000000 \
    --logging_steps 100 \
    --acc_steps 100 \
    --preprocess_num_proc 1 \
    --metric_warmup_step 0.1 \
    --fsdp_num 2 \
    --report_to 'none'
Lines changed: 28 additions & 0 deletions
@@ -0,0 +1,28 @@
# Experimental environment: 2 * A100
# 80GB GPU memory
# Note: TorchAcc is currently only available internally.

export USE_TORCH_XLA=0

NPROC_PER_NODE=2 \
CUDA_VISIBLE_DEVICES=0,1 \
swift sft \
    --model_type qwen2-7b-instruct \
    --dataset codefuse-python-en \
    --sft_type lora \
    --dtype AUTO \
    --output_dir output \
    --num_train_epochs 1 \
    --max_length 2048 \
    --batch_size 12 \
    --use_flash_attn true \
    --gradient_accumulation_steps 1 \
    --dataset_test_ratio 0 \
    --save_strategy no \
    --eval_steps 2000000 \
    --save_steps 2000000 \
    --logging_steps 100 \
    --acc_steps 100 \
    --preprocess_num_proc 1 \
    --metric_warmup_step 0.1 \
    --report_to 'none'

swift/torchacc_utils.py

Lines changed: 1 addition & 1 deletion
@@ -363,7 +363,7 @@ def apply_rotary_pos_emb(q, k, cos, sin, position_ids=None, unsqueeze_dim=1):
 def patch_acc_model(model, args):
     if not args.use_flash_attn:
         logger.warn('Currently use flash attn for torchacc.')
-    if args.model_type.startswith('qwen1half'):
+    if args.model_type.startswith('qwen1half') or args.model_type.startswith('qwen2'):
         model = patch_qwen2_model(model)
     elif args.model_type.startswith('qwen'):
         import torchacc as ta
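The new startswith('qwen2') branch matters because qwen2 model types also match the broader startswith('qwen') check below it, so without this change they would fall through to the original qwen patch path instead of patch_qwen2_model. A one-line illustration of the dispatch (a hypothetical check, not from this commit):

# Both prefixes match a qwen2 model type, so the more specific
# condition must be tested first.
python -c "mt='qwen2-7b-instruct'; print(mt.startswith('qwen2'), mt.startswith('qwen'))"
# prints: True True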
