
Commit 5e1eb64

Merge pull request #1492 from Zhikaiiii/feat/torchacc_add_qwen2
[TorchAcc] add script for qwen2 in torchacc
2 parents d7b48b6 + 805e521

8 files changed: +230 −1 lines changed
Lines changed: 39 additions & 0 deletions
@@ -0,0 +1,39 @@
# Experimental environment: 4 * A100
# 80GB GPU memory
# Note: TorchAcc is currently only available internally.
export USE_TORCHACC=1
# export TORCHACC_TRIM_GRAPH=1
export XLA_IR_SHAPE_CACHE_SIZE=1000000000
export XLA_ALLOCATOR_FRACTION=0.96
export XLA_EXPERIMENTAL=nonzero:masked_select

export XLA_PERSISTENT_CACHE_PATH=./output/compiled_cache/qwen2-72b-instruct-0724
export XLA_FLAGS="--xla_gpu_enable_cudnn_fmha=false --xla_gpu_enable_priority_fusion=false --xla_gpu_normalize_layouts=false --xla_gpu_memory_limit_slop_factor=400"
mkdir -p $XLA_PERSISTENT_CACHE_PATH

NPROC_PER_NODE=4 \
CUDA_VISIBLE_DEVICES=0,1,2,3 \
MASTER_PORT=27829 \
swift sft \
    --model_type qwen2-72b-instruct \
    --model_layer_cls_name Qwen2DecoderLayer \
    --dataset codefuse-python-en \
    --sft_type lora \
    --output_dir output \
    --num_train_epochs 1 \
    --max_length 2048 \
    --batch_size 6 \
    --use_flash_attn true \
    --gradient_accumulation_steps 1 \
    --gradient_checkpointing no \
    --tuner_backend 'peft' \
    --dataset_test_ratio 0 \
    --save_strategy no \
    --eval_steps 2000000 \
    --save_steps 2000000 \
    --logging_steps 100 \
    --acc_steps 100 \
    --preprocess_num_proc 1 \
    --metric_warmup_step 0.1 \
    --fsdp_num 4 \
    --report_to 'none'
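Because XLA_PERSISTENT_CACHE_PATH is set, later runs with the same shapes and XLA flags can reuse the compiled graphs instead of recompiling. As a quick sanity check after the first run (not part of this commit), confirm the cache directory was populated:

# The directory should contain compiled-graph artifacts after run one.
ls -lh ./output/compiled_cache/qwen2-72b-instruct-0724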
Lines changed: 29 additions & 0 deletions
@@ -0,0 +1,29 @@
{
  "compute_environment": "LOCAL_MACHINE",
  "debug": false,
  "distributed_type": "FSDP",
  "downcast_bf16": "no",
  "fsdp_config": {
    "fsdp_auto_wrap_policy": "TRANSFORMER_BASED_WRAP",
    "fsdp_backward_prefetch": "BACKWARD_PRE",
    "fsdp_cpu_ram_efficient_loading": true,
    "fsdp_forward_prefetch": false,
    "fsdp_offload_params": true,
    "fsdp_sharding_strategy": "FULL_SHARD",
    "fsdp_state_dict_type": "SHARDED_STATE_DICT",
    "fsdp_transformer_layer_cls_to_wrap": "Qwen2DecoderLayer",
    "fsdp_sync_module_states": true,
    "fsdp_use_orig_params": true
  },
  "machine_rank": 0,
  "main_training_function": "main",
  "mixed_precision": "bf16",
  "num_machines": 1,
  "num_processes": 4,
  "rdzv_backend": "static",
  "same_network": true,
  "tpu_env": [],
  "tpu_use_cluster": false,
  "tpu_use_sudo": false,
  "use_cpu": false
}
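The JSON above is a Hugging Face Accelerate config for the FSDP baseline. A minimal usage sketch, assuming the file is saved as fsdp_config.json and train.py stands in for the real entry point (both filenames are placeholders, not from this commit):

# Launch a 4-process local FSDP job using the config above.
accelerate launch --config_file fsdp_config.json train.py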
Lines changed: 31 additions & 0 deletions
@@ -0,0 +1,31 @@
# Experimental environment: 4 * A100
# 80GB GPU memory
# Note: TorchAcc is currently only available internally.

export USE_TORCH_XLA=0

# PYTHONPATH=../../.. \
# NPROC_PER_NODE=4 \

NPROC_PER_NODE=4 \
CUDA_VISIBLE_DEVICES=0,1,2,3 \
swift sft \
    --model_type qwen2-72b-instruct \
    --dataset codefuse-python-en \
    --sft_type lora \
    --dtype AUTO \
    --output_dir output \
    --num_train_epochs 1 \
    --max_length 1024 \
    --batch_size 4 \
    --use_flash_attn true \
    --gradient_accumulation_steps 1 \
    --dataset_test_ratio 0 \
    --save_strategy no \
    --eval_steps 2000000 \
    --save_steps 2000000 \
    --logging_steps 100 \
    --acc_steps 100 \
    --preprocess_num_proc 1 \
    --metric_warmup_step 0.1 \
    --deepspeed default-zero3
Lines changed: 27 additions & 0 deletions
@@ -0,0 +1,27 @@
# Experimental environment: 4 * A100
# 80GB GPU memory
# Note: TorchAcc is currently only available internally.

export USE_TORCH_XLA=0

NPROC_PER_NODE=2 \
CUDA_VISIBLE_DEVICES=0,1,2,3 \
swift sft \
    --model_type qwen2-72b-instruct \
    --dataset codefuse-python-en \
    --sft_type lora \
    --dtype AUTO \
    --output_dir output \
    --num_train_epochs 1 \
    --max_length 2048 \
    --batch_size 1 \
    --use_flash_attn true \
    --gradient_accumulation_steps 1 \
    --dataset_test_ratio 0 \
    --save_strategy no \
    --eval_steps 2000000 \
    --save_steps 2000000 \
    --logging_steps 100 \
    --acc_steps 100 \
    --preprocess_num_proc 1 \
    --metric_warmup_step 0.1
Lines changed: 38 additions & 0 deletions
@@ -0,0 +1,38 @@
# Experimental environment: 2 * A100
# 80GB GPU memory
# Note: TorchAcc is currently only available internally.

export USE_TORCHACC=1
export TORCHACC_TRIM_GRAPH=1
export XLA_IR_SHAPE_CACHE_SIZE=100000000
export XLA_ALLOCATOR_FRACTION=0.95
export XLA_EXPERIMENTAL=nonzero:masked_select

export XLA_PERSISTENT_CACHE_PATH=./output/compiled_cache/qwen2-7b-instruct
mkdir -p $XLA_PERSISTENT_CACHE_PATH

NPROC_PER_NODE=2 \
CUDA_VISIBLE_DEVICES=2,3 \
MASTER_PORT=21779 \
swift sft \
    --model_type qwen2-7b-instruct \
    --model_layer_cls_name Qwen2DecoderLayer \
    --dataset codefuse-python-en \
    --sft_type lora \
    --output_dir output \
    --num_train_epochs 1 \
    --max_length 2048 \
    --batch_size 16 \
    --use_flash_attn true \
    --gradient_accumulation_steps 1 \
    --gradient_checkpointing no \
    --tuner_backend 'peft' \
    --dataset_test_ratio 0 \
    --save_strategy no \
    --eval_steps 2000000 \
    --save_steps 2000000 \
    --logging_steps 100 \
    --acc_steps 100 \
    --preprocess_num_proc 1 \
    --metric_warmup_step 0.1 \
    --report_to 'none'
Lines changed: 37 additions & 0 deletions
@@ -0,0 +1,37 @@
# Experimental environment: 2 * A100
# 80GB GPU memory
# Note: TorchAcc is currently only available internally.
export USE_TORCHACC=1
export XLA_IR_SHAPE_CACHE_SIZE=100000000
export XLA_ALLOCATOR_FRACTION=0.95
export XLA_EXPERIMENTAL=nonzero:masked_select

export XLA_PERSISTENT_CACHE_PATH=./output/compiled_cache/qwen2-7b-instruct
mkdir -p $XLA_PERSISTENT_CACHE_PATH

NPROC_PER_NODE=2 \
CUDA_VISIBLE_DEVICES=0,1 \
MASTER_PORT=27829 \
swift sft \
    --model_type qwen2-7b-instruct \
    --model_layer_cls_name Qwen2DecoderLayer \
    --dataset codefuse-python-en \
    --sft_type lora \
    --output_dir output \
    --num_train_epochs 1 \
    --max_length 2048 \
    --batch_size 12 \
    --use_flash_attn true \
    --gradient_accumulation_steps 1 \
    --gradient_checkpointing no \
    --tuner_backend 'peft' \
    --dataset_test_ratio 0 \
    --save_strategy no \
    --eval_steps 2000000 \
    --save_steps 2000000 \
    --logging_steps 100 \
    --acc_steps 100 \
    --preprocess_num_proc 1 \
    --metric_warmup_step 0.1 \
    --fsdp_num 2 \
    --report_to 'none'
Lines changed: 28 additions & 0 deletions
@@ -0,0 +1,28 @@
# Experimental environment: 2 * A100
# 80GB GPU memory
# Note: TorchAcc is currently only available internally.

export USE_TORCH_XLA=0

NPROC_PER_NODE=2 \
CUDA_VISIBLE_DEVICES=0,1 \
swift sft \
    --model_type qwen2-7b-instruct \
    --dataset codefuse-python-en \
    --sft_type lora \
    --dtype AUTO \
    --output_dir output \
    --num_train_epochs 1 \
    --max_length 2048 \
    --batch_size 12 \
    --use_flash_attn true \
    --gradient_accumulation_steps 1 \
    --dataset_test_ratio 0 \
    --save_strategy no \
    --eval_steps 2000000 \
    --save_steps 2000000 \
    --logging_steps 100 \
    --acc_steps 100 \
    --preprocess_num_proc 1 \
    --metric_warmup_step 0.1 \
    --report_to 'none'

swift/torchacc_utils.py

Lines changed: 1 addition & 1 deletion
@@ -363,7 +363,7 @@ def apply_rotary_pos_emb(q, k, cos, sin, position_ids=None, unsqueeze_dim=1):
 def patch_acc_model(model, args):
     if not args.use_flash_attn:
         logger.warn('Currently use flash attn for torchacc.')
-    if args.model_type.startswith('qwen1half'):
+    if args.model_type.startswith('qwen1half') or args.model_type.startswith('qwen2'):
         model = patch_qwen2_model(model)
     elif args.model_type.startswith('qwen'):
         import torchacc as ta
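The new startswith('qwen2') branch matters because qwen2 model types also match the broader startswith('qwen') check below it, so without this change they would fall through to the original qwen patch path instead of patch_qwen2_model. A one-line illustration of the dispatch (a hypothetical check, not from this commit):

# Both prefixes match a qwen2 model type, so the more specific
# condition must be tested first.
python -c "mt='qwen2-7b-instruct'; print(mt.startswith('qwen2'), mt.startswith('qwen'))"
# prints: True True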
