
Commit fdb7a4d

[TorchAcc][Experimental] Integrate more models in TorchAcc (#683)

1 parent c8f6153 · commit fdb7a4d

31 files changed: +1181 −13 lines
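The hunks shown below add shell launch scripts for swift sft: TorchAcc data-parallel (dp) and FSDP variants plus plain non-TorchAcc baselines, covering Baichuan2-13B-Chat, ChatGLM3-6B, Llama-2-13b-chat, and Meta-Llama-3-8B-Instruct.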
Lines changed: 34 additions & 0 deletions
@@ -0,0 +1,34 @@
# Experimental environment: 2 * A100
# 80GB GPU memory
# Note: TorchAcc is currently only available internally.
# torchacc dp
export USE_TORCHACC=1
export XLA_FLAGS='--xla_gpu_force_compilation_parallelism=32 --xla_multiheap_size_constraint_per_heap=4831838208 --xla_disable_hlo_passes=all-gather-combiner,all-reduce-combiner,reduce-scatter-combiner,gpu-convert-async-collectives-to-sync,rematerialization'
export XLA_IR_SHAPE_CACHE_SIZE=100000000
export XLA_ALLOCATOR_FRACTION=0.95
export XLA_EXPERIMENTAL=nonzero:masked_select

NPROC_PER_NODE=2 \
CUDA_VISIBLE_DEVICES=0,1 \
MASTER_PORT=27829 \
swift sft \
    --model_id_or_path baichuan-inc/Baichuan2-13B-Chat \
    --model_layer_cls_name BaichuanLayer \
    --dataset codefuse-python-en \
    --sft_type lora \
    --output_dir output \
    --num_train_epochs 1 \
    --max_length 2048 \
    --batch_size 12 \
    --use_flash_attn true \
    --gradient_accumulation_steps 1 \
    --gradient_checkpointing no \
    --tuner_backend 'peft' \
    --dataset_test_ratio 0 \
    --save_strategy no \
    --eval_steps 2000000 \
    --save_steps 2000000 \
    --logging_steps 100 \
    --preprocess_num_proc 1 \
    --metric_warmup_step 0.1 \
    --report_to 'none'
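A minimal launch sketch for the DP script above. This is not part of the commit: the script filename and log name are hypothetical, and the torch_xla check assumes the TorchAcc build of torch_xla is importable in the current environment.

# Hypothetical filename; save the block above as this script first.
bash baichuan2_13b_acc_lora_dp.sh 2>&1 | tee torchacc_dp.log

# Optional sanity check that an XLA device is visible before a long run.
python -c "import torch_xla.core.xla_model as xm; print(xm.xla_device())"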
Lines changed: 34 additions & 0 deletions
@@ -0,0 +1,34 @@
# Experimental environment: 2 * A100
# 80GB GPU memory
# Note: TorchAcc is currently only available internally.
# torchacc fsdp
export USE_TORCHACC=1
export XLA_FLAGS='--xla_gpu_force_compilation_parallelism=32 --xla_multiheap_size_constraint_per_heap=4831838208 --xla_disable_hlo_passes=all-gather-combiner,all-reduce-combiner,reduce-scatter-combiner,gpu-convert-async-collectives-to-sync,rematerialization'
export XLA_IR_SHAPE_CACHE_SIZE=100000000
export XLA_ALLOCATOR_FRACTION=0.95
export XLA_EXPERIMENTAL=nonzero:masked_select

NPROC_PER_NODE=2 \
CUDA_VISIBLE_DEVICES=0,1 \
swift sft \
    --model_id_or_path baichuan-inc/Baichuan2-13B-Chat \
    --model_layer_cls_name BaichuanLayer \
    --dataset codefuse-python-en \
    --sft_type lora \
    --output_dir output \
    --num_train_epochs 1 \
    --max_length 2048 \
    --batch_size 16 \
    --use_flash_attn true \
    --gradient_accumulation_steps 1 \
    --gradient_checkpointing no \
    --tuner_backend 'peft' \
    --dataset_test_ratio 0 \
    --save_strategy no \
    --eval_steps 2000000 \
    --save_steps 2000000 \
    --logging_steps 100 \
    --preprocess_num_proc 1 \
    --metric_warmup_step 0.1 \
    --fsdp_num 2 \
    --report_to 'none'
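For orientation, the FSDP script above differs from the preceding DP script only in a few settings read directly from this diff; the note on why the batch size can grow is an assumption, not something stated in the commit.

# DP script:   MASTER_PORT=27829, --batch_size 12, no --fsdp_num
# FSDP script: --batch_size 16, --fsdp_num 2
#   (--fsdp_num 2 shards model states across the two visible GPUs,
#    which presumably frees enough memory for the larger per-device batch)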
Lines changed: 27 additions & 0 deletions
@@ -0,0 +1,27 @@
# Experimental environment: 2 * A100
# 80GB GPU memory
# Note: TorchAcc is currently only available internally.

# MASTER_ADDR=127.0.0.1 \

NPROC_PER_NODE=2 \
CUDA_VISIBLE_DEVICES=0,1 \
swift sft \
    --model_id_or_path baichuan-inc/Baichuan2-13B-Chat \
    --dataset codefuse-python-en \
    --sft_type lora \
    --dtype AUTO \
    --output_dir output \
    --num_train_epochs 1 \
    --max_length 2048 \
    --batch_size 2 \
    --use_flash_attn true \
    --gradient_accumulation_steps 1 \
    --dataset_test_ratio 0 \
    --save_strategy no \
    --eval_steps 2000000 \
    --save_steps 2000000 \
    --logging_steps 100 \
    --preprocess_num_proc 1 \
    --metric_warmup_step 0.1 \
    --report_to 'none'
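The script above is the plain non-TorchAcc baseline for Baichuan2-13B-Chat: USE_TORCHACC and the XLA_* variables are not set, and the per-device batch size drops to 2. While a baseline or TorchAcc run is active, GPU memory and utilization can be compared with a standard tool (not specific to this commit):

watch -n 1 nvidia-smi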
Lines changed: 35 additions & 0 deletions
@@ -0,0 +1,35 @@
# Experimental environment: 2 * A100
# 80GB GPU memory
# Note: TorchAcc is currently only available internally.
# torchacc dp
export USE_TORCHACC=1
export XLA_FLAGS='--xla_gpu_force_compilation_parallelism=32 --xla_multiheap_size_constraint_per_heap=4831838208 --xla_disable_hlo_passes=all-gather-combiner,all-reduce-combiner,reduce-scatter-combiner,gpu-convert-async-collectives-to-sync,rematerialization'
export XLA_IR_SHAPE_CACHE_SIZE=100000000
export XLA_ALLOCATOR_FRACTION=0.95
export XLA_EXPERIMENTAL=nonzero:masked_select


NPROC_PER_NODE=2 \
CUDA_VISIBLE_DEVICES=0,1 \
MASTER_PORT=27829 \
swift sft \
    --model_id_or_path ZhipuAI/chatglm3-6b \
    --model_layer_cls_name GLMBlock \
    --dataset codefuse-python-en \
    --sft_type lora \
    --output_dir output \
    --num_train_epochs 1 \
    --max_length 2048 \
    --batch_size 16 \
    --use_flash_attn true \
    --gradient_accumulation_steps 1 \
    --gradient_checkpointing no \
    --tuner_backend 'peft' \
    --dataset_test_ratio 0 \
    --save_strategy no \
    --eval_steps 2000000 \
    --save_steps 2000000 \
    --logging_steps 100 \
    --preprocess_num_proc 1 \
    --metric_warmup_step 0.1 \
    --report_to 'none'
Lines changed: 35 additions & 0 deletions
@@ -0,0 +1,35 @@
# Experimental environment: 2 * A100
# 80GB GPU memory
# Note: TorchAcc is currently only available internally.
# torchacc fsdp
export USE_TORCHACC=1
export XLA_FLAGS='--xla_gpu_force_compilation_parallelism=32 --xla_multiheap_size_constraint_per_heap=4831838208 --xla_disable_hlo_passes=all-gather-combiner,all-reduce-combiner,reduce-scatter-combiner,gpu-convert-async-collectives-to-sync,rematerialization'
export XLA_IR_SHAPE_CACHE_SIZE=100000000
export XLA_ALLOCATOR_FRACTION=0.95
export XLA_EXPERIMENTAL=nonzero:masked_select


NPROC_PER_NODE=2 \
CUDA_VISIBLE_DEVICES=0,1 \
swift sft \
    --model_id_or_path ZhipuAI/chatglm3-6b \
    --model_layer_cls_name GLMBlock \
    --dataset codefuse-python-en \
    --sft_type lora \
    --output_dir output \
    --num_train_epochs 1 \
    --max_length 2048 \
    --batch_size 16 \
    --use_flash_attn true \
    --gradient_accumulation_steps 1 \
    --gradient_checkpointing no \
    --tuner_backend 'peft' \
    --dataset_test_ratio 0 \
    --save_strategy no \
    --eval_steps 2000000 \
    --save_steps 2000000 \
    --logging_steps 100 \
    --preprocess_num_proc 1 \
    --metric_warmup_step 0.1 \
    --fsdp_num 2 \
    --report_to 'none'
Lines changed: 27 additions & 0 deletions
@@ -0,0 +1,27 @@
# Experimental environment: 2 * A100
# 80GB GPU memory
# Note: TorchAcc is currently only available internally.

# MASTER_ADDR=127.0.0.1 \
# MASTER_PORT=12356 \
NPROC_PER_NODE=2 \
CUDA_VISIBLE_DEVICES=0,1 \
swift sft \
    --model_id_or_path ZhipuAI/chatglm3-6b \
    --dataset codefuse-python-en \
    --sft_type lora \
    --dtype AUTO \
    --output_dir output \
    --num_train_epochs 1 \
    --max_length 2048 \
    --batch_size 4 \
    --use_flash_attn true \
    --gradient_accumulation_steps 1 \
    --dataset_test_ratio 0 \
    --save_strategy no \
    --eval_steps 2000000 \
    --save_steps 2000000 \
    --logging_steps 100 \
    --preprocess_num_proc 1 \
    --metric_warmup_step 0.1 \
    --report_to 'none'
Lines changed: 35 additions & 0 deletions
@@ -0,0 +1,35 @@
# Experimental environment: 2 * A100
# 80GB GPU memory
# Note: TorchAcc is currently only available internally.

export USE_TORCHACC=1
export TORCHACC_TRIM_GRAPH=1
export XLA_FLAGS='--xla_gpu_force_compilation_parallelism=32 --xla_multiheap_size_constraint_per_heap=4831838208 --xla_disable_hlo_passes=all-gather-combiner,all-reduce-combiner,reduce-scatter-combiner,gpu-convert-async-collectives-to-sync,rematerialization'
export XLA_IR_SHAPE_CACHE_SIZE=100000000
export XLA_ALLOCATOR_FRACTION=0.95
export XLA_EXPERIMENTAL=nonzero:masked_select

NPROC_PER_NODE=2 \
CUDA_VISIBLE_DEVICES=0,1 \
swift sft \
    --model_id_or_path modelscope/Llama-2-13b-chat-ms \
    --model_layer_cls_name LlamaDecoderLayer \
    --dataset codefuse-python-en \
    --template_type llama \
    --sft_type lora \
    --output_dir output \
    --num_train_epochs 1 \
    --max_length 2048 \
    --batch_size 16 \
    --use_flash_attn true \
    --gradient_accumulation_steps 1 \
    --gradient_checkpointing no \
    --tuner_backend 'peft' \
    --dataset_test_ratio 0 \
    --save_strategy no \
    --eval_steps 2000000 \
    --save_steps 2000000 \
    --logging_steps 100 \
    --preprocess_num_proc 1 \
    --metric_warmup_step 0.1 \
    --report_to 'none'
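Relative to the Baichuan2/ChatGLM3 TorchAcc scripts earlier in this diff, the Llama-2 script above adds two settings; the lines below only restate the observed delta and do not explain TorchAcc internals, which are not documented here.

export TORCHACC_TRIM_GRAPH=1    # present only in the Llama-2/Llama-3 scripts
    --template_type llama       # the Baichuan2/ChatGLM3 scripts do not pass this flag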
Lines changed: 36 additions & 0 deletions
@@ -0,0 +1,36 @@
# Experimental environment: 2 * A100
# 80GB GPU memory
# Note: TorchAcc is currently only available internally.
export USE_TORCHACC=1
export TORCHACC_TRIM_GRAPH=1
export XLA_FLAGS='--xla_gpu_force_compilation_parallelism=32 --xla_multiheap_size_constraint_per_heap=4831838208 --xla_disable_hlo_passes=all-gather-combiner,all-reduce-combiner,reduce-scatter-combiner,gpu-convert-async-collectives-to-sync,rematerialization'
export XLA_IR_SHAPE_CACHE_SIZE=100000000
export XLA_ALLOCATOR_FRACTION=0.95
export XLA_EXPERIMENTAL=nonzero:masked_select

NPROC_PER_NODE=2 \
CUDA_VISIBLE_DEVICES=0,1 \
MASTER_PORT=27829 \
swift sft \
    --model_id_or_path modelscope/Llama-2-13b-chat-ms \
    --model_layer_cls_name LlamaDecoderLayer \
    --dataset codefuse-python-en \
    --template_type llama \
    --sft_type lora \
    --output_dir output \
    --num_train_epochs 1 \
    --max_length 2048 \
    --batch_size 24 \
    --use_flash_attn true \
    --gradient_accumulation_steps 1 \
    --gradient_checkpointing no \
    --tuner_backend 'peft' \
    --dataset_test_ratio 0 \
    --save_strategy no \
    --eval_steps 2000000 \
    --save_steps 2000000 \
    --logging_steps 100 \
    --preprocess_num_proc 1 \
    --metric_warmup_step 0.1 \
    --fsdp_num 2 \
    --report_to 'none'
Lines changed: 27 additions & 0 deletions
@@ -0,0 +1,27 @@
# Experimental environment: 2 * A100
# 80GB GPU memory
# Note: TorchAcc is currently only available internally.

# MASTER_ADDR=127.0.0.1 \

NPROC_PER_NODE=2 \
CUDA_VISIBLE_DEVICES=0,1 \
swift sft \
    --model_id_or_path modelscope/Llama-2-13b-chat-ms \
    --dataset codefuse-python-en \
    --sft_type lora \
    --dtype AUTO \
    --output_dir output \
    --num_train_epochs 1 \
    --max_length 2048 \
    --batch_size 16 \
    --use_flash_attn true \
    --gradient_accumulation_steps 1 \
    --dataset_test_ratio 0 \
    --save_strategy no \
    --eval_steps 2000000 \
    --save_steps 2000000 \
    --logging_steps 100 \
    --preprocess_num_proc 1 \
    --metric_warmup_step 0.1 \
    --report_to 'none'
Lines changed: 37 additions & 0 deletions
@@ -0,0 +1,37 @@
# Experimental environment: 2 * A100
# 80GB GPU memory
# Note: TorchAcc is currently only available internally.

export USE_TORCHACC=1
export TORCHACC_TRIM_GRAPH=1
export XLA_FLAGS='--xla_gpu_force_compilation_parallelism=32 --xla_multiheap_size_constraint_per_heap=4831838208 --xla_disable_hlo_passes=all-gather-combiner,all-reduce-combiner,reduce-scatter-combiner,gpu-convert-async-collectives-to-sync,rematerialization'
export XLA_IR_SHAPE_CACHE_SIZE=100000000
export XLA_ALLOCATOR_FRACTION=0.95
export XLA_EXPERIMENTAL=nonzero:masked_select
export XLA_COORDINATOR_PORT=12457

NPROC_PER_NODE=2 \
CUDA_VISIBLE_DEVICES=0,1 \
MASTER_PORT=21779 \
swift sft \
    --model_id_or_path LLM-Research/Meta-Llama-3-8B-Instruct \
    --model_layer_cls_name LlamaDecoderLayer \
    --dataset codefuse-python-en \
    --template_type llama3 \
    --sft_type lora \
    --output_dir output \
    --num_train_epochs 1 \
    --max_length 2048 \
    --batch_size 12 \
    --use_flash_attn true \
    --gradient_accumulation_steps 1 \
    --gradient_checkpointing no \
    --tuner_backend 'peft' \
    --dataset_test_ratio 0 \
    --save_strategy no \
    --eval_steps 2000000 \
    --save_steps 2000000 \
    --logging_steps 100 \
    --preprocess_num_proc 1 \
    --metric_warmup_step 0.1 \
    --report_to 'none'
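The Llama-3 script above additionally pins XLA_COORDINATOR_PORT=12457 and MASTER_PORT=21779. Reading these as ordinary port choices made to avoid clashes with other jobs on the same host is an assumption; if they are already in use, any free ports should work, for example:

# Hypothetical override: pick free ports before launching; all other flags as above.
export XLA_COORDINATOR_PORT=12500
MASTER_PORT=29500 NPROC_PER_NODE=2 CUDA_VISIBLE_DEVICES=0,1 swift sft ...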
