Commit ab1d992
[TorchAcc] cache the compiled results and remove some xla flags (#1160)
1 parent 2956815 commit ab1d992


15 files changed, +55 -25 lines changed

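All of the scripts touched by this commit follow the same pattern: the hand-tuned XLA_FLAGS export is dropped, and a per-model persistent compilation cache directory is configured so that XLA compilation results are written to disk and reused on later runs. A minimal sketch of the shared setup, assuming a placeholder <model-name> for the per-model directory:

export USE_TORCHACC=1
# Persistent compilation cache: compiled XLA programs are stored here and
# picked up again on subsequent runs instead of being recompiled.
export XLA_PERSISTENT_CACHE_PATH=./output/compiled_cache/<model-name>
mkdir -p $XLA_PERSISTENT_CACHE_PATH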

examples/pytorch/llm/scripts/torchacc/baichuan2_13b_chat/acc_lora_dp_sft.sh

Lines changed: 3 additions & 1 deletion
@@ -3,11 +3,13 @@
 # Note: TorchAcc is currently only available internally.
 # torchacc dp
 export USE_TORCHACC=1
-export XLA_FLAGS='--xla_gpu_force_compilation_parallelism=32 --xla_multiheap_size_constraint_per_heap=4831838208 --xla_disable_hlo_passes=all-gather-combiner,all-reduce-combiner,reduce-scatter-combiner,gpu-convert-async-collectives-to-sync,rematerialization'
 export XLA_IR_SHAPE_CACHE_SIZE=100000000
 export XLA_ALLOCATOR_FRACTION=0.95
 export XLA_EXPERIMENTAL=nonzero:masked_select
 
+export XLA_PERSISTENT_CACHE_PATH=./output/compiled_cache/Baichuan2-13B-Chat
+mkdir -p $XLA_PERSISTENT_CACHE_PATH
+
 NPROC_PER_NODE=2 \
 CUDA_VISIBLE_DEVICES=0,1 \
 MASTER_PORT=27829 \

examples/pytorch/llm/scripts/torchacc/baichuan2_13b_chat/acc_lora_fsdp_sft.sh

Lines changed: 3 additions & 1 deletion
@@ -3,11 +3,13 @@
 # Note: TorchAcc is currently only available internally.
 # torchacc fsdp
 export USE_TORCHACC=1
-export XLA_FLAGS='--xla_gpu_force_compilation_parallelism=32 --xla_multiheap_size_constraint_per_heap=4831838208 --xla_disable_hlo_passes=all-gather-combiner,all-reduce-combiner,reduce-scatter-combiner,gpu-convert-async-collectives-to-sync,rematerialization'
 export XLA_IR_SHAPE_CACHE_SIZE=100000000
 export XLA_ALLOCATOR_FRACTION=0.95
 export XLA_EXPERIMENTAL=nonzero:masked_select
 
+export XLA_PERSISTENT_CACHE_PATH=./output/compiled_cache/Baichuan2-13B-Chat
+mkdir -p $XLA_PERSISTENT_CACHE_PATH
+
 NPROC_PER_NODE=2 \
 CUDA_VISIBLE_DEVICES=0,1 \
 swift sft \

examples/pytorch/llm/scripts/torchacc/chatglm3_6b/acc_lora_dp_sft.sh

Lines changed: 3 additions & 1 deletion
@@ -3,11 +3,13 @@
 # Note: TorchAcc is currently only available internally.
 # torchacc dp
 export USE_TORCHACC=1
-export XLA_FLAGS='--xla_gpu_force_compilation_parallelism=32 --xla_multiheap_size_constraint_per_heap=4831838208 --xla_disable_hlo_passes=all-gather-combiner,all-reduce-combiner,reduce-scatter-combiner,gpu-convert-async-collectives-to-sync,rematerialization'
 export XLA_IR_SHAPE_CACHE_SIZE=100000000
 export XLA_ALLOCATOR_FRACTION=0.95
 export XLA_EXPERIMENTAL=nonzero:masked_select
 
+export XLA_PERSISTENT_CACHE_PATH=./output/compiled_cache/chatglm3-6b
+mkdir -p $XLA_PERSISTENT_CACHE_PATH
+
 
 NPROC_PER_NODE=2 \
 CUDA_VISIBLE_DEVICES=0,1 \

examples/pytorch/llm/scripts/torchacc/chatglm3_6b/acc_lora_fsdp_sft.sh

Lines changed: 3 additions & 1 deletion
@@ -3,11 +3,13 @@
 # Note: TorchAcc is currently only available internally.
 # torchacc fsdp
 export USE_TORCHACC=1
-export XLA_FLAGS='--xla_gpu_force_compilation_parallelism=32 --xla_multiheap_size_constraint_per_heap=4831838208 --xla_disable_hlo_passes=all-gather-combiner,all-reduce-combiner,reduce-scatter-combiner,gpu-convert-async-collectives-to-sync,rematerialization'
 export XLA_IR_SHAPE_CACHE_SIZE=100000000
 export XLA_ALLOCATOR_FRACTION=0.95
 export XLA_EXPERIMENTAL=nonzero:masked_select
 
+export XLA_PERSISTENT_CACHE_PATH=./output/compiled_cache/chatglm3-6b
+mkdir -p $XLA_PERSISTENT_CACHE_PATH
+
 
 NPROC_PER_NODE=2 \
 CUDA_VISIBLE_DEVICES=0,1 \

examples/pytorch/llm/scripts/torchacc/llama2_13b_chat/acc_lora_dp_sft.sh

Lines changed: 4 additions & 2 deletions
@@ -4,11 +4,13 @@
 
 export USE_TORCHACC=1
 export TORCHACC_TRIM_GRAPH=1
-export XLA_FLAGS='--xla_gpu_force_compilation_parallelism=32 --xla_multiheap_size_constraint_per_heap=4831838208 --xla_disable_hlo_passes=all-gather-combiner,all-reduce-combiner,reduce-scatter-combiner,gpu-convert-async-collectives-to-sync,rematerialization'
 export XLA_IR_SHAPE_CACHE_SIZE=100000000
 export XLA_ALLOCATOR_FRACTION=0.95
 export XLA_EXPERIMENTAL=nonzero:masked_select
 
+export XLA_PERSISTENT_CACHE_PATH=./output/compiled_cache/Llama-2-13b-chat-ms
+mkdir -p $XLA_PERSISTENT_CACHE_PATH
+
 NPROC_PER_NODE=2 \
 CUDA_VISIBLE_DEVICES=0,1 \
 swift sft \
@@ -20,7 +22,7 @@ swift sft \
 --output_dir output \
 --num_train_epochs 1 \
 --max_length 2048 \
---batch_size 16 \
+--batch_size 14 \
 --use_flash_attn true \
 --gradient_accumulation_steps 1 \
 --gradient_checkpointing no \

examples/pytorch/llm/scripts/torchacc/llama2_13b_chat/acc_lora_fsdp_sft.sh

Lines changed: 4 additions & 2 deletions
@@ -3,11 +3,13 @@
 # Note: TorchAcc is currently only available internally.
 export USE_TORCHACC=1
 export TORCHACC_TRIM_GRAPH=1
-export XLA_FLAGS='--xla_gpu_force_compilation_parallelism=32 --xla_multiheap_size_constraint_per_heap=4831838208 --xla_disable_hlo_passes=all-gather-combiner,all-reduce-combiner,reduce-scatter-combiner,gpu-convert-async-collectives-to-sync,rematerialization'
 export XLA_IR_SHAPE_CACHE_SIZE=100000000
 export XLA_ALLOCATOR_FRACTION=0.95
 export XLA_EXPERIMENTAL=nonzero:masked_select
 
+export XLA_PERSISTENT_CACHE_PATH=./output/compiled_cache/Llama-2-13b-chat-ms
+mkdir -p $XLA_PERSISTENT_CACHE_PATH
+
 NPROC_PER_NODE=2 \
 CUDA_VISIBLE_DEVICES=0,1 \
 MASTER_PORT=27829 \
@@ -20,7 +22,7 @@ swift sft \
 --output_dir output \
 --num_train_epochs 1 \
 --max_length 2048 \
---batch_size 24 \
+--batch_size 20 \
 --use_flash_attn true \
 --gradient_accumulation_steps 1 \
 --gradient_checkpointing no \

examples/pytorch/llm/scripts/torchacc/llama3_8b_instruct/acc_lora_dp_sft.sh

Lines changed: 3 additions & 2 deletions
@@ -4,11 +4,12 @@
 
 export USE_TORCHACC=1
 export TORCHACC_TRIM_GRAPH=1
-export XLA_FLAGS='--xla_gpu_force_compilation_parallelism=32 --xla_multiheap_size_constraint_per_heap=4831838208 --xla_disable_hlo_passes=all-gather-combiner,all-reduce-combiner,reduce-scatter-combiner,gpu-convert-async-collectives-to-sync,rematerialization'
 export XLA_IR_SHAPE_CACHE_SIZE=100000000
 export XLA_ALLOCATOR_FRACTION=0.95
 export XLA_EXPERIMENTAL=nonzero:masked_select
-export XLA_COORDINATOR_PORT=12457
+
+export XLA_PERSISTENT_CACHE_PATH=./output/compiled_cache/Meta-Llama-3-8B-Instruct
+mkdir -p $XLA_PERSISTENT_CACHE_PATH
 
 NPROC_PER_NODE=2 \
 CUDA_VISIBLE_DEVICES=0,1 \

examples/pytorch/llm/scripts/torchacc/llama3_8b_instruct/acc_lora_fsdp_sft.sh

Lines changed: 3 additions & 2 deletions
@@ -3,11 +3,12 @@
 # Note: TorchAcc is currently only available internally.
 export USE_TORCHACC=1
 export TORCHACC_TRIM_GRAPH=1
-export XLA_FLAGS='--xla_gpu_force_compilation_parallelism=32 --xla_multiheap_size_constraint_per_heap=4831838208 --xla_disable_hlo_passes=all-gather-combiner,all-reduce-combiner,reduce-scatter-combiner,gpu-convert-async-collectives-to-sync,rematerialization'
 export XLA_IR_SHAPE_CACHE_SIZE=100000000
 export XLA_ALLOCATOR_FRACTION=0.95
 export XLA_EXPERIMENTAL=nonzero:masked_select
-# export XLA_COORDINATOR_PORT=12457
+
+export XLA_PERSISTENT_CACHE_PATH=./output/compiled_cache/Meta-Llama-3-8B-Instruct
+mkdir -p $XLA_PERSISTENT_CACHE_PATH
 
 NPROC_PER_NODE=2 \
 CUDA_VISIBLE_DEVICES=0,1 \

examples/pytorch/llm/scripts/torchacc/qwen1half_14b_chat/acc_lora_dp_sft.sh

Lines changed: 4 additions & 2 deletions
@@ -2,12 +2,14 @@
 # 80GB GPU memory
 # Note: TorchAcc is currently only available internally.
 export USE_TORCHACC=1
-# export TORCHACC_TRIM_GRAPH=1
-export XLA_FLAGS='--xla_gpu_force_compilation_parallelism=32 --xla_multiheap_size_constraint_per_heap=4831838208 --xla_disable_hlo_passes=all-gather-combiner,all-reduce-combiner,reduce-scatter-combiner,gpu-convert-async-collectives-to-sync,rematerialization'
+export TORCHACC_TRIM_GRAPH=1
 export XLA_IR_SHAPE_CACHE_SIZE=1000000000
 export XLA_ALLOCATOR_FRACTION=0.95
 export XLA_EXPERIMENTAL=nonzero:masked_select
 
+export XLA_PERSISTENT_CACHE_PATH=./output/compiled_cache/qwen1half-14b-chat
+mkdir -p $XLA_PERSISTENT_CACHE_PATH
+
 NPROC_PER_NODE=2 \
 CUDA_VISIBLE_DEVICES=2,3 \
 MASTER_PORT=23797 \

examples/pytorch/llm/scripts/torchacc/qwen1half_14b_chat/acc_lora_fsdp_sft.sh

Lines changed: 4 additions & 1 deletion
@@ -5,10 +5,13 @@ DEBUG_PREFIX=qwen15_14b
 DEBUG_PATH=torchacc_debug/qwen15/
 export USE_TORCHACC=1
 # export TORCHACC_TRIM_GRAPH=1
-export XLA_FLAGS='--xla_gpu_force_compilation_parallelism=32 --xla_multiheap_size_constraint_per_heap=4831838208 --xla_disable_hlo_passes=all-gather-combiner,all-reduce-combiner,reduce-scatter-combiner,gpu-convert-async-collectives-to-sync,rematerialization'
 export XLA_IR_SHAPE_CACHE_SIZE=1000000000
 export XLA_ALLOCATOR_FRACTION=0.95
 export XLA_EXPERIMENTAL=nonzero:masked_select
+
+export XLA_PERSISTENT_CACHE_PATH=./output/compiled_cache/qwen1half-14b-chat
+mkdir -p $XLA_PERSISTENT_CACHE_PATH
+
 MASTER_PORT=23783 \
 NPROC_PER_NODE=2 \
 CUDA_VISIBLE_DEVICES=0,1 \
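A quick way to see the cache take effect is a hypothetical check like the one below, run from the repository root so the relative cache path resolves; the exact layout of the cache files is an implementation detail of XLA's persistent cache.

# First run compiles the model and populates the per-model cache directory.
bash examples/pytorch/llm/scripts/torchacc/llama3_8b_instruct/acc_lora_fsdp_sft.sh
ls ./output/compiled_cache/Meta-Llama-3-8B-Instruct
# A second run with the same configuration should reuse the cached compilation results.
bash examples/pytorch/llm/scripts/torchacc/llama3_8b_instruct/acc_lora_fsdp_sft.sh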
