@@ -0,0 +1,39 @@
# Experimental environment: 4 * A100
# 80GB GPU memory
# Note: TorchAcc is currently only available internally.
export USE_TORCHACC=1
# XLA tuning: enlarge the IR shape cache, let the XLA allocator claim up to 96%
# of GPU memory, and enable experimental lowering for nonzero/masked_select.
export XLA_IR_SHAPE_CACHE_SIZE=100000000
export XLA_ALLOCATOR_FRACTION=0.96
export XLA_EXPERIMENTAL=nonzero:masked_select

# Persistent XLA compilation cache, so later runs can reuse compiled graphs.
export XLA_PERSISTENT_CACHE_PATH=./output/compiled_cache/Meta-Llama-3-70B-Instruct
mkdir -p $XLA_PERSISTENT_CACHE_PATH

NPROC_PER_NODE=4 \
CUDA_VISIBLE_DEVICES=0,1,2,3 \
MASTER_PORT=27829 \
swift sft \
--model_id_or_path LLM-Research/Meta-Llama-3-70B-Instruct \
--model_layer_cls_name LlamaDecoderLayer \
--dataset codefuse-python-en \
--template_type llama3 \
--sft_type lora \
--output_dir output \
--train_dataset_sample -1 \
--tuner_backend 'peft' \
--num_train_epochs 1 \
--max_length 2048 \
--batch_size 6 \
--use_flash_attn true \
--gradient_accumulation_steps 1 \
--gradient_checkpointing no \
--dataset_test_ratio 0 \
--save_strategy no \
--eval_steps 2000000 \
--save_steps 2000000 \
--logging_steps 100 \
--acc_steps 100 \
--preprocess_num_proc 1 \
--metric_warmup_step 0.1 \
--fsdp_num 4 \
--report_to 'none'
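For a rough sense of what this launch processes per optimizer step, here is a back-of-the-envelope sketch (illustrative only; it assumes `--batch_size` is the per-process batch and that the four processes consume distinct data, which is the usual data-parallel reading of these flags):

```python
# Back-of-the-envelope arithmetic for the TorchAcc launch above (not part of swift).
nproc_per_node = 4     # NPROC_PER_NODE / CUDA_VISIBLE_DEVICES=0,1,2,3
per_device_batch = 6   # --batch_size
grad_accum = 1         # --gradient_accumulation_steps
max_length = 2048      # --max_length

sequences_per_step = nproc_per_node * per_device_batch * grad_accum  # 24 sequences
max_tokens_per_step = sequences_per_step * max_length                # 49,152 tokens
print(sequences_per_step, max_tokens_per_step)
```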
@@ -0,0 +1,33 @@
# Experimental environment: 4 * A100

# Run without torch_xla (native PyTorch + DeepSpeed path).
export USE_TORCH_XLA=0

NPROC_PER_NODE=4 \
CUDA_VISIBLE_DEVICES=0,1,2,3 \
MASTER_PORT=29500 \
swift sft \
--model_id_or_path LLM-Research/Meta-Llama-3-70B-Instruct \
--model_revision master \
--sft_type lora \
--dataset codefuse-python-en \
--template_type llama3 \
--dtype AUTO \
--output_dir output \
--ddp_backend nccl \
--train_dataset_sample -1 \
--tuner_backend 'peft' \
--num_train_epochs 1 \
--max_length 2048 \
--batch_size 4 \
--use_flash_attn true \
--gradient_accumulation_steps 1 \
--gradient_checkpointing true \
--dataset_test_ratio 0 \
--save_strategy no \
--eval_steps 2000000 \
--save_steps 2000000 \
--logging_steps 100 \
--acc_steps 100 \
--preprocess_num_proc 1 \
--metric_warmup_step 0.1 \
    --deepspeed default-zero3
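Applying the same arithmetic to this ZeRO-3 baseline makes the comparison explicit: it uses a per-device batch of 4 with gradient checkpointing, while the TorchAcc run above uses 6 without it. A compact sketch (same per-process batch assumption as before):

```python
# Tokens per optimizer step for the two launches above (illustrative only).
max_length = 2048
torchacc_tokens = 4 * 6 * 1 * max_length  # batch 6/GPU, no gradient checkpointing
zero3_tokens = 4 * 4 * 1 * max_length     # batch 4/GPU, gradient checkpointing on
print(torchacc_tokens, zero3_tokens, torchacc_tokens / zero3_tokens)  # 49152 32768 1.5
```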
@@ -15,7 +15,7 @@ swift sft \
--output_dir output \
--num_train_epochs 1 \
--max_length 2048 \
-    --batch_size 16 \
+    --batch_size 12 \
--use_flash_attn true \
--gradient_accumulation_steps 1 \
--dataset_test_ratio 0 \
swift/llm/accelerator.py (2 changes: 1 addition & 1 deletion)
@@ -30,5 +30,5 @@ def get_ta_config():
        return config

    ta_config = get_ta_config()
-   model = ta.accelerate(model, ta_config)
+   model = ta.accelerate(model, config=ta_config)
    return model
swift/trainers/mixin.py (2 changes: 2 additions & 0 deletions)
@@ -570,6 +570,8 @@ def _maybe_log_save_evaluate(self, tr_loss, *args, **kwargs):
            if k == 'loss':
                self._total_loss_scalar += v_scalar
            logs[k] = round(v_scalar / (self.state.global_step - self._globalstep_last_logged), 8)
+           if k == 'acc' and self._globalstep_last_logged > 0:
+               logs[k] *= self.sft_args.acc_steps
        if version.parse(transformers.__version__) >= version.parse('4.38'):
            grad_norm = args[0]
            if isinstance(grad_norm, torch.Tensor):
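The two added lines rescale the logged token accuracy. The surrounding code averages each accumulated scalar over every global step since the last log (as it does for the loss), but with `--acc_steps 100` accuracy appears to be accumulated only once every `acc_steps` steps, so that average undercounts by a factor of `acc_steps`; multiplying by `sft_args.acc_steps` restores a comparable value. A minimal sketch of the arithmetic (illustrative only, assuming one accuracy sample per `acc_steps` steps; not swift's actual code):

```python
# Why the logged 'acc' is multiplied by acc_steps (illustrative sketch).
# Assumption (not shown in the diff): accuracy is accumulated once per acc_steps
# optimizer steps, while the log window covers every step since the last log.
logging_steps = 100   # --logging_steps
acc_steps = 100       # --acc_steps
true_acc = 0.85

accumulated = true_acc * (logging_steps // acc_steps)  # one sample -> 0.85
averaged = accumulated / logging_steps                 # averaged over the full window -> 0.0085
rescaled = averaged * acc_steps                        # restores the comparable value -> 0.85
print(averaged, rescaled)
```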