@@ -0,0 +1,39 @@
# Experimental environment: 4 * A100
# 80GB GPU memory
# Note: TorchAcc is currently only available internally.
export USE_TORCHACC=1
# XLA tuning: enlarge the IR shape cache, let the XLA allocator claim up to 96%
# of GPU memory, and enable experimental lowering for nonzero/masked_select.
export XLA_IR_SHAPE_CACHE_SIZE=100000000
export XLA_ALLOCATOR_FRACTION=0.96
export XLA_EXPERIMENTAL=nonzero:masked_select

# Persistent XLA compilation cache, so later runs can reuse compiled graphs.
export XLA_PERSISTENT_CACHE_PATH=./output/compiled_cache/Meta-Llama-3-70B-Instruct
mkdir -p $XLA_PERSISTENT_CACHE_PATH

NPROC_PER_NODE=4 \
CUDA_VISIBLE_DEVICES=0,1,2,3 \
MASTER_PORT=27829 \
swift sft \
--model_id_or_path LLM-Research/Meta-Llama-3-70B-Instruct \
--model_layer_cls_name LlamaDecoderLayer \
--dataset codefuse-python-en \
--template_type llama3 \
--sft_type lora \
--output_dir output \
--train_dataset_sample -1 \
--tuner_backend 'peft' \
--num_train_epochs 1 \
--max_length 2048 \
--batch_size 6 \
--use_flash_attn true \
--gradient_accumulation_steps 1 \
--gradient_checkpointing no \
--dataset_test_ratio 0 \
--save_strategy no \
--eval_steps 2000000 \
--save_steps 2000000 \
--logging_steps 100 \
--acc_steps 100 \
--preprocess_num_proc 1 \
--metric_warmup_step 0.1 \
--fsdp_num 4 \
--report_to 'none'
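For a rough sense of what this launch processes per optimizer step, here is a back-of-the-envelope sketch (illustrative only; it assumes `--batch_size` is the per-process batch and that the four processes consume distinct data, which is the usual data-parallel reading of these flags):

```python
# Back-of-the-envelope arithmetic for the TorchAcc launch above (not part of swift).
nproc_per_node = 4     # NPROC_PER_NODE / CUDA_VISIBLE_DEVICES=0,1,2,3
per_device_batch = 6   # --batch_size
grad_accum = 1         # --gradient_accumulation_steps
max_length = 2048      # --max_length

sequences_per_step = nproc_per_node * per_device_batch * grad_accum  # 24 sequences
max_tokens_per_step = sequences_per_step * max_length                # 49,152 tokens
print(sequences_per_step, max_tokens_per_step)
```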
@@ -0,0 +1,33 @@
# Experimental environment: 4 * A100

# Run without torch_xla (native PyTorch + DeepSpeed path).
export USE_TORCH_XLA=0

NPROC_PER_NODE=4 \
CUDA_VISIBLE_DEVICES=0,1,2,3 \
MASTER_PORT=29500 \
swift sft \
--model_id_or_path LLM-Research/Meta-Llama-3-70B-Instruct \
--model_revision master \
--sft_type lora \
--dataset codefuse-python-en \
--template_type llama3 \
--dtype AUTO \
--output_dir output \
--ddp_backend nccl \
--train_dataset_sample -1 \
--tuner_backend 'peft' \
--num_train_epochs 1 \
--max_length 2048 \
--batch_size 4 \
--use_flash_attn true \
--gradient_accumulation_steps 1 \
--gradient_checkpointing true \
--dataset_test_ratio 0 \
--save_strategy no \
--eval_steps 2000000 \
--save_steps 2000000 \
--logging_steps 100 \
--acc_steps 100 \
--preprocess_num_proc 1 \
--metric_warmup_step 0.1 \
    --deepspeed default-zero3
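Applying the same arithmetic to this ZeRO-3 baseline makes the comparison explicit: it uses a per-device batch of 4 with gradient checkpointing, while the TorchAcc run above uses 6 without it. A compact sketch (same per-process batch assumption as before):

```python
# Tokens per optimizer step for the two launches above (illustrative only).
max_length = 2048
torchacc_tokens = 4 * 6 * 1 * max_length  # batch 6/GPU, no gradient checkpointing
zero3_tokens = 4 * 4 * 1 * max_length     # batch 4/GPU, gradient checkpointing on
print(torchacc_tokens, zero3_tokens, torchacc_tokens / zero3_tokens)  # 49152 32768 1.5
```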
@@ -15,7 +15,7 @@ swift sft \
--output_dir output \
--num_train_epochs 1 \
--max_length 2048 \
-    --batch_size 16 \
+    --batch_size 12 \
--use_flash_attn true \
--gradient_accumulation_steps 1 \
--dataset_test_ratio 0 \
swift/llm/accelerator.py (2 changes: 1 addition & 1 deletion)
@@ -30,5 +30,5 @@ def get_ta_config():
        return config

    ta_config = get_ta_config()
-   model = ta.accelerate(model, ta_config)
+   model = ta.accelerate(model, config=ta_config)
    return model
swift/trainers/mixin.py (2 changes: 2 additions & 0 deletions)
@@ -570,6 +570,8 @@ def _maybe_log_save_evaluate(self, tr_loss, *args, **kwargs):
            if k == 'loss':
                self._total_loss_scalar += v_scalar
            logs[k] = round(v_scalar / (self.state.global_step - self._globalstep_last_logged), 8)
+           if k == 'acc' and self._globalstep_last_logged > 0:
+               logs[k] *= self.sft_args.acc_steps
        if version.parse(transformers.__version__) >= version.parse('4.38'):
            grad_norm = args[0]
            if isinstance(grad_norm, torch.Tensor):
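The two added lines rescale the logged token accuracy. The surrounding code averages each accumulated scalar over every global step since the last log (as it does for the loss), but with `--acc_steps 100` accuracy appears to be accumulated only once every `acc_steps` steps, so that average undercounts by a factor of `acc_steps`; multiplying by `sft_args.acc_steps` restores a comparable value. A minimal sketch of the arithmetic (illustrative only, assuming one accuracy sample per `acc_steps` steps; not swift's actual code):

```python
# Why the logged 'acc' is multiplied by acc_steps (illustrative sketch).
# Assumption (not shown in the diff): accuracy is accumulated once per acc_steps
# optimizer steps, while the log window covers every step since the last log.
logging_steps = 100   # --logging_steps
acc_steps = 100       # --acc_steps
true_acc = 0.85

accumulated = true_acc * (logging_steps // acc_steps)  # one sample -> 0.85
averaged = accumulated / logging_steps                 # averaged over the full window -> 0.0085
rescaled = averaged * acc_steps                        # restores the comparable value -> 0.85
print(averaged, rescaled)
```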