Commit 19311b5

update clear_grad
1 parent 5da5c36 commit 19311b5

2 files changed, +7 -2 lines

paddlenlp/trainer/trainer.py

Lines changed: 1 addition & 1 deletion
@@ -1163,7 +1163,7 @@ def fused_allreduce_gradients_no_sync(paramlist, hcg):
     if optimizer_was_run:
         self.lr_scheduler.step()

-    if enable_release_grads:
+    if args.release_grads or enable_release_grads:
         self.optimizer.clear_grad(set_to_zero=False)
         if args.pipeline_parallel_degree > 1:
             for _, buffers in model._chunk_2_comm_buffers.items():
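
A minimal sketch of the post-step flow this hunk touches, assuming stand-in objects for the trainer's members (the function name and arguments below are illustrative, not the trainer's real attributes): after the change, the standalone `release_grads` training argument triggers the same gradient release as the sharding option `enable_release_grads`.

# Hedged sketch only: object names are stand-ins; the branch condition
# mirrors the diff above.
def finish_optimizer_step(args, optimizer, lr_scheduler,
                          optimizer_was_run=True, enable_release_grads=False):
    if optimizer_was_run:
        lr_scheduler.step()

    # New in this commit: the standalone `release_grads` training argument now
    # triggers gradient release alongside the sharding option
    # `enable_release_grads`.
    if getattr(args, "release_grads", False) or enable_release_grads:
        # set_to_zero=False drops the gradient storage instead of zero-filling
        # it, lowering peak memory; gradients are re-allocated in the next
        # backward pass.
        optimizer.clear_grad(set_to_zero=False)
    else:
        optimizer.clear_grad()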

paddlenlp/trainer/training_args.py

Lines changed: 6 additions & 1 deletion
@@ -271,7 +271,7 @@ class TrainingArguments:
         enable_stage1_broadcast_overlap, overlap stage1 V1 broadcast with next step forward computation. There are some constraints for the overlap, such as the logging_step should be bigger than 1 for broadcast overlap forward compute and no other sync could be called during the training for broadcast overlap.
         enable_stage1_allgather_overlap, overlap stage1 V2 allgather with next step forward computation. There are some constraints for the overlap, such as the logging_step should be bigger than 1 for allgather overlap forward compute and no other sync could be called during the training for allgather overlap.
         disable_stage1_reduce_avg, replace reduce_avg with original reduce_sum+scale in stage1, which can be used for accuracy verification.
-        enable_release_graHEADds, reduce peak memory usage by releasing gradients after each iteration. The creation of gradients will be postponed until backward propagation of the next iteration.
+        enable_release_grads, reduce peak memory usage by releasing gradients after each iteration. The creation of gradients will be postponed until backward propagation of the next iteration.
         recompute (`bool`, *optional*, defaults to `False`):
             Recompute the forward pass to calculate gradients. Used for saving memory.
             Only support for networks with transformer blocks.
@@ -355,6 +355,8 @@ class TrainingArguments:
             Whether skip profile timer, timer will record time usage of forward/ backward/ step, etc.
         distributed_dataloader (`bool`, *optional*):
             Whether to use distributed dataloader. Default is `False`.
+        release_grads (`bool`, *optional*):
+            Whether to release gradients during training. Default is `False`.
     """

     output_dir: str = field(
@@ -832,6 +834,9 @@ class TrainingArguments:
         default=False,
         metadata={"help": "Enable MoE (Mixture of Experts) expert parallel training"},
     )
+    release_grads: Optional[bool] = field(
+        default=False, metadata={"help": "Whether to release gradients during training. Default is `False`."}
+    )

     def __post_init__(self):
         env_local_rank = int(os.environ.get("PADDLE_RANK_IN_NODE", -1))
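
A hedged usage sketch of the new field, assuming only what the diff adds: `release_grads` becomes an ordinary boolean training argument, so it can be set in code as below or passed on the command line (for example `--release_grads true`); the other values shown are illustrative.

# Illustrative only: enabling the new flag through PaddleNLP's TrainingArguments.
# The `release_grads` field comes from this diff; `output_dir` is an example value.
from paddlenlp.trainer import TrainingArguments

training_args = TrainingArguments(
    output_dir="./checkpoints",
    release_grads=True,  # release gradient storage after each optimizer step
)
print(training_args.release_grads)  # True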
