
Commit efa0114

🎬 Clip higher (huggingface#3118)
* epsilon range added
* epsilon doc str updated
* test removed
* pre-commit run
* Update trl/trainer/grpo_config.py

  Co-authored-by: Quentin Gallouédec <[email protected]>

* upper epsilon updated
* precommit updates added
* minor format and dtype fixes
* moving upper bound computation in init
* hf.co for paper link

---------

Co-authored-by: Quentin Gallouédec <[email protected]>
Co-authored-by: Quentin Gallouédec <[email protected]>
1 parent d0da1a7 commit efa0114

File tree

2 files changed: +13 -2 lines

* trl/trainer/grpo_config.py
* trl/trainer/grpo_trainer.py

trl/trainer/grpo_config.py

Lines changed: 10 additions & 0 deletions
```diff
@@ -115,6 +115,9 @@ class GRPOConfig(TrainingArguments):
             Number of iterations per batch (denoted as μ in the algorithm).
         epsilon (`float`, *optional*, defaults to `0.2`):
             Epsilon value for clipping.
+        epsilon_high (`float` or `None`, *optional*, defaults to `None`):
+            Upper-bound epsilon value for clipping. If not specified, it defaults to the same value as the lower-bound
+            specified in argument `epsilon`. Paper [DAPO](https://huggingface.co/papers/2503.14476) recommends `0.28`.
         reward_weights (`list[float]` or `None`, *optional*, defaults to `None`):
             Weights for each reward function. Must match the number of reward functions. If `None`, all rewards are
             weighted equally with weight `1.0`.
@@ -300,6 +303,13 @@ class GRPOConfig(TrainingArguments):
         default=0.2,
         metadata={"help": "Epsilon value for clipping."},
     )
+    epsilon_high: Optional[float] = field(
+        default=None,
+        metadata={
+            "help": "Upper-bound epsilon value for clipping. If not specified, it defaults to the same value as the "
+            "lower-bound specified in argument `epsilon`. Paper DAPO recommends `0.28`."
+        },
+    )
     reward_weights: Optional[list[float]] = field(
         default=None,
         metadata={
```
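For reference, enabling the new asymmetric range from the user side would look roughly like the sketch below. This is a minimal, illustrative example: it assumes a TRL version that includes this commit, and `output_dir` is just a placeholder path.

```python
from trl import GRPOConfig

# Sketch: DAPO-style "clip higher" range. `epsilon` remains the lower bound
# (default 0.2); `epsilon_high` raises only the upper bound (DAPO suggests 0.28).
training_args = GRPOConfig(
    output_dir="grpo-clip-higher",  # placeholder output directory
    epsilon=0.2,
    epsilon_high=0.28,
)
```

Leaving `epsilon_high=None` keeps the previous symmetric behaviour, since the trainer falls back to `epsilon` for the upper bound.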

trl/trainer/grpo_trainer.py

Lines changed: 3 additions & 2 deletions
```diff
@@ -388,7 +388,8 @@ def data_collator(features):  # No data collation is needed in GRPO
 
         # Multi-step
         self.num_iterations = args.num_iterations  # = 𝜇 in the GRPO paper
-        self.epsilon = args.epsilon
+        self.epsilon_low = args.epsilon
+        self.epsilon_high = args.epsilon_high if args.epsilon_high is not None else args.epsilon
         # Tracks the number of iterations (forward + backward passes), including those within a gradient accumulation cycle.
         self._step = 0
         # Buffer the batch to reuse generated outputs across multiple updates. For more details, see
@@ -975,7 +976,7 @@ def compute_loss(self, model, inputs, return_outputs=False, num_items_in_batch=N
         # _generate_and_score_completions) and use per_token_logps.detach() instead.
         old_per_token_logps = inputs["old_per_token_logps"] if self.num_iterations > 1 else per_token_logps.detach()
         coef_1 = torch.exp(per_token_logps - old_per_token_logps)
-        coef_2 = torch.clamp(coef_1, 1 - self.epsilon, 1 + self.epsilon)
+        coef_2 = torch.clamp(coef_1, 1 - self.epsilon_low, 1 + self.epsilon_high)
        per_token_loss1 = coef_1 * advantages.unsqueeze(1)
        per_token_loss2 = coef_2 * advantages.unsqueeze(1)
        per_token_loss = -torch.min(per_token_loss1, per_token_loss2)
```
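To make the effect of the changed `torch.clamp` call concrete, here is a standalone, illustrative sketch of asymmetric ratio clipping with made-up tensors; it mirrors the logic above but is not TRL code:

```python
import torch

# Made-up per-token log-probabilities for one sequence of three tokens.
per_token_logps = torch.tensor([[-1.20, -0.50, -2.10]])
old_per_token_logps = torch.tensor([[-1.50, -0.90, -2.00]])
advantages = torch.tensor([1.0])  # one advantage per sequence

epsilon_low, epsilon_high = 0.2, 0.28  # asymmetric "clip higher" bounds

coef_1 = torch.exp(per_token_logps - old_per_token_logps)        # importance ratios
coef_2 = torch.clamp(coef_1, 1 - epsilon_low, 1 + epsilon_high)  # clip to [0.8, 1.28]
per_token_loss1 = coef_1 * advantages.unsqueeze(1)
per_token_loss2 = coef_2 * advantages.unsqueeze(1)
per_token_loss = -torch.min(per_token_loss1, per_token_loss2)

print(coef_1)  # tensor([[1.3499, 1.4918, 0.9048]])
print(coef_2)  # tensor([[1.2800, 1.2800, 0.9048]])
```

With the old symmetric bound, both ratios above 1.2 would have been clipped at 1.2; raising only the upper bound gives positive-advantage tokens more headroom before clipping, which is the "Clip Higher" idea from the DAPO paper referenced in the docstring.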
