9 changes: 9 additions & 0 deletions trl/trainer/grpo_config.py
@@ -101,6 +101,8 @@ class GRPOConfig(TrainingArguments):
speed, but may be numerically unstable for long training runs.
num_iterations (`int`, *optional*, defaults to `1`):
Number of iterations per batch (denoted as μ in the algorithm).
use_max_tokens_norm (`bool`, *optional*, defaults to `False`):
Member:
So this is the loss proposed in Dr GRPO, correct?
If so, I think it should be explicitly mentioned in the doc.

Collaborator (Author):
I am not sure actually, I thought that was our current implementation. I will take another look.

Member:
Currently we use a modified version of DAPO where we normalize per local batch (and not per group).

[Figure: comparison of loss-normalization schemes]

In the above figure, we use something between BNPO (hard to implement with grad accum) and DAPO.
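
For illustration, a minimal sketch of the reductions under discussion, using dummy tensors and made-up shapes (not TRL's actual code):

```python
import torch

# Dummy per-token losses for a device batch of 4 completions, up to 6 completion tokens each.
per_token_loss = torch.randn(4, 6).abs()
completion_mask = torch.ones(4, 6)
completion_mask[0, 4:] = 0  # pretend the first completion is shorter

# Sequence-level reduction (as in the original GRPO formulation):
# mean over each completion's tokens, then over completions.
seq_level = ((per_token_loss * completion_mask).sum(-1) / completion_mask.sum(-1)).mean()

# Per-local-batch token reduction (current behaviour, DAPO-like within a device batch).
batch_token = (per_token_loss * completion_mask).sum() / completion_mask.sum()

# Max-tokens norm (this PR): divide by a fixed upper bound instead of the realized token count,
# e.g. per_device_train_batch_size * (max_prompt_length + max_completion_length) = 4 * (3 + 6).
max_tokens = (per_token_loss * completion_mask).sum() / (4 * (3 + 6))
```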

Whether to use the max tokens norm. If `True`, the loss is normalized by a consant, the maximum possible number of tokens
Member:
Suggestion to clarify what we mean by "maximum possible"

Suggested change
Whether to use the max tokens norm. If `True`, the loss is normalized by a consant, the maximum possible number of tokens
Whether to use the max tokens norm. If `True`, the loss is normalized by a constant factor that is determined by the total number of prompt and completion tokens in a batch.
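
For what it's worth, a tiny illustration of the constant the suggested wording refers to (the numbers are made up; the formula mirrors the computation added to the trainer below):

```python
# Illustrative numbers only.
per_device_train_batch_size = 8
max_prompt_length = 512
max_completion_length = 256

max_tokens_norm = per_device_train_batch_size * (max_prompt_length + max_completion_length)
# 8 * (512 + 256) = 6144 — the summed per-token loss is divided by this fixed value every step.
```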

epsilon (`float`, *optional*, defaults to `0.2`):
Epsilon value for clipping.
epsilon_high (`float` or `None`, *optional*, defaults to `None`):
@@ -275,6 +277,13 @@ class GRPOConfig(TrainingArguments):
default=1,
metadata={"help": "Number of iterations per batch (denoted as μ in the algorithm)."},
)
use_max_tokens_norm: bool = field(
default=False,
metadata={
"help": "Whether to use the max tokens norm. If `True`, the loss is normalized by a constant, the maximum "
Member:
Ditto here if you agree with the change above

"possible number of tokens."
},
)
epsilon: float = field(
default=0.2,
metadata={"help": "Epsilon value for clipping."},
15 changes: 14 additions & 1 deletion trl/trainer/grpo_trainer.py
@@ -409,6 +409,15 @@ def data_collator(features): # No data collation is needed in GRPO
self.use_vllm = args.use_vllm
self.use_liger_loss = args.use_liger_loss

self.use_max_tokens_norm = args.use_max_tokens_norm
if self.use_max_tokens_norm:
if self.use_liger_loss:
raise ValueError("`use_max_tokens_norm` is not supported with `liger_loss`.")
# calculate a constant factor to normalize the loss
self.max_tokens_norm = args.per_device_train_batch_size * (
args.max_prompt_length + args.max_completion_length
)

# Multi-step
self.num_iterations = args.num_iterations # = 𝜇 in the GRPO paper
self.epsilon_low = args.epsilon
@@ -1072,7 +1081,11 @@ def _compute_loss(self, model, inputs):
per_token_loss = -torch.min(per_token_loss1, per_token_loss2)
if self.beta != 0.0:
per_token_loss = per_token_loss + self.beta * per_token_kl
loss = (per_token_loss * completion_mask).sum() / completion_mask.sum()

if self.use_max_tokens_norm:
loss = (per_token_loss * completion_mask).sum() / self.max_tokens_norm
Member:
I'm not sure how easy it is to unit test this, but would it make sense to add one so that we're sure the loss is being computed as your diagrams show?

E.g. an integration test could check that specifying the config params gives the expected scaling for some dummy inputs.
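
Something along these lines, perhaps (a rough sketch with dummy tensors; the test name and numbers are made up, not an existing test):

```python
import torch

def test_max_tokens_norm_scaling():
    # Dummy per-token losses and a mask with 5 real completion tokens in total.
    per_token_loss = torch.full((2, 4), 0.5)
    completion_mask = torch.tensor([[1, 1, 1, 0], [1, 1, 0, 0]], dtype=torch.float)

    # per_device_train_batch_size * (max_prompt_length + max_completion_length)
    max_tokens_norm = 2 * (3 + 4)

    default_loss = (per_token_loss * completion_mask).sum() / completion_mask.sum()
    max_norm_loss = (per_token_loss * completion_mask).sum() / max_tokens_norm

    # The max-tokens norm should rescale the default loss by mask.sum() / max_tokens_norm.
    expected_ratio = completion_mask.sum() / max_tokens_norm
    torch.testing.assert_close(max_norm_loss, default_loss * expected_ratio)
```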

else:
loss = (per_token_loss * completion_mask).sum() / completion_mask.sum()

# Log the metrics
mode = "eval" if self.control.should_evaluate else "train"