☝️ [GRPO] Generate once per effective batch (#3283)

qgallouedec · lewtun · web-flow · commit 294f35bf3c00 · 2025-04-17T16:35:58.000-07:00
Co-authored-by: lewtun &lt;lewis.c.tunstall@gmail.com&gt;
diff --git a/tests/test_grpo_trainer.py b/tests/test_grpo_trainer.py
@@ -1038,3 +1038,34 @@ def test_training_with_mask_truncated_completions_all_masked(self):
             for n, param in previous_trainable_params.items():
                 new_param = trainer.model.get_parameter(n)
                 self.assertTrue(torch.equal(param, new_param), f"Parameter {n} has changed.")
+
+    def test_training_num_generations_larger_than_batch_size(self):
+        dataset = load_dataset("trl-internal-testing/zen", "standard_prompt_only", split="train")
+
+        with tempfile.TemporaryDirectory() as tmp_dir:
+            training_args = GRPOConfig(
+                output_dir=tmp_dir,
+                learning_rate=0.1,  # increase the learning rate to speed up the test
+                per_device_train_batch_size=3,  # reduce the batch size to reduce memory usage
+                max_completion_length=8,  # reduce the completion length to reduce memory usage
+                num_generations=6,  # the number of generations is larger than the batch size, but
+                gradient_accumulation_steps=2,  # gradient accumulation should allow that
+                report_to="none",
+            )
+            trainer = GRPOTrainer(
+                model="trl-internal-testing/tiny-Qwen2ForCausalLM-2.5",
+                reward_funcs="trl-internal-testing/tiny-Qwen2ForSequenceClassification-2.5",
+                args=training_args,
+                train_dataset=dataset,
+            )
+
+            previous_trainable_params = {n: param.clone() for n, param in trainer.model.named_parameters()}
+
+            trainer.train()
+
+            self.assertIsNotNone(trainer.state.log_history[-1]["train_loss"])
+
+            # Check that the params have changed
+            for n, param in previous_trainable_params.items():
+                new_param = trainer.model.get_parameter(n)
+                self.assertFalse(torch.equal(param, new_param), f"Parameter {n} has not changed.")
diff --git a/trl/trainer/grpo_config.py b/trl/trainer/grpo_config.py
@@ -50,8 +50,8 @@ class GRPOConfig(TrainingArguments):
         max_prompt_length (`int` or `None`, *optional*, defaults to `512`):
             Maximum length of the prompt. If the prompt is longer than this value, it will be truncated left.
         num_generations (`int` or `None`, *optional*, defaults to `8`):
-            Number of generations per prompt to sample. The global batch size (num_processes * per_device_batch_size)
-            must be divisible by this value.
+            Number of generations per prompt to sample. The effective batch size (num_processes *
+            per_device_batch_size * gradient_accumulation_steps) must be evenly divisible by this value.
         max_completion_length (`int` or `None`, *optional*, defaults to `256`):
             Maximum length of the generated completion.
         ds3_gather_for_generation (`bool`, *optional*, defaults to `True`):
@@ -205,8 +205,8 @@ class GRPOConfig(TrainingArguments):
     num_generations: Optional[int] = field(
         default=8,
         metadata={
-            "help": "Number of generations to sample. The global batch size (num_processes * per_device_batch_size) "
-            "must be divisible by this value."
+            "help": "Number of generations to sample. The effective batch size (num_processes * per_device_batch_size "
+            "* gradient_accumulation_steps) must be evenly divisible by this value."
         },
     )
     max_completion_length: Optional[int] = field(
diff --git a/trl/trainer/grpo_trainer.py b/trl/trainer/grpo_trainer.py