
Commit 6a02c69

Authored by Robert Veres (nopepper) and Quentin Gallouédec (qgallouedec)

🎲 Add support for additional generation kwargs in GRPO Trainer (#2989)

* Add support for additional generation kwargs in GRPO Trainer
  - Extend GRPOConfig to support additional generation kwargs
  - Update GRPOTrainer to incorporate additional generation parameters
  - Add tests for training with additional generation kwargs, for both standard and vLLM modes
* Add missing vllm_gpu_memory_utilization=0.5
* 🔧 Refactor GRPO generation parameters and configuration
  - Restructure GRPOConfig to separate generation parameters
  - Add support for top_p, top_k, min_p, repetition_penalty, and length_penalty
  - Remove additional_generation_kwargs in favor of explicit parameters
  - Update GRPOTrainer to use the new generation parameter configuration
* Update tests
* Remove length_penalty and fix tests
* Update defaults and docs
  - Change the temperature type from Optional[float] to float
  - Set the default top_p to 1.0 instead of None
  - Simplify parameter descriptions by removing the redundant "if set to None" text
  - Keep type hints and default values consistent across the generation parameters
* Remove the Optional type hint for the temperature parameter in GRPO
* Remove length_penalty from the sampling_kwargs dict in GRPOTrainer
* Some refactoring
* Support top_k=None
* Change values in the tests to make them pass

Co-authored-by: Robert Veres <[email protected]>
Co-authored-by: Quentin Gallouédec <[email protected]>

1 parent a1c58aa · commit 6a02c69
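Taken together, the commit replaces the earlier catch-all additional_generation_kwargs idea with explicit generation fields on GRPOConfig. A minimal usage sketch based on the tests added in this commit (the tiny models and the zen dataset are TRL-internal testing artifacts; swap in your own model, reward function, and dataset):

```python
from datasets import load_dataset
from trl import GRPOConfig, GRPOTrainer

# Generation behaviour is now configured through explicit GRPOConfig fields
# rather than a generic kwargs dict.
training_args = GRPOConfig(
    output_dir="grpo-demo",          # illustrative output path
    per_device_train_batch_size=3,   # small values, mirroring the tests
    num_generations=3,
    max_completion_length=32,
    report_to="none",
    temperature=0.9,                 # default; higher values give more random completions
    top_p=0.9,                       # nucleus sampling: keep tokens within 90% cumulative probability
    top_k=10,                        # keep only the 10 most likely tokens (None disables top-k)
    min_p=0.01,                      # drop tokens below 1% of the top token's probability
    repetition_penalty=1.1,          # > 1.0 discourages repeating tokens
)

trainer = GRPOTrainer(
    model="trl-internal-testing/tiny-Qwen2ForCausalLM-2.5",
    reward_funcs="trl-internal-testing/tiny-Qwen2ForSequenceClassification-2.5",
    args=training_args,
    train_dataset=load_dataset("trl-internal-testing/zen", "standard_prompt_only", split="train"),
)
trainer.train()
```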

File tree: 3 files changed, +142 −8 lines

tests/test_grpo_trainer.py
Lines changed: 78 additions & 0 deletions

@@ -780,6 +780,7 @@ def test_training_vllm_guided_decoding(self):
                 report_to="none",
                 use_vllm=True,
                 vllm_device="cuda:0",  # will raise a warning, but allows this test to work with only one GPU
+                vllm_gpu_memory_utilization=0.5,  # reduce since we use the same device for training and vllm
                 vllm_guided_decoding_regex=r"<reasoning>\n.*\n</reasoning>\n<answer>\n.*\n</answer>",
             )
             trainer = GRPOTrainer(
@@ -799,3 +800,80 @@ def test_training_vllm_guided_decoding(self):
             for n, param in previous_trainable_params.items():
                 new_param = trainer.model.get_parameter(n)
                 self.assertFalse(torch.equal(param, new_param), f"Parameter {n} has not changed.")
+
+    def test_training_with_additional_generation_kwargs(self):
+        """Test that training works with additional generation kwargs."""
+        dataset = load_dataset("trl-internal-testing/zen", "standard_prompt_only", split="train")
+
+        with tempfile.TemporaryDirectory() as tmp_dir:
+            training_args = GRPOConfig(
+                output_dir=tmp_dir,
+                learning_rate=0.1,  # increase the learning rate to speed up the test
+                per_device_train_batch_size=3,  # reduce the batch size to reduce memory usage
+                num_generations=3,  # reduce the number of generations to reduce memory usage
+                max_completion_length=32,  # reduce the completion length to reduce memory usage
+                report_to="none",
+                top_p=0.9,
+                top_k=10,
+                min_p=0.01,
+                repetition_penalty=1.1,
+            )
+
+            trainer = GRPOTrainer(
+                model="trl-internal-testing/tiny-Qwen2ForCausalLM-2.5",
+                reward_funcs="trl-internal-testing/tiny-Qwen2ForSequenceClassification-2.5",
+                args=training_args,
+                train_dataset=dataset,
+            )
+
+            previous_trainable_params = {n: param.clone() for n, param in trainer.model.named_parameters()}
+
+            trainer.train()
+
+            self.assertIsNotNone(trainer.state.log_history[-1]["train_loss"])
+
+            # Check that the params have changed
+            for n, param in previous_trainable_params.items():
+                new_param = trainer.model.get_parameter(n)
+                self.assertFalse(torch.equal(param, new_param), f"Parameter {n} has not changed.")
+
+    @unittest.skipIf(not is_vllm_available(), "vLLM is not available")
+    @require_torch_accelerator
+    def test_training_vllm_with_additional_generation_kwargs(self):
+        """Test that training works with vLLM and additional generation kwargs."""
+        dataset = load_dataset("trl-internal-testing/zen", "standard_prompt_only", split="train")
+
+        with tempfile.TemporaryDirectory() as tmp_dir:
+            training_args = GRPOConfig(
+                output_dir=tmp_dir,
+                learning_rate=0.1,  # increase the learning rate to speed up the test
+                per_device_train_batch_size=3,  # reduce the batch size to reduce memory usage
+                num_generations=3,  # reduce the number of generations to reduce memory usage
+                max_completion_length=32,  # reduce the completion length to reduce memory usage
+                report_to="none",
+                use_vllm=True,
+                vllm_device="cuda:0",  # will raise a warning, but allows this test to work with only one GPU
+                vllm_gpu_memory_utilization=0.5,  # reduce since we use the same device for training and vllm
+                top_p=0.9,
+                top_k=10,
+                min_p=0.01,
+                repetition_penalty=1.1,
+            )
+
+            trainer = GRPOTrainer(
+                model="Qwen/Qwen2.5-0.5B-Instruct",  # tiny model is too small for vLLM
+                reward_funcs="trl-internal-testing/tiny-Qwen2ForSequenceClassification-2.5",
+                args=training_args,
+                train_dataset=dataset,
+            )
+
+            previous_trainable_params = {n: param.clone() for n, param in trainer.model.named_parameters()}
+
+            trainer.train()
+
+            self.assertIsNotNone(trainer.state.log_history[-1]["train_loss"])
+
+            # Check that the params have changed
+            for n, param in previous_trainable_params.items():
+                new_param = trainer.model.get_parameter(n)
+                self.assertFalse(torch.equal(param, new_param), f"Parameter {n} has not changed.")
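To exercise only these new tests locally, one would typically run something like `pytest tests/test_grpo_trainer.py -k additional_generation_kwargs`; the vLLM variant is skipped automatically when vLLM or a torch accelerator is unavailable.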

trl/trainer/grpo_config.py
Lines changed: 54 additions & 6 deletions

@@ -47,8 +47,6 @@ class GRPOConfig(TrainingArguments):
         num_generations (`int` or `None`, *optional*, defaults to `8`):
             Number of generations per prompt to sample. The global batch size (num_processes * per_device_batch_size)
             must be divisible by this value.
-        temperature (`float`, *optional*, defaults to `0.9`):
-            Temperature for sampling. The higher the temperature, the more random the completions.
         max_completion_length (`int` or `None`, *optional*, defaults to `256`):
             Maximum length of the generated completion.
         ds3_gather_for_generation (`bool`, *optional*, defaults to `True`):
@@ -57,6 +55,24 @@ class GRPOConfig(TrainingArguments):
             capacity of a single GPU, albeit at the cost of slower generation. Disabling this option is not compatible
             with vLLM generation.

+        > Parameters that control generation
+
+        temperature (`float`, defaults to `0.9`):
+            Temperature for sampling. The higher the temperature, the more random the completions.
+        top_p (`float`, *optional*, defaults to `1.0`):
+            Float that controls the cumulative probability of the top tokens to consider. Must be in (0, 1]. Set to
+            `1.0` to consider all tokens.
+        top_k (`int` or `None`, *optional*, defaults to `50`):
+            Number of highest probability vocabulary tokens to keep for top-k-filtering. If `None`, top-k-filtering is
+            disabled.
+        min_p (`float` or `None`, *optional*, defaults to `None`):
+            Minimum token probability, which will be scaled by the probability of the most likely token. It must be a
+            value between `0.0` and `1.0`. Typical values are in the `0.01-0.2` range.
+        repetition_penalty (`float`, *optional*, defaults to `1.0`):
+            Float that penalizes new tokens based on whether they appear in the prompt and the generated text so far.
+            Values > `1.0` encourage the model to use new tokens, while values < `1.0` encourage the model to repeat
+            tokens.
+
         > Parameters that control generation acceleration powered by vLLM

         use_vllm (`bool`, *optional*, defaults to `False`):
@@ -115,6 +131,7 @@ class GRPOConfig(TrainingArguments):
             set `sync_ref_model=True`.

         > Parameters that control the logging
+
         log_completions (`bool`, *optional*, defaults to `False`):
             Whether to log a sample of (prompt, completion) pairs every `logging_steps` steps. If `rich` is
             installed, it prints the sample. If `wandb` logging is enabled, it logs it to `wandb`.
@@ -152,10 +169,6 @@ class GRPOConfig(TrainingArguments):
             "must be divisible by this value."
         },
     )
-    temperature: Optional[float] = field(
-        default=0.9,
-        metadata={"help": "Temperature for sampling. The higher the temperature, the more random the completions."},
-    )
    max_completion_length: Optional[int] = field(
        default=256,
        metadata={"help": "Maximum length of the generated completion."},
@@ -170,6 +183,41 @@ class GRPOConfig(TrainingArguments):
        },
    )

+    # Parameters that control generation
+    temperature: float = field(
+        default=0.9,
+        metadata={"help": "Temperature for sampling. The higher the temperature, the more random the completions."},
+    )
+    top_p: float = field(
+        default=1.0,
+        metadata={
+            "help": "Float that controls the cumulative probability of the top tokens to consider. Must be in (0, 1]. "
+            "Set to 1.0 to consider all tokens."
+        },
+    )
+    top_k: Optional[int] = field(
+        default=50,
+        metadata={
+            "help": "Number of highest probability vocabulary tokens to keep for top-k-filtering. If `None`, "
+            "top-k-filtering is disabled."
+        },
+    )
+    min_p: Optional[float] = field(
+        default=None,
+        metadata={
+            "help": "Minimum token probability, which will be scaled by the probability of the most likely token. It "
+            "must be a value between 0.0 and 1.0. Typical values are in the 0.01-0.2 range."
+        },
+    )
+    repetition_penalty: float = field(
+        default=1.0,
+        metadata={
+            "help": "Float that penalizes new tokens based on whether they appear in the prompt and the generated "
+            "text so far. Values > 1.0 encourage the model to use new tokens, while values < 1.0 encourage the model "
+            "to repeat tokens."
+        },
+    )
+
    # Parameters that control generation acceleration powered by vLLM
    use_vllm: Optional[bool] = field(
        default=False,
trl/trainer/grpo_trainer.py
Lines changed: 10 additions & 2 deletions

@@ -519,10 +519,14 @@ def new_group_context():

            # Sampling parameters
            self.sampling_params = SamplingParams(
-                temperature=args.temperature,
                max_tokens=self.max_completion_length,
                guided_decoding=guided_decoding,
                n=args.num_generations,
+                temperature=args.temperature,
+                top_p=args.top_p,
+                top_k=-1 if args.top_k is None else args.top_k,
+                min_p=0.0 if args.min_p is None else args.min_p,
+                repetition_penalty=args.repetition_penalty,
            )

            self._last_loaded_step = 0  # tag to avoid useless loading during grad accumulation
@@ -535,8 +539,12 @@ def new_group_context():
            self.generation_config = GenerationConfig(
                max_new_tokens=self.max_completion_length,
                do_sample=True,
-                temperature=args.temperature,
                pad_token_id=processing_class.pad_token_id,
+                temperature=args.temperature,
+                top_p=args.top_p,
+                top_k=args.top_k,
+                min_p=args.min_p,
+                repetition_penalty=args.repetition_penalty,
            )

            # Gradient accumulation requires scaled loss. Normally, loss scaling in the parent class depends on whether the
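One subtle point in the diff above: the same GRPOConfig values feed two backends with different conventions. The trainer passes top_k and min_p through to transformers' GenerationConfig unchanged (including None), but substitutes the values that mean "disabled" for vLLM's SamplingParams, which does not accept None for these fields. A hypothetical standalone helper (not in the PR) that captures that translation:

```python
from typing import Optional

def to_vllm_sampling_overrides(top_k: Optional[int], min_p: Optional[float]) -> dict:
    """Mirror the None-handling done in GRPOTrainer for vLLM:
    top_k=-1 means "no top-k filtering" and min_p=0.0 means "no min-p filtering"."""
    return {
        "top_k": -1 if top_k is None else top_k,
        "min_p": 0.0 if min_p is None else min_p,
    }

assert to_vllm_sampling_overrides(None, None) == {"top_k": -1, "min_p": 0.0}
assert to_vllm_sampling_overrides(10, 0.01) == {"top_k": 10, "min_p": 0.01}
```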
