
Commit 04f6597

Aladoro and qgallouedec authored
🌡️ Fix temperature inconsistency in GRPO trainer (#3029)
* fix temperature inconsistency in GRPO trainer
* adding 1e-7 isn't necessary
* comment
---------
Co-authored-by: Quentin Gallouédec <[email protected]>
1 parent e3244d2 commit 04f6597

File tree: 1 file changed (+5, −1 lines)


trl/trainer/grpo_trainer.py

Lines changed: 5 additions & 1 deletion
@@ -379,6 +379,7 @@ def data_collator(features): # No data collation is needed in GRPO
         self.max_prompt_length = args.max_prompt_length
         self.max_completion_length = args.max_completion_length  # = |o_i| in the GRPO paper
         self.num_generations = args.num_generations  # = G in the GRPO paper
+        self.temperature = args.temperature
         self.use_vllm = args.use_vllm

         # Multi-step
@@ -658,7 +659,10 @@ def _get_per_token_logps(self, model, input_ids, attention_mask, logits_to_keep)
         # For transformers<=4.48, logits_to_keep argument isn't supported, so here we drop logits ourselves.
         # See https://github.com/huggingface/trl/issues/2770
         logits = logits[:, -logits_to_keep:]
-        return selective_log_softmax(logits, input_ids)  # compute logprobs for the input tokens
+        # Divide logits by sampling temperature.
+        # See https://huggingface.co/blog/the_n_implementation_details_of_rlhf_with_ppo#policy-training-implementation-details
+        logits = logits / self.temperature
+        return selective_log_softmax(logits, input_ids)  # compute logprobs for the input tokens

     @profiling_decorator
     def _move_model_to_vllm(self):
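For context, a minimal, self-contained sketch of why this change matters (not part of the commit; the local selective_log_softmax helper and the toy tensors below are illustrative stand-ins for TRL's utility and real model outputs): when completions are sampled with temperature T, the per-token log-probabilities used for training should be computed from logits / T, otherwise they describe a different distribution than the one that actually generated the completions.

import torch
import torch.nn.functional as F

def selective_log_softmax(logits, index):
    # Log-softmax over the vocabulary, then gather the log-prob of each target token.
    logps = F.log_softmax(logits, dim=-1)
    return torch.gather(logps, dim=-1, index=index.unsqueeze(-1)).squeeze(-1)

# Hypothetical shapes: batch of 1, 3 completion tokens, vocabulary of 5.
temperature = 0.7
logits = torch.randn(1, 3, 5)
input_ids = torch.randint(0, 5, (1, 3))

# Without scaling, the log-probs correspond to a temperature of 1.0, which does
# not match the distribution the completions were sampled from.
logps_unscaled = selective_log_softmax(logits, input_ids)

# With the fix: divide logits by the sampling temperature before the log-softmax.
logps_scaled = selective_log_softmax(logits / temperature, input_ids)
print(logps_unscaled, logps_scaled)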
