
Commit 85e24bc

kashif and qgallouedec authored
❤️‍🩹 [CI] fix transformers dev CI failure (huggingface#3176)
Co-authored-by: Quentin Gallouédec <[email protected]>
1 parent 5566f1b commit 85e24bc

File tree

3 files changed: +18 -4 lines changed


tests/test_online_dpo_trainer.py

Lines changed: 2 additions & 1 deletion
@@ -244,7 +244,7 @@ def test_training_with_judge(self, config_name):
     @require_torch_accelerator
     @unittest.skipIf(not is_vllm_available(), "vllm is not available")
     def test_training_with_vllm(self, config_name):
-        model_id = "trl-internal-testing/small-Qwen2ForCausalLM-2.5"  # We neeed a bigger model
+        model_id = "trl-internal-testing/small-Qwen2ForCausalLM-2.5"  # We need a bigger model
         model = AutoModelForCausalLM.from_pretrained(model_id)
         tokenizer = AutoTokenizer.from_pretrained(model_id)
         tokenizer.pad_token = tokenizer.eos_token
@@ -253,6 +253,7 @@ def test_training_with_vllm(self, config_name):
            training_args = OnlineDPOConfig(
                output_dir=tmp_dir,
                use_vllm=True,
+               gpu_memory_utilization=0.2,
                report_to="none",
            )
            dummy_dataset = load_dataset("trl-internal-testing/zen", config_name)
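
Note: the key change in this test is capping vLLM's memory reservation at 0.2 so the generation engine can share the CI GPU with the model under test. A minimal sketch of how the new option is wired in, assuming the OnlineDPOConfig API added in this commit (the output directory is illustrative):

from trl import OnlineDPOConfig

# Hedged sketch, not the full test: enable vLLM generation and cap how much
# GPU memory the vLLM engine may claim, leaving room for the policy model.
training_args = OnlineDPOConfig(
    output_dir="out",              # illustrative path
    use_vllm=True,                 # generate completions with vLLM
    gpu_memory_utilization=0.2,    # vLLM reserves ~20% of GPU memory
    report_to="none",
)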

trl/trainer/online_dpo_config.py

Lines changed: 8 additions & 0 deletions
@@ -64,6 +64,8 @@ class OnlineDPOConfig(TrainingArguments):
             Whether to disable dropout in the model and reference model.
         use_vllm (`bool`, *optional*, defaults to `False`):
             Whether to use vLLM for generating completions. Requires vLLM to be installed (`pip install vllm`).
+        gpu_memory_utilization (`float`, *optional*, defaults to `0.55`):
+            The vLLM memory utilization. The default value is 0.55.
         ds3_gather_for_generation (`bool`, *optional*, defaults to `True`):
             This setting applies to DeepSpeed ZeRO-3. If enabled, the policy model weights are gathered for generation,
             improving generation speed. However, disabling this option allows training models that exceed the VRAM
@@ -144,6 +146,12 @@ class OnlineDPOConfig(TrainingArguments):
             "(`pip install vllm`)."
         },
     )
+    gpu_memory_utilization: Optional[float] = field(
+        default=0.55,
+        metadata={
+            "help": "The vLLM memory utilization. The default value is 0.55.",
+        },
+    )
     ds3_gather_for_generation: bool = field(
         default=True,
         metadata={
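
Since gpu_memory_utilization is a regular dataclass field on OnlineDPOConfig, it also surfaces automatically through the standard HfArgumentParser flow used by TRL's example scripts. A hedged sketch of that flow (the script invocation shown in the comment is illustrative):

from transformers import HfArgumentParser
from trl import OnlineDPOConfig

# Sketch: dataclass fields become CLI flags, e.g.
#   python train.py --output_dir out --use_vllm --gpu_memory_utilization 0.3
parser = HfArgumentParser(OnlineDPOConfig)
(training_args,) = parser.parse_args_into_dataclasses()
print(training_args.gpu_memory_utilization)  # 0.55 unless overridden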

trl/trainer/online_dpo_trainer.py

Lines changed: 8 additions & 3 deletions
@@ -269,7 +269,7 @@ def __init__(
             # space for them. Setting gpu_memory_utilization to 0.55 seems to work well in practice.
             self.llm = LLM(
                 model=model.name_or_path,
-                gpu_memory_utilization=0.55,
+                gpu_memory_utilization=args.gpu_memory_utilization,
                 dtype=torch.float32,
                 # When release by vLLM, we would be able to distribute the model on multiple GPUs
                 # See https://github.com/vllm-project/vllm/pull/12071
@@ -695,7 +695,9 @@ def training_step(

     # Same as Trainer._maybe_log_save_evaluate but log our metrics
     # start_time defaults to None to allow compatibility with transformers<=4.46
-    def _maybe_log_save_evaluate(self, tr_loss, grad_norm, model, trial, epoch, ignore_keys_for_eval, start_time=None):
+    def _maybe_log_save_evaluate(
+        self, tr_loss, grad_norm, model, trial, epoch, ignore_keys_for_eval, start_time=None, learning_rate=None
+    ):
         if self.control.should_log and self.state.global_step > self._globalstep_last_logged:
             logs: dict[str, float] = {}

@@ -708,7 +710,10 @@ def _maybe_log_save_evaluate(self, tr_loss, grad_norm, model, trial, epoch, igno
             logs["loss"] = round(tr_loss_scalar / (self.state.global_step - self._globalstep_last_logged), 4)
             if grad_norm is not None:
                 logs["grad_norm"] = grad_norm.detach().item() if isinstance(grad_norm, torch.Tensor) else grad_norm
-            logs["learning_rate"] = self._get_learning_rate()
+            if learning_rate is not None:
+                logs["learning_rate"] = learning_rate
+            else:
+                logs["learning_rate"] = self._get_learning_rate()

             # Add our metrics
             for key, val in self.stats.items():
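
The second and third hunks address the actual CI breakage: on the dev branch of transformers, Trainer._maybe_log_save_evaluate gained a learning_rate argument, so TRL's override must accept it while staying callable from older releases. A minimal, self-contained sketch of the compatibility pattern (class and method names below are illustrative, not TRL's code):

# Sketch: accept a newly added keyword with a None default so both old and
# new upstream call sites work against the same override.
class LoggingMixin:
    def _get_learning_rate(self):
        return 1e-5  # stand-in for the real scheduler lookup

    def maybe_log(self, loss, learning_rate=None):
        # Newer callers pass learning_rate explicitly; older ones omit it,
        # in which case we fall back to computing it ourselves.
        lr = learning_rate if learning_rate is not None else self._get_learning_rate()
        print({"loss": loss, "learning_rate": lr})

LoggingMixin().maybe_log(0.42)        # old-style call site
LoggingMixin().maybe_log(0.42, 2e-5)  # new-style call site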
