
Commit 9510b36

Merge branch 'main' into grpo_config_extend
2 parents 3e44f00 + 5a0cebc commit 9510b36

18 files changed, +395 −113 lines

.pre-commit-config.yaml

Lines changed: 1 addition & 1 deletion
@@ -1,6 +1,6 @@
 repos:
   - repo: https://github.com/astral-sh/ruff-pre-commit
-    rev: v0.9.7
+    rev: v0.11.3
     hooks:
       - id: ruff
         types_or: [ python, pyi ]

docs/source/grpo_trainer.md

Lines changed: 7 additions & 8 deletions
@@ -115,14 +115,13 @@ When \\( \mu = 1 \\) (default in TRL), the clipped surrogate objective simplifi
 ## Logged metrics
 
 - `num_tokens`: The total number of tokens processed so far, including both prompts and completions.
-- `mean_completion_length`: The average length of generated completions.
-- `min_completion_length`: The maximum length of generated completions.
-- `max_completion_length`: The minimun length of generated completions.
-- `mean_terminated_completion_length`: The average length of generated completions that terminate with EOS.
-- `min_terminated_completion_length`: The maximum length of generated completions that terminate with EOS.
-- `max_terminated_completion_length`: The minimun length of generated completions that terminate with EOS.
-- `max_terminated_completion_length`: The minimun length of generated completions that terminate with EOS.
-- `clipped_completions_ratio` : The ratio of trucated (clipped) completions.
+- `completions/mean_length`: The average length of generated completions.
+- `completions/min_length`: The minimum length of generated completions.
+- `completions/max_length`: The maximum length of generated completions.
+- `completions/mean_terminated_length`: The average length of generated completions that terminate with EOS.
+- `completions/min_terminated_length`: The minimum length of generated completions that terminate with EOS.
+- `completions/max_terminated_length`: The maximum length of generated completions that terminate with EOS.
+- `completions/clipped_ratio`: The ratio of truncated (clipped) completions.
 - `reward/{reward_func_name}/mean`: The average reward from a specific reward function.
 - `reward/{reward_func_name}/std`: The standard deviation of the reward from a specific reward function.
 - `reward`: The overall average reward after applying reward weights.
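
Reviewer note: beyond the `completions/*` namespacing, the old list had the min/max descriptions swapped and duplicated the `max_terminated_completion_length` entry; the renamed list fixes both. As a minimal sketch (a helper of our own, not part of this PR, assuming only the standard `trainer.state.log_history` buffer that transformers' Trainer keeps), the new keys can be read back after a run:

def completion_length_stats(trainer):
    # Collect the per-log-step values recorded under the renamed metric keys.
    keys = ("completions/mean_length", "completions/min_length", "completions/max_length")
    return [
        {key: entry[key] for key in keys}
        for entry in trainer.state.log_history
        if all(key in entry for key in keys)
    ]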

pyproject.toml

Lines changed: 1 addition & 1 deletion
@@ -1,5 +1,5 @@
 [tool.ruff]
-target-version = "py37"
+target-version = "py39"
 line-length = 119
 
 [tool.ruff.lint]

setup.py

Lines changed: 1 addition & 1 deletion
@@ -81,7 +81,7 @@
     "deepspeed": ["deepspeed>=0.14.4"],
     "diffusers": ["diffusers>=0.18.0"],
     "judges": ["openai>=1.23.2", "llm-blender>=0.0.2"],
-    "liger": ["liger-kernel>=0.5.5"],
+    "liger": ["liger-kernel>=0.5.6"],
     "mergekit": ["mergekit>=0.0.5.1"],
     "peft": ["peft>=0.8.0"],
     "quantization": ["bitsandbytes"],

tests/slow/test_grpo_slow.py

Lines changed: 81 additions & 0 deletions
@@ -0,0 +1,81 @@
+# Copyright 2025 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import gc
+import tempfile
+import unittest
+
+import torch
+from accelerate.utils.memory import release_memory
+from datasets import load_dataset
+from parameterized import parameterized
+from transformers import AutoModelForCausalLM, AutoTokenizer
+from transformers.testing_utils import require_liger_kernel, require_torch_accelerator
+
+from trl import GRPOConfig, GRPOTrainer
+
+from .testing_constants import MODELS_TO_TEST
+
+
+@require_torch_accelerator
+class GRPOTrainerSlowTester(unittest.TestCase):
+    def setUp(self):
+        self.train_dataset = load_dataset("trl-internal-testing/zen", "standard_prompt_only", split="train")
+        self.eval_dataset = load_dataset("trl-internal-testing/zen", "standard_prompt_only", split="test")
+        self.max_length = 128
+
+    def tearDown(self):
+        gc.collect()
+        torch.cuda.empty_cache()
+        gc.collect()
+
+    @parameterized.expand(MODELS_TO_TEST)
+    @require_liger_kernel
+    def test_training_with_liger_grpo_loss(self, model_name):
+        with tempfile.TemporaryDirectory() as tmp_dir:
+            training_args = GRPOConfig(
+                output_dir=tmp_dir,
+                per_device_train_batch_size=3,
+                num_generations=3,
+                use_liger_loss=True,
+                max_completion_length=self.max_length,
+                report_to="none",
+                logging_strategy="no",
+            )
+
+            model = AutoModelForCausalLM.from_pretrained(model_name)
+            tokenizer = AutoTokenizer.from_pretrained(model_name)
+            tokenizer.pad_token = tokenizer.eos_token if tokenizer.pad_token is None else tokenizer.pad_token
+
+            trainer = GRPOTrainer(
+                model=model,
+                reward_funcs="trl-internal-testing/tiny-Qwen2ForSequenceClassification-2.5",
+                args=training_args,
+                train_dataset=self.train_dataset,
+                eval_dataset=self.eval_dataset,
+                processing_class=tokenizer,
+            )
+            from liger_kernel.chunked_loss import LigerFusedLinearGRPOLoss
+
+            assert isinstance(trainer.liger_grpo_loss, LigerFusedLinearGRPOLoss)
+
+            previous_trainable_params = {n: param.clone() for n, param in model.named_parameters()}
+
+            trainer.train()
+
+            for n, param in previous_trainable_params.items():
+                new_param = model.get_parameter(n)
+                self.assertFalse(torch.equal(param, new_param), f"Parameter {n} has not changed.")
+
+            release_memory(model, trainer)
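
Reviewer note: the new slow test uses the usual snapshot-train-compare pattern to prove that training actually updates weights. A standalone helper of our own (not part of the PR) capturing the same idea:

import torch

def assert_params_updated(model, train_fn):
    # Snapshot every parameter, run training, then fail if any tensor is unchanged.
    before = {name: param.clone() for name, param in model.named_parameters()}
    train_fn()
    for name, old in before.items():
        new = model.get_parameter(name)
        assert not torch.equal(old, new), f"Parameter {name} has not changed."

Used as assert_params_updated(model, trainer.train). The `use_liger_loss=True` path the test exercises also lines up with setup.py above raising the liger-kernel floor to >=0.5.6.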

tests/slow/test_sft_slow.py

Lines changed: 0 additions & 8 deletions
@@ -106,7 +106,6 @@ def test_sft_trainer_transformers(self, model_name, packing):
 
         model = AutoModelForCausalLM.from_pretrained(model_name)
         tokenizer = AutoTokenizer.from_pretrained(model_name)
-        tokenizer.pad_token = tokenizer.eos_token if tokenizer.pad_token is None else tokenizer.pad_token
 
         trainer = SFTTrainer(
             model,
@@ -141,7 +140,6 @@ def test_sft_trainer_peft(self, model_name, packing):
 
         model = AutoModelForCausalLM.from_pretrained(model_name)
         tokenizer = AutoTokenizer.from_pretrained(model_name)
-        tokenizer.pad_token = tokenizer.eos_token if tokenizer.pad_token is None else tokenizer.pad_token
 
         trainer = SFTTrainer(
             model,
@@ -178,7 +176,6 @@ def test_sft_trainer_transformers_mp(self, model_name, packing):
 
         model = AutoModelForCausalLM.from_pretrained(model_name)
         tokenizer = AutoTokenizer.from_pretrained(model_name)
-        tokenizer.pad_token = tokenizer.eos_token if tokenizer.pad_token is None else tokenizer.pad_token
 
         trainer = SFTTrainer(
             model,
@@ -214,7 +211,6 @@ def test_sft_trainer_transformers_mp_gc(self, model_name, packing, gradient_chec
 
         model = AutoModelForCausalLM.from_pretrained(model_name)
         tokenizer = AutoTokenizer.from_pretrained(model_name)
-        tokenizer.pad_token = tokenizer.eos_token if tokenizer.pad_token is None else tokenizer.pad_token
 
         trainer = SFTTrainer(
             model,
@@ -251,7 +247,6 @@ def test_sft_trainer_transformers_mp_gc_peft(self, model_name, packing, gradient
 
         model = AutoModelForCausalLM.from_pretrained(model_name)
         tokenizer = AutoTokenizer.from_pretrained(model_name)
-        tokenizer.pad_token = tokenizer.eos_token if tokenizer.pad_token is None else tokenizer.pad_token
 
         trainer = SFTTrainer(
             model,
@@ -295,7 +290,6 @@ def test_sft_trainer_transformers_mp_gc_device_map(
 
         model = AutoModelForCausalLM.from_pretrained(model_name, device_map=device_map)
         tokenizer = AutoTokenizer.from_pretrained(model_name)
-        tokenizer.pad_token = tokenizer.eos_token if tokenizer.pad_token is None else tokenizer.pad_token
 
         trainer = SFTTrainer(
             model,
@@ -335,7 +329,6 @@ def test_sft_trainer_transformers_mp_gc_peft_qlora(self, model_name, packing, gr
 
         model = AutoModelForCausalLM.from_pretrained(model_name, quantization_config=quantization_config)
         tokenizer = AutoTokenizer.from_pretrained(model_name)
-        tokenizer.pad_token = tokenizer.eos_token if tokenizer.pad_token is None else tokenizer.pad_token
 
         trainer = SFTTrainer(
             model,
@@ -381,7 +374,6 @@ def test_sft_trainer_with_chat_format_qlora(self, model_name, packing):
 
         if tokenizer.chat_template is None:
             model, tokenizer = setup_chat_format(model, tokenizer)
-        tokenizer.pad_token = tokenizer.eos_token if tokenizer.pad_token is None else tokenizer.pad_token
 
         trainer = SFTTrainer(
             model,

tests/test_sft_trainer.py

Lines changed: 29 additions & 1 deletion
@@ -33,6 +33,7 @@
 
 from trl import SFTConfig, SFTTrainer
 from trl.trainer import ConstantLengthDataset, DataCollatorForCompletionOnlyLM
+from trl.trainer.sft_trainer import DataCollatorForLanguageModeling
 
 
 def formatting_prompts_func(example):
@@ -59,14 +60,41 @@ def formatting_prompts_func_batched(example):
 from PIL import Image as PILImage
 
 
+class TestDataCollatorForLanguageModeling(unittest.TestCase):
+    def test_collate_padding(self):
+        collator = DataCollatorForLanguageModeling(pad_token_id=0)
+        examples = [{"input_ids": [1, 2, 3]}, {"input_ids": [4, 5]}]
+        output = collator(examples)
+
+        expected_input_ids = torch.tensor([[1, 2, 3], [4, 5, 0]])
+        expected_attention_mask = torch.tensor([[1, 1, 1], [1, 1, 0]])
+        expected_labels = torch.tensor([[1, 2, 3], [4, 5, -100]])
+
+        self.assertEqual(output["input_ids"].tolist(), expected_input_ids.tolist())
+        self.assertEqual(output["attention_mask"].tolist(), expected_attention_mask.tolist())
+        self.assertEqual(output["labels"].tolist(), expected_labels.tolist())
+
+    def test_collate_no_padding(self):
+        collator = DataCollatorForLanguageModeling(pad_token_id=0)
+        examples = [{"input_ids": [1, 2, 3]}, {"input_ids": [4, 5, 6]}]
+        output = collator(examples)
+
+        expected_input_ids = torch.tensor([[1, 2, 3], [4, 5, 6]])
+        expected_attention_mask = torch.tensor([[1, 1, 1], [1, 1, 1]])
+        expected_labels = torch.tensor([[1, 2, 3], [4, 5, 6]])
+
+        self.assertEqual(output["input_ids"].tolist(), expected_input_ids.tolist())
+        self.assertEqual(output["attention_mask"].tolist(), expected_attention_mask.tolist())
+        self.assertEqual(output["labels"].tolist(), expected_labels.tolist())
+
+
 class SFTTrainerTester(unittest.TestCase):
     r""" """
 
     def setUp(self):
         self.model_id = "trl-internal-testing/tiny-Qwen2ForCausalLM-2.5"
         self.model = AutoModelForCausalLM.from_pretrained(self.model_id)
         self.tokenizer = AutoTokenizer.from_pretrained(self.model_id)
-        self.tokenizer.pad_token = self.tokenizer.eos_token
         self.dummy_dataset = Dataset.from_dict(
             {
                 "question": [

trl/data_utils.py

Lines changed: 2 additions & 1 deletion
@@ -13,7 +13,8 @@
 # limitations under the License.
 
 import functools
-from typing import Any, Callable, Optional, Sequence, TypeVar, Union
+from collections.abc import Sequence
+from typing import Any, Callable, Optional, TypeVar, Union
 
 import numpy as np
 import pyarrow as pa
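
Reviewer note: here, and in trl/extras/profiling.py below, `Sequence` and `Generator` move from `typing` to `collections.abc`; the `typing` aliases have been deprecated since Python 3.9 (PEP 585), which matches the `target-version = "py39"` bump in pyproject.toml above. Illustrative only, the post-3.9 spelling:

from collections.abc import Generator, Sequence

def chunk(items: Sequence[int], size: int) -> Generator[list[int], None, None]:
    # Yield consecutive chunks of at most `size` items.
    for start in range(0, len(items), size):
        yield list(items[start : start + size])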

trl/extras/profiling.py

Lines changed: 1 addition & 1 deletion
@@ -15,7 +15,7 @@
 import contextlib
 import functools
 import time
-from typing import Generator
+from collections.abc import Generator
 
 from transformers import Trainer, is_wandb_available

trl/extras/vllm_client.py

Lines changed: 1 addition & 1 deletion
@@ -140,7 +140,7 @@ def generate(
         min_p: float = 0.0,
         max_tokens: int = 16,
         guided_decoding_regex: Optional[str] = None,
-    ) -> list[list[str]]:
+    ) -> list[list[int]]:
         """
         Generates model completions for the provided prompts.
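
Reviewer note: the corrected annotation records what the method actually returns — one list of token IDs per prompt, not decoded strings — so callers decode themselves. A hypothetical usage sketch; the model name, the default constructor arguments, and a running TRL vLLM server are assumptions, not from this diff:

from transformers import AutoTokenizer
from trl.extras.vllm_client import VLLMClient

tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen2.5-0.5B-Instruct")
client = VLLMClient()  # assumes a vLLM server is already running at the default host/port
completion_ids = client.generate(["The capital of France is"], max_tokens=16)  # list[list[int]]
completions = [tokenizer.decode(ids, skip_special_tokens=True) for ids in completion_ids]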
