Closed
Changes from 16 commits
Commits
51 commits
51ef1b1
Feature: Add SGLang support for GRPO Trainer
jhinpan Feb 18, 2025
bcbed19
Turn to the online server API Usage
jhinpan Feb 19, 2025
b1b92fc
add test and fix bugs in result parsing
Jayon02 Feb 19, 2025
ed115af
Pass First test with fixing _update_sglang_weights
jhinpan Feb 20, 2025
941db60
Remove checkpoints from tracking and add to .gitignore
jhinpan Feb 21, 2025
e622ba9
config to run on single gpu successfully
ryang-max Apr 22, 2025
9fba5f0
Merge branch 'main' into sglang-server
ryang-max Apr 23, 2025
7de7ddb
Update code to align with vllm
ryang-max Apr 23, 2025
029402e
Merge remote-tracking branch 'origin/main' into sglang-server
ryang-max Apr 23, 2025
8debe2a
save model and update weight
ryang-max Apr 23, 2025
26d34c3
save model only main process
ryang-max Apr 24, 2025
69ebec8
A runnable update_from_tensor version
ryang-max Apr 24, 2025
0fcdd83
fix performance issue
ryang-max Apr 27, 2025
35e05f0
Merge branch 'main' into sglang-server
ryang-max Apr 27, 2025
8d75a8f
resolve comment: help strings
renxinx May 1, 2025
ddf67e9
resolve comment: help strings
renxinx May 1, 2025
6745e6b
Update trl/trainer/grpo_config.py
kashif May 2, 2025
6887ed5
Update trl/trainer/grpo_config.py
kashif May 2, 2025
4e020b4
Update trl/trainer/grpo_config.py
kashif May 2, 2025
5787bfc
Update trl/trainer/grpo_config.py
kashif May 2, 2025
4f8021a
Update trl/trainer/grpo_config.py
kashif May 2, 2025
f73e652
Update trl/trainer/grpo_config.py
kashif May 2, 2025
62dc22e
Update trl/trainer/grpo_trainer.py
kashif May 2, 2025
3a95d13
call raise_for_status
kashif May 2, 2025
f733428
remove duplicate
kashif May 2, 2025
e91e7d8
doc string
kashif May 2, 2025
1f2fada
formatting
kashif May 2, 2025
88ad6af
add sglang to extras
kashif May 2, 2025
9a2db24
formatting
kashif May 2, 2025
e139430
import requests only when sglang is available
kashif May 2, 2025
4693aa0
formatting
kashif May 2, 2025
cf2e1ff
undo formatting
kashif May 2, 2025
0a079cc
undo formatting
kashif May 2, 2025
9ddf9a0
more undo
kashif May 2, 2025
45214e9
last one!
kashif May 2, 2025
ccbf97b
add initial docs
kashif May 2, 2025
fe94157
Merge branch 'main' into sglang-server
kashif May 2, 2025
10af891
add sglang
kashif May 2, 2025
829ae41
last one now
kashif May 2, 2025
f48c7e6
new line
kashif May 2, 2025
2bcf24c
Merge branch 'main' into sglang-server
kashif May 4, 2025
a6158fa
delete test scripts
renxinx May 7, 2025
6380ce5
Merge branch 'main' into sglang-server
renxinx May 7, 2025
85d1906
Merge branch 'main' into sglang-server
kashif May 9, 2025
865afb4
Update setup.cfg
kashif May 9, 2025
6e94e53
Update setup.cfg
kashif May 9, 2025
8e3697d
intiial sglang-serve cli script
kashif May 13, 2025
e7149a0
Update trl/trainer/grpo_trainer.py
ryang-max May 21, 2025
94c1c9b
debug GRPO trainer
renxinx May 21, 2025
a665a17
change num_processes
renxinx May 22, 2025
9e634d1
update how to run sglang
renxinx May 24, 2025
5 changes: 5 additions & 0 deletions trl/import_utils.py
@@ -39,6 +39,7 @@
_vllm_available = _is_package_available("vllm")
_vllm_ascend_available = _is_package_available("vllm_ascend")
_joblib_available = _is_package_available("joblib")
_sglang_available = _is_package_available("sglang")


def is_deepspeed_available() -> bool:
@@ -97,6 +98,10 @@ def is_joblib_available() -> bool:
    return _joblib_available


def is_sglang_available() -> bool:
    return _sglang_available


class _LazyModule(ModuleType):
    """
    Module class that surfaces all objects but only performs associated imports when the objects are requested.
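As an aside, the new is_sglang_available() helper is what allows the HTTP client to be imported lazily, in line with the "import requests only when sglang is available" commit. A minimal sketch of that pattern, with the guard function name taken from this diff and everything else illustrative rather than the PR's actual code:

from trl.import_utils import is_sglang_available

# Pull in the HTTP client only when the optional SGLang backend can actually be used.
if is_sglang_available():
    import requests
else:
    requests = None


def post_to_sglang_server(url: str, payload: dict) -> dict:
    # Hypothetical helper: fail with a clear message if the optional extra is missing.
    if requests is None:
        raise ImportError("SGLang support requires the optional dependencies: pip install trl[sglang]")
    response = requests.post(url, json=payload)
    response.raise_for_status()
    return response.json()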
56 changes: 56 additions & 0 deletions trl/scripts/grpo_test/grpo_sgl_test.py
Reviewer comment: In the final version, the files in scripts should be removed.

@@ -0,0 +1,56 @@
# Copyright 2020-2025 The HuggingFace Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import os

from datasets import load_dataset

from trl import GRPOConfig, GRPOTrainer



dataset = load_dataset("trl-lib/tldr", split="train[:1%]")

checkpoint_dir = os.path.join("/sgl-workspace/ryang/trl", "checkpoints/sgl")
os.makedirs(checkpoint_dir, exist_ok=True)


# Define the reward function, which rewards completions that are close to 20 characters
def reward_len(completions, **kwargs):
    return [-abs(20 - len(completion)) for completion in completions]


training_args = GRPOConfig(
    output_dir=os.path.join(checkpoint_dir, "Qwen2.5_output"),
    logging_steps=10,
    # report_to="wandb",
    # use_vllm=True,
    use_sglang=True,
    sglang_device="cuda:1",
    sglang_gpu_memory_utilization=0.9,
    sglang_server_url="http://127.0.0.1:30000",
)


trainer = GRPOTrainer(
    model="Qwen/Qwen2-0.5B-Instruct",
    reward_funcs=reward_len,
    args=training_args,
    train_dataset=dataset,
)

training_args.checkpoint_path = checkpoint_dir # Set the checkpoint path for later use


trainer.train()
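Note that the script above assumes an SGLang server is already listening at the configured sglang_server_url before trainer.train() runs. A minimal launch-and-wait sketch, assuming SGLang's documented sglang.launch_server entry point and its /health endpoint (flag names and endpoints may differ across SGLang versions):

import os
import subprocess
import time

import requests

SERVER_URL = "http://127.0.0.1:30000"  # must match sglang_server_url above

env = os.environ.copy()
env["CUDA_VISIBLE_DEVICES"] = "1"  # keep generation off the training GPU(s)

server = subprocess.Popen(
    ["python", "-m", "sglang.launch_server", "--model-path", "Qwen/Qwen2-0.5B-Instruct", "--port", "30000"],
    env=env,
)

# Poll until the server answers, or give up after roughly five minutes.
for _ in range(300):
    try:
        if requests.get(f"{SERVER_URL}/health", timeout=1).ok:
            break
    except requests.RequestException:
        pass
    time.sleep(1)
else:
    server.terminate()
    raise RuntimeError(f"SGLang server at {SERVER_URL} did not become ready")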
17 changes: 17 additions & 0 deletions trl/scripts/grpo_test/grpo_sgl_test.yaml
@@ -0,0 +1,17 @@
compute_environment: LOCAL_MACHINE
debug: false
distributed_type: MULTI_GPU
downcast_bf16: 'no'
gpu_ids: all
machine_rank: 0
main_training_function: main
mixed_precision: 'bf16'
num_machines: 1
num_processes: 3
rdzv_backend: static
same_network: true
tpu_env: []
tpu_use_cluster: false
tpu_use_sudo: false
use_cpu: false
main_process_port: 29600
4 changes: 4 additions & 0 deletions trl/scripts/grpo_test/run.sh
@@ -0,0 +1,4 @@
#!/bin/bash
export CUDA_VISIBLE_DEVICES=5,6,7
export PYTHONPATH="/sgl-workspace/ryang/trl:$PYTHONPATH"
accelerate launch --config_file=trl/scripts/grpo_test/grpo_sgl_test.yaml trl/scripts/grpo_test/grpo_sgl_test.py
88 changes: 86 additions & 2 deletions trl/trainer/grpo_config.py
@@ -96,12 +96,24 @@ class GRPOConfig(TrainingArguments):
            timeout, a `ConnectionError` is raised.
        vllm_guided_decoding_regex (`str` or `None`, *optional*, defaults to `None`):
            Regex for vLLM guided decoding. If `None` (default), guided decoding is disabled.

        > Parameters that control generation acceleration powered by SGLang

        use_sglang (`bool`, *optional*, defaults to `False`):
            Whether to use SGLang for generating completions. If set to `True`, an SGLang server must be running.
        sglang_server_url (`str`, *optional*, defaults to `"http://localhost:32232"`):
            URL of the SGLang server (e.g. `"http://localhost:32232"`). Required if `use_sglang` is `True`.
        sglang_device (`str`, *optional*, defaults to `"auto"`):
            GPU device used for SGLang generation when the server is launched by the trainer. Ignored if the
            server is managed externally.
        sglang_gpu_memory_utilization (`float`, *optional*, defaults to `0.9`):
            Ratio of GPU memory reserved for SGLang generation.

        > Parameters that control the training

        learning_rate (`float`, *optional*, defaults to `1e-6`):
            Initial learning rate for [`AdamW`] optimizer. The default value replaces that of
            [`~transformers.TrainingArguments`].
        beta (`float`, *optional*, defaults to `0.04`):
            KL coefficient. If `0.0`, the reference model is not loaded, reducing memory usage and improving training
            speed, but may be numerically unstable for long training runs.
@@ -187,13 +199,20 @@ class GRPOConfig(TrainingArguments):
            "it prevents the model from generating different logprobs for the same input."
        },
    )
    disable_dropout: bool = field(
        default=False,
        metadata={
            "help": "Whether to disable dropout in the model. This is useful for training with a reference model, as "
            "it prevents the model from generating different logprobs for the same input."
        },
    )

    # Parameters that control the data preprocessing
    # The default value remove_unused_columns is overwritten from the parent class, because in GRPO we usually rely on
    # additional columns to compute the reward
    remove_unused_columns: Optional[bool] = field(
        default=False,
        metadata={
            "help": "Whether to only keep the column 'prompt' in the dataset. If you use a custom reward function "
            "that requires any column other than 'prompts' and 'completions', you should keep this to `False`."
        },
@@ -211,6 +230,10 @@ class GRPOConfig(TrainingArguments):
            "* gradient_accumulation_steps) must be evenly divisible by this value."
        },
    )
    temperature: Optional[float] = field(
        default=0.9,
        metadata={"help": "Temperature for sampling completions."},
    )
    max_completion_length: Optional[int] = field(
        default=256,
        metadata={"help": "Maximum length of the generated completion."},
@@ -296,6 +319,34 @@ class GRPOConfig(TrainingArguments):
        metadata={"help": "Regex for vLLM guided decoding. If `None` (default), guided decoding is disabled."},
    )

    # Parameters that control generation acceleration powered by SGLang
    # When running the trainer, set the following command-line arguments (or JSON configuration) so that SGLang is used:
    #   --use_sglang True
    #   --sglang_server_url "http://localhost:32232"
    #   Optionally, --sglang_device "cuda:1" to assign a specific GPU.
    use_sglang: Optional[bool] = field(
        default=False,
        metadata={
            "help": "Whether to use SGLang for generating completions. If `True`, an SGLang server must be running."
        },
    )
    sglang_server_url: Optional[str] = field(
        default="http://localhost:32232",
        metadata={
            "help": "URL of the SGLang server (e.g., 'http://localhost:32232'). Required if `use_sglang` is `True`."
        },
    )
    sglang_device: Optional[str] = field(
        default="auto",
        metadata={
            "help": "GPU device used for SGLang generation when the server is launched by the trainer. Ignored if "
            "the server is managed externally."
        },
    )
    sglang_gpu_memory_utilization: float = field(
        default=0.9,
        metadata={"help": "Ratio of GPU memory reserved for SGLang generation."},
    )

    # Parameters that control the training
    learning_rate: float = field(
        default=1e-6,
@@ -366,6 +417,39 @@ class GRPOConfig(TrainingArguments):
            "a good practice for training stability."
        },
    )
    scale_rewards: bool = field(
        default=True,
        metadata={
            "help": "Whether to scale the rewards by dividing them by their standard deviation. If `True` (default), "
            "the rewards are normalized by the standard deviation, ensuring they have unit variance. If `False`, no "
            "scaling is applied. The Dr. GRPO paper recommends not scaling the rewards, as scaling by the standard "
            "deviation introduces a question-level difficulty bias."
        },
    )
    loss_type: str = field(
        default="bnpo",
        metadata={
            "help": "Specifies the loss formulation to use. Supported values are `grpo`, `bnpo`, and `dr_grpo`. "
            "`'grpo'`: Aggregates token-level losses by normalizing over sequence length. Not recommended due to "
            "length bias—this approach tends to prefer shorter completions with positive advantages and longer ones "
            "with negative advantages. "
            "`'bnpo'`: Aggregates token-level losses by normalizing by the number of active tokens in the local "
            "batch. Note that normalization is performed over the local batch only, so results may slightly vary "
            "depending on the local batch size, despite a constant effective batch size. When using "
            "`per_device_train_batch_size==1`, the loss is equivalent to the GRPO loss. "
            "`'dr_grpo'`: Aggregates token-level losses by normalizing with a global constant. This method was "
            "introduced in the Dr. GRPO paper to eliminate length bias. The value of the constant corresponds to "
            "`max_completion_length`."
        },
    )
    mask_truncated_completions: bool = field(
        default=False,
        metadata={
            "help": "When enabled, truncated completions are excluded from the loss calculation, preventing them from "
            "being incorrectly penalized and introducing noise during training. According to the DAPO paper, this is "
            "a good practice for training stability."
        },
    )
    sync_ref_model: bool = field(
        default=False,
        metadata={
@@ -411,4 +495,4 @@ class GRPOConfig(TrainingArguments):
            "help": "Whether to log unique prompts in wandb. If `True`, only unique prompts are logged. If `False`, "
            "all prompts are logged."
        },
    )
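To make the use_sglang path concrete, the sketch below shows how completions can be requested from a running SGLang server over its native HTTP API, in the spirit of the "Turn to the online server API Usage" and "call raise_for_status" commits. It follows SGLang's documented /generate endpoint, but it is an illustration under those assumptions, not the trainer's actual implementation:

import requests

SGLANG_SERVER_URL = "http://localhost:32232"  # GRPOConfig.sglang_server_url


def generate_completions(prompts, max_completion_length=256, temperature=0.9):
    # Batch all prompts into a single request; SGLang returns one result per prompt.
    payload = {
        "text": prompts,
        "sampling_params": {
            "temperature": temperature,
            "max_new_tokens": max_completion_length,
        },
    }
    response = requests.post(f"{SGLANG_SERVER_URL}/generate", json=payload, timeout=120)
    response.raise_for_status()  # surface HTTP errors early
    return [item["text"] for item in response.json()]

# Example: completions = generate_completions(["The capital of France is"])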