[sglang] Fix tool format and response position ids padding in AsyncSGLangRollout (#1475)

SwordFaith · web-flow · commit bc9062d74fc3 · 2025-05-11T08:01:36.000-07:00
### Checklist Before Starting - [x] Search for similar PR(s). ### What does this PR do? > Add one-line overview of what this PR aims to achieve or accomplish. Resolved the tool formatting issue: Previously, arguments were stored as strings, causing iterative addition of `\\` due to multiple calls to `json.dumps`. Fixed the `response_position_ids` mismatch between `generate_sequences` and `generate_sequences_with_tools`: In the earlier implementation, `generate_sequences_with_tools` used zero padding for positions where `attention mask == 0`, which resulted in NaN values during the training phase. ### Specific Changes > List the specific changes. - Introduced a new schema, `OpenAIFunctionCallSchema`, to store converted tool calls. - Updated the `AsyncSGLangRollout` tool to skip non-dict type arguments instead of handling any string at the arguments position. - Aligned `response_position_ids` in `generate_sequences_with_tools` with the behavior of `generate_sequences`. - Enhanced tool descriptions to prevent misleading parse errors, as returning 0.0 caused the model to incorrectly modify answers. ### API > Demonstrate how the API changes if any. - Revise the `execute` interface of the tool to directly accept `dict[str, Any]` instead of a JSON string. ### Usage Example > Provide usage example(s) for easier usage. ```python # Add code snippet or script demonstrating how to use this ``` ### Test > For changes that can not be tested by CI (e.g., algorithm implementation, new model support), validate by experiment(s) and show results like training curve plots, evaluation results, etc. ### Additional Info. - **Issue Number**: Fixes issue # or discussion # if any. 
- **Training**: [Note which backend this PR will affect: FSDP, Megatron, both, or none] - **Inference**: [Note which backend this PR will affect: vLLM, SGLang, both, or none] ### Checklist Before Submitting - [x] Read the [Contribute Guide](https://github.com/volcengine/verl?tab=readme-ov-file#contribution-guide). - [x] Apply [pre-commit checks](https://github.com/volcengine/verl?tab=readme-ov-file#code-linting-and-formatting). - [x] Add `[BREAKING]` to the PR title if it breaks any API. - [ ] Update the documentation about your changes in the [docs](https://github.com/volcengine/verl/tree/main/docs). - [ ] Add CI test(s) if necessary.
diff --git a/examples/sglang_multiturn/config/tool_config/gsm8k_tool_config.yaml b/examples/sglang_multiturn/config/tool_config/gsm8k_tool_config.yaml
@@ -5,7 +5,7 @@ tools:
       type: "function"
       function:
         name: "calc_gsm8k_reward"
-        description: "A tool for calculating the reward of gsm8k. (1.0 if your answer is correct, 0.0 if your answer is incorrect)"
+        description: "A tool for calculating the reward of gsm8k. (1.0 if parsed answer is correct, 0.0 if parsed answer is incorrect or not correctly parsed)"
         parameters:
           type: "object"
           properties:
diff --git a/verl/tools/base_tool.py b/verl/tools/base_tool.py
@@ -12,7 +12,7 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-from typing import Optional, Tuple
+from typing import Any, Optional, Tuple
 from uuid import uuid4
 
 from .schemas import OpenAIFunctionToolSchema
@@ -52,7 +52,7 @@ async def create(self, instance_id: Optional[str] = None, **kwargs) -> str:
         else:
             return instance_id
 
-    async def execute(self, instance_id: str, parameters: str, **kwargs) -> Tuple[str, float, dict]:
+    async def execute(self, instance_id: str, parameters: dict[str, Any], **kwargs) -> Tuple[str, float, dict]:
         """Execute the tool.
 
         Args:
diff --git a/verl/tools/gsm8k_tool.py b/verl/tools/gsm8k_tool.py
@@ -13,10 +13,9 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-import json
 import logging
 import os
-from typing import Optional, Tuple
+from typing import Any, Optional, Tuple
 from uuid import uuid4
 
 from verl.utils.reward_score import gsm8k
@@ -74,26 +73,22 @@ async def create(self, instance_id: Optional[str] = None, ground_truth: Optional
         }
         return instance_id
 
-    async def execute(self, instance_id: str, parameters: str, **kwargs) -> Tuple[str, float, dict]:
-        try:
-            _parameters = json.loads(parameters)
-        except json.JSONDecodeError:
-            _parameters = {}
-        if isinstance(_parameters, dict):
-            answer = _parameters.get("answer", "")
-            if not isinstance(answer, str):
-                answer = str(answer)
-        else:
-            answer = ""
+    async def execute(self, instance_id: str, parameters: dict[str, Any], **kwargs) -> Tuple[str, float, dict]:
+        answer = parameters.get("answer", "")
+        if not isinstance(answer, str):
+            answer = str(answer)
+
         if answer.startswith("#### "):
             self._instance_dict[instance_id]["response"] = answer
         else:
             self._instance_dict[instance_id]["response"] = "#### " + answer
+
         reward = await self.calc_reward(instance_id)
         # penalty for non improved answer submission
         tool_reward = 0.0 if reward > self._instance_dict[instance_id]["reward"] else -0.05
         # update the reward
         self._instance_dict[instance_id]["reward"] = reward
+
         return f"Current parsed {answer=} {reward=}", tool_reward, {}
 
     async def calc_reward(self, instance_id: str, **kwargs) -> float:
diff --git a/verl/tools/schemas.py b/verl/tools/schemas.py
@@ -12,7 +12,8 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-from typing import Literal
+import json
+from typing import Any, Literal
 
 from pydantic import BaseModel
 
@@ -56,9 +57,31 @@ class OpenAIFunctionParsedSchema(BaseModel):
     arguments: str  # JSON string
 
 
+class OpenAIFunctionCallSchema(BaseModel):
+    """The parsed schema of a tool in OpenAI format."""
+
+    name: str
+    arguments: dict[str, Any]
+
+    @staticmethod
+    def from_openai_function_parsed_schema(parsed_schema: OpenAIFunctionParsedSchema) -> tuple["OpenAIFunctionCallSchema", bool]:
+        has_decode_error = False
+        try:
+            arguments = json.loads(parsed_schema.arguments)
+        except json.JSONDecodeError:
+            arguments = {}
+            has_decode_error = True
+        # If the decoded arguments are not a dict (e.g. a JSON list or scalar), treat them as a decode error
+        if not isinstance(arguments, dict):
+            arguments = {}
+            has_decode_error = True
+
+        return OpenAIFunctionCallSchema(name=parsed_schema.name, arguments=arguments), has_decode_error
+
+
 class OpenAIFunctionToolCall(BaseModel):
     """The tool call in OpenAI format."""
 
     id: str
     type: Literal["function"] = "function"
-    function: OpenAIFunctionParsedSchema
+    function: OpenAIFunctionCallSchema
diff --git a/verl/workers/rollout/sglang_rollout/async_sglang_rollout.py b/verl/workers/rollout/sglang_rollout/async_sglang_rollout.py
@@ -41,7 +41,7 @@
 from verl import DataProto
 from verl.third_party.sglang import parallel_state as sglang_ps
 from verl.tools.base_tool import BaseTool
-from verl.tools.schemas import OpenAIFunctionParsedSchema, OpenAIFunctionToolCall
+from verl.tools.schemas import OpenAIFunctionCallSchema, OpenAIFunctionParsedSchema, OpenAIFunctionToolCall
 from verl.utils.debug import GPUMemoryLogger
 from verl.utils.model import compute_position_id_with_mask
 from verl.utils.net_utils import is_ipv6
@@ -93,6 +93,7 @@ def __init__(
         """
         super().__init__()
         self.config = config
+        os.environ.setdefault("SGL_DISABLE_TP_MEMORY_INBALANCE_CHECK", "true")
 
         tool_list = None
         if config.multi_turn.tool_config_path is not None:
@@ -216,6 +217,7 @@ def initialize_tools(tools_config) -> list:
         first_rank_in_node = self._tp_rank % tp_size_per_node == 0
 
         if first_rank_in_node:
+            rank = dist.get_rank()
             os.environ["SGLANG_BLOCK_NONZERO_RANK_CHILDREN"] = "0"
             self._engine = Engine(
                 model_path=actor_module,
@@ -230,6 +232,16 @@ def initialize_tools(tools_config) -> list:
                 load_format=load_format,
                 dist_init_addr=dist_init_addr,
                 trust_remote_code=trust_remote_code,
+                # NOTE(linjunrong): add rank to prevent SGLang generate same port inside PortArgs.init_new
+                # when random.seed is being set during training
+                port=30000 + rank,
+                # NOTE(Chenyang): if you want to debug the SGLang engine output
+                # please set the following parameters
+                # Otherwise, it will make the engine run too slow
+                # log_level="INFO",
+                # log_requests=True,
+                # log_requests_level=2,
+                # max_running_requests=1,
             )
         else:
             self._engine = None
@@ -271,7 +283,7 @@ def update_sampling_params(self, **kwargs):
         for key, value in old_sampling_params_args.items():
             self.sampling_params[key] = value
 
-    @GPUMemoryLogger(role="sglang rollout", logger=logger)
+    @GPUMemoryLogger(role="sglang async rollout", logger=logger)
     @torch.no_grad()
     def generate_sequences(self, prompts: DataProto, **kwargs) -> DataProto:
         # if self.config.free_cache_engine:
@@ -508,13 +520,18 @@ async def _async_rollout_a_request(self, req: AsyncRolloutRequest, do_sample: bo
                         except AttributeError:
                             normed_content = content
                             tool_calls = []
-                        parsed_tool_calls = [
-                            OpenAIFunctionToolCall(
-                                id=str(tool_call.tool_index),
-                                function=OpenAIFunctionParsedSchema(name=tool_call.name, arguments=tool_call.parameters),
+                        parsed_tool_calls = []
+                        for tool_call in tool_calls:
+                            function, has_decode_error = OpenAIFunctionCallSchema.from_openai_function_parsed_schema(OpenAIFunctionParsedSchema(name=tool_call.name, arguments=tool_call.parameters))
+                            # Drop the tool call if its arguments have a decode error
+                            if has_decode_error:
+                                continue
+                            parsed_tool_calls.append(
+                                OpenAIFunctionToolCall(
+                                    id=str(tool_call.tool_index),
+                                    function=function,
+                                )
                             )
-                            for tool_call in tool_calls
-                        ]
                         if len(parsed_tool_calls) > 0:
                             _req.add_assistant_message(
                                 self.tokenizer,
@@ -550,6 +567,7 @@ async def calc_reward_and_release_fn(name: str, tool: BaseTool):
 
         return _req
 
+    @GPUMemoryLogger(role="sglang async rollout", logger=logger)
     @torch.no_grad()
     def generate_sequences_with_tools(self, prompts: DataProto, **kwargs) -> DataProto:
         # Async rollout with tools support
@@ -632,9 +650,10 @@ def generate_sequences_with_tools(self, prompts: DataProto, **kwargs) -> DataPro
         prompt_position_ids = pad_sequence(prompt_position_ids, batch_first=True, padding_value=0, padding_side="left")
         if prompt_position_ids.shape[1] < self.config.prompt_length:
             prompt_position_ids = pad_sequence_to_length(prompt_position_ids, self.config.prompt_length, 0, left_pad=True)
-        response_position_ids = pad_sequence(response_position_ids, batch_first=True, padding_value=0)
-        if response_position_ids.shape[1] < self.config.response_length:
-            response_position_ids = pad_sequence_to_length(response_position_ids, self.config.response_length, 0)
+        response_length = response_ids.size(1)
+        delta_position_id = torch.arange(1, response_length + 1, device=response_ids.device)
+        delta_position_id = delta_position_id.unsqueeze(0).repeat(len(sorted_output_req_list), 1)
+        response_position_ids = prompt_position_ids[:, -1:] + delta_position_id
         prompt_loss_mask = pad_sequence(prompt_loss_mask, batch_first=True, padding_value=0, padding_side="left")
         if prompt_loss_mask.shape[1] < self.config.prompt_length:
             prompt_loss_mask = pad_sequence_to_length(prompt_loss_mask, self.config.prompt_length, 0, left_pad=True)
@@ -660,6 +679,10 @@ def generate_sequences_with_tools(self, prompts: DataProto, **kwargs) -> DataPro
             batch_size=len(sorted_output_req_list),
         )
 
+        # free cache engine
+        if self.config.free_cache_engine and self._engine is not None and self._tp_rank == 0:
+            self._engine.tokenizer_manager.flush_cache()
+
         return DataProto(batch=batch, non_tensor_batch={"messages": np.array(messages), "reward_scores": np.array(reward_scores)})
 
     def _preprocess_prompt_to_async_rollout_requests(self, prompts: DataProto, n: int) -> list[AsyncRolloutRequest]: