Commit da4222b

feat: introduce vLLM AsyncLLM to support multi-turn rollout

1 parent 8719371 commit da4222b

File tree

20 files changed: +871, -50 lines
examples/ppo_trainer/naive_chat_scheduler.py

Lines changed: 72 additions & 0 deletions
@@ -0,0 +1,72 @@
# Copyright 2024 Bytedance Ltd. and/or its affiliates
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import asyncio
from typing import Any, Dict, List

from omegaconf import DictConfig
from openai.types.chat.chat_completion import ChatCompletion

from verl.protocol import DataProto
from verl.workers.rollout.chat_scheduler import ChatCompletionScheduler


class NaiveChatCompletionScheduler(ChatCompletionScheduler):

    def __init__(self, config: DictConfig, model_path: str, server_addresses: List[str], max_cache_size: int = 10000):
        super().__init__(config, model_path, server_addresses, max_cache_size)

    async def generate_sequences(self, prompts: DataProto, **sampling_params) -> DataProto:
        kwargs = dict(
            n=self.config.n,
            max_completion_tokens=self.config.response_length,
            temperature=self.config.temperature,
            top_p=self.config.top_p,
        )

        do_sample = prompts.meta_info.get('do_sample', True)
        is_validate = prompts.meta_info.get('validate', False)
        if not do_sample or is_validate:
            kwargs["n"] = 1
            kwargs["temperature"] = 0

        kwargs.update(sampling_params)
        print(f"[NaiveChatCompletionScheduler] generate_sequences sampling params: {kwargs}")

        async def callback(completions: ChatCompletion, info: Dict[str, Any]):
            info["all_completions"][info["index"]] = completions

            # NOTE: we can call tools and resubmit chat completions here.
            # call_tools(completions, info)
            # await self.submit_chat_completions(callback2, ...)

        tasks, all_completions = [], [None] * len(prompts)
        for i, prompt in enumerate(prompts.non_tensor_batch["raw_prompt"]):
            # raw_prompt: [{"role": "user", "content": "..."}, {"role": "assistant", "content": "..."}, ...]
            tasks.append(
                asyncio.create_task(
                    self.submit_chat_completions(
                        callback=callback,
                        callback_additional_info={
                            "all_completions": all_completions,
                            "index": i
                        },
                        model=self.model_name,
                        messages=prompt,
                        **kwargs,
                    )))
        await asyncio.gather(*tasks)

        print("[NaiveChatCompletionScheduler] generate_sequences done")
        # TODO: completions => DataProto
        return all_completions
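
The NOTE inside the callback marks the extension point for tool use. Below is a minimal sketch of what a tool-calling variant of that callback could look like, written as if nested inside generate_sequences so that self and kwargs are in scope; call_tools, the info["messages"] key, and the tool-result message format are hypothetical, and only submit_chat_completions comes from the scheduler API above.

        async def tool_callback(completions: ChatCompletion, info: Dict[str, Any]):
            message = completions.choices[0].message
            if getattr(message, "tool_calls", None):
                # Append the assistant turn plus (hypothetical) tool results,
                # then resubmit the extended conversation for another round.
                info["messages"].append({"role": message.role, "content": message.content})
                info["messages"].extend(call_tools(message.tool_calls))  # hypothetical helper
                await self.submit_chat_completions(
                    callback=tool_callback,
                    callback_additional_info=info,
                    model=self.model_name,
                    messages=info["messages"],
                    **kwargs,
                )
            else:
                info["all_completions"][info["index"]] = completions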

examples/ppo_trainer/run_qwen2-7b_seq_balance.sh

Lines changed: 10 additions & 0 deletions
@@ -8,10 +8,18 @@ math_test_path=$HOME/data/math/test.parquet
 train_files="['$gsm8k_train_path', '$math_train_path']"
 test_files="['$gsm8k_test_path', '$math_test_path']"

+# For async rollout mode, dataset should return raw chat.
+rollout_mode="sync"
+if [ "$rollout_mode" = "async" ]; then
+    return_raw_chat="True"
+    chat_scheduler=examples.ppo_trainer.naive_chat_scheduler.NaiveChatCompletionScheduler
+fi
+
 python3 -m verl.trainer.main_ppo \
     algorithm.adv_estimator=gae \
     data.train_files="$train_files" \
     data.val_files="$test_files" \
+    data.return_raw_chat=$return_raw_chat \
     data.train_batch_size=4096 \
     data.max_prompt_length=4096 \
     data.max_response_length=4096 \
@@ -29,6 +37,8 @@ python3 -m verl.trainer.main_ppo \
     actor_rollout_ref.actor.use_kl_loss=False \
     actor_rollout_ref.rollout.tensor_model_parallel_size=2 \
     actor_rollout_ref.rollout.name=vllm \
+    actor_rollout_ref.rollout.mode=$rollout_mode \
+    actor_rollout_ref.rollout.chat_scheduler=$chat_scheduler \
     actor_rollout_ref.rollout.gpu_memory_utilization=0.5 \
     actor_rollout_ref.rollout.log_prob_max_token_len_per_gpu=24000 \
     critic.optim.lr=1e-5 \
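
The chat_scheduler value is a dotted "module.ClassName" path. A plausible sketch of how such a path resolves to a class on the trainer side (the actual loader lives elsewhere in verl; the function name here is illustrative):

import importlib

def resolve_chat_scheduler(dotted_path: str):
    # "examples.ppo_trainer.naive_chat_scheduler.NaiveChatCompletionScheduler"
    # -> import the module, then fetch the class attribute.
    module_name, class_name = dotted_path.rsplit(".", 1)
    return getattr(importlib.import_module(module_name), class_name)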

recipe/dapo/src/dapo_ray_trainer.py

Lines changed: 1 addition & 1 deletion
@@ -40,7 +40,7 @@ class RayDAPOTrainer(RayPPOTrainer):
     Note that this trainer runs on the driver process on a single CPU/GPU node.
     """

-    def fit(self):
+    async def fit(self):
         """
         The training loop of PPO.
         The driver process only needs to call the compute functions of the worker group through RPC to construct the PPO dataflow.

recipe/dapo/src/main_dapo.py

Lines changed: 3 additions & 2 deletions
@@ -75,7 +75,8 @@ def run_ppo(config) -> None:

 @ray.remote(num_cpus=1)  # please make sure main_task is not scheduled on head
 class TaskRunner:
-    def run(self, config):
+
+    async def run(self, config):
         # print initial config
         from pprint import pprint

@@ -200,7 +201,7 @@ def run(self, config):
             val_reward_fn=val_reward_fn,
         )
         trainer.init_workers()
-        trainer.fit()
+        await trainer.fit()


 if __name__ == "__main__":
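
Making TaskRunner.run a coroutine works because Ray executes async actor methods on the actor's asyncio event loop; the remote call still returns an ObjectRef. A hedged sketch of the driver-side invocation (the exact call site is outside this hunk, and config is obtained via hydra as elsewhere in main_dapo):

import ray

runner = TaskRunner.remote()
# ray.get blocks until the coroutine scheduled on the actor's event loop completes.
ray.get(runner.run.remote(config))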

recipe/prime/main_prime.py

Lines changed: 7 additions & 1 deletion
@@ -29,6 +29,8 @@
 Note that we don't combine the main with ray_trainer as ray_trainer is used by other mains.
 """

+import asyncio
+
 import hydra
 import ray

@@ -52,6 +54,10 @@ def run_prime(config, compute_score=None):

 @ray.remote(num_cpus=1)  # please make sure main_task is not scheduled on head
 def main_task(config, compute_score=None):
+    asyncio.run(_main_task(config, compute_score))
+
+
+async def _main_task(config, compute_score=None):
     # print initial config
     from pprint import pprint

@@ -141,7 +147,7 @@ def main_task(config, compute_score=None):
         val_reward_fn=val_reward_fn,
     )
     trainer.init_workers()
-    trainer.fit()
+    await trainer.fit()


 if __name__ == "__main__":

recipe/prime/prime_ray_trainer.py

Lines changed: 1 addition & 1 deletion
@@ -331,7 +331,7 @@ def _load_checkpoint(self):
         if isinstance(self.train_dataloader.dataset, RLHFDataset):
             self.train_dataloader.dataset.resume_dataset_state()

-    def fit(self):
+    async def fit(self):
         """
         The training loop of PPO.
         The driver process only needs to call the compute functions of the worker group through RPC to construct the PPO dataflow.

tests/rollout/test_vllm_multi_turn.py

Lines changed: 149 additions & 0 deletions
@@ -0,0 +1,149 @@
# Copyright 2024 Bytedance Ltd. and/or its affiliates
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import asyncio
from typing import Any, Dict

import ray
from omegaconf import OmegaConf
from openai.types.chat.chat_completion import ChatCompletion

from verl.single_controller.ray import RayClassWithInitArgs, RayWorkerGroup
from verl.single_controller.ray.base import Worker, create_colocated_worker_cls
from verl.trainer.ppo.ray_trainer import ResourcePoolManager, Role
from verl.workers.fsdp_async_workers import AsyncActorRolloutRefWorker, AsyncLLMManager
from verl.workers.rollout.chat_scheduler import ChatCompletionScheduler


async def test_vllm_multi_turn():
    config = OmegaConf.load("verl/trainer/config/ppo_trainer.yaml")
    model_path = "Qwen/Qwen2-7B-Instruct"
    model_name = "/".join(model_path.split("/")[-2:])
    config.actor_rollout_ref.model.path = model_path
    config.actor_rollout_ref.rollout.mode = "async"
    config.actor_rollout_ref.rollout.prompt_length = 4096
    config.actor_rollout_ref.rollout.response_length = 4096

    # =========================== 1. Create hybrid ActorRollout workers ===========================
    ray.init(
        runtime_env={
            'env_vars': {
                'TOKENIZERS_PARALLELISM': 'true',
                'NCCL_DEBUG': 'WARN',
                'VLLM_LOGGING_LEVEL': 'WARN',
                'VLLM_USE_V1': '1',
            }
        })
    role_worker_mapping = {
        Role.ActorRollout: ray.remote(AsyncActorRolloutRefWorker),
    }
    global_pool_id = 'global_pool'
    resource_pool_spec = {
        global_pool_id: [config.trainer.n_gpus_per_node] * config.trainer.nnodes,
    }
    mapping = {
        Role.ActorRollout: global_pool_id,
    }
    resource_pool_manager = ResourcePoolManager(resource_pool_spec=resource_pool_spec, mapping=mapping)
    resource_pool_manager.create_resource_pool()
    resource_pool_to_cls = {pool: {} for pool in resource_pool_manager.resource_pool_dict.values()}

    # create actor and rollout
    resource_pool = resource_pool_manager.get_resource_pool(Role.ActorRollout)
    actor_rollout_cls = RayClassWithInitArgs(cls=role_worker_mapping[Role.ActorRollout],
                                             config=config.actor_rollout_ref,
                                             role='actor_rollout')
    resource_pool_to_cls[resource_pool]['actor_rollout'] = actor_rollout_cls

    all_wg = {}
    wg_dicts = []
    for resource_pool, class_dict in resource_pool_to_cls.items():
        worker_dict_cls = create_colocated_worker_cls(class_dict=class_dict, worker_cls=Worker)
        wg_dict = RayWorkerGroup(resource_pool=resource_pool, ray_cls_with_init=worker_dict_cls)
        spawn_wg = wg_dict.spawn(prefix_set=class_dict.keys())
        all_wg.update(spawn_wg)
        wg_dicts.append(wg_dict)
    actor_rollout_wg = all_wg['actor_rollout']
    actor_rollout_wg.init_model()

    # =========================== 2. Create AsyncLLMManager & ChatScheduler ===========================
    async_rollout_manager = AsyncLLMManager(
        config=config.actor_rollout_ref,
        worker_group=actor_rollout_wg,
    )

    async_chat_scheduler = ChatCompletionScheduler(
        config=config.actor_rollout_ref.rollout,
        model_path=config.actor_rollout_ref.model.path,
        server_addresses=async_rollout_manager.server_addresses,
    )

    # =========================== 3. Multi-turn rollout ===========================
    async def callback(completions: ChatCompletion, info: Dict[str, Any]):
        messages, round = info["messages"], info["round"]
        message = completions.choices[0].message
        messages.append({"role": message.role, "content": message.content})
        print(f"[round={round}] role: {message.role}, content: {message.content}")

        extra_headers = {"x-request-id": completions.id}
        if round == 0:
            messages.append({"role": "user", "content": "What is your name?"})
            await async_chat_scheduler.submit_chat_completions(
                callback=callback,
                callback_additional_info={
                    "messages": messages,
                    "round": 1
                },
                model=model_name,
                messages=messages,
                extra_headers=extra_headers,
            )
        elif round == 1:
            messages.append({"role": "user", "content": "What is your favorite color?"})
            await async_chat_scheduler.submit_chat_completions(
                callback=callback,
                callback_additional_info={
                    "messages": messages,
                    "round": 2
                },
                model=model_name,
                messages=messages,
                extra_headers=extra_headers,
            )
        else:
            print("Done!")

    messages = [{
        "role": "user",
        "content": "Let's play a role playing game. Your name is Bob, your favorite color is red."
    }]
    await async_chat_scheduler.submit_chat_completions(
        callback=callback,
        callback_additional_info={
            "messages": messages,
            "round": 0
        },
        model=model_name,
        messages=messages,
    )
    assert len(messages) == 6
    for round, message in enumerate(messages):
        if round % 2 == 0:
            assert message["role"] == "user"
        else:
            assert message["role"] == "assistant"


if __name__ == "__main__":
    asyncio.run(test_vllm_multi_turn())
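
One detail worth calling out: the callback forwards extra_headers={"x-request-id": completions.id} on every follow-up turn. A plausible reading, not stated anywhere in this diff, is that the scheduler uses this id to route all turns of one conversation to the same vLLM server so its prefix cache can be reused. A hedged sketch of such a routing policy (pick_server and the hash-based scheme are assumptions, not verl code; only server_addresses appears above):

def pick_server(server_addresses: list, request_id: str) -> str:
    # Deterministically map a conversation's request id to one server, so
    # successive turns hit the same instance and share KV-cache prefixes.
    return server_addresses[hash(request_id) % len(server_addresses)]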

verl/single_controller/base/decorator.py

Lines changed: 11 additions & 0 deletions
@@ -37,6 +37,9 @@ class Dispatch(Enum):
     DP_COMPUTE_PROTO_WITH_FUNC = 10
     DP_COMPUTE_METRIC = 11

+    # This is a special dispatch mode for vllm ExternalRayDistributedExecutor
+    DIRECT_ROLLOUT_METHOD = 12
+

 class Execute(Enum):
     ALL = 0
@@ -65,6 +68,10 @@ def dispatch_one_to_all(worker_group, *args, **kwargs):
     return args, kwargs


+def dummy_direct_rollout_call(worker_group, *args, **kwargs):
+    raise NotImplementedError("Direct rollout call is forbidden.")
+
+
 def dispatch_all_to_all(worker_group, *args, **kwargs):
     return args, kwargs

@@ -356,6 +363,10 @@ def get_predefined_dispatch_fn(dispatch_mode):
             "collect_fn": collect_dp_compute_data_proto,
         },
         Dispatch.DP_COMPUTE_METRIC: {"dispatch_fn": dispatch_dp_compute_data_proto, "collect_fn": collect_dp_compute},
+        Dispatch.DIRECT_ROLLOUT_METHOD: {
+            "dispatch_fn": dummy_direct_rollout_call,
+            "collect_fn": dummy_direct_rollout_call,
+        },
     }
     return predefined_dispatch_mode_fn[dispatch_mode]
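
For context, a hedged sketch of how a worker method would opt into the new mode via verl's register decorator; the worker class and method body here are illustrative, and only Dispatch.DIRECT_ROLLOUT_METHOD and register come from this module:

from verl.single_controller.base.decorator import Dispatch, register
from verl.single_controller.base.worker import Worker

class MyAsyncRolloutWorker(Worker):  # hypothetical worker, not from this diff
    # A method tagged DIRECT_ROLLOUT_METHOD is meant to be invoked directly on
    # a single worker (e.g. by vLLM's ExternalRayDistributedExecutor); routing
    # it through the worker group's dispatch machinery raises NotImplementedError.
    @register(dispatch_mode=Dispatch.DIRECT_ROLLOUT_METHOD)
    def execute_method(self, method_name: str, *args, **kwargs):
        return getattr(self, method_name)(*args, **kwargs)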

verl/single_controller/base/register_center/ray.py

Lines changed: 10 additions & 0 deletions
@@ -12,17 +12,27 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.

+from typing import Dict, Tuple
+
 import ray


 @ray.remote
 class WorkerGroupRegisterCenter:
     def __init__(self, rank_zero_info):
         self.rank_zero_info = rank_zero_info
+        # rank -> node_id
+        self.workers_info: Dict[int, str] = {}

     def get_rank_zero_info(self):
         return self.rank_zero_info

+    def set_worker_info(self, rank, node_id) -> None:
+        self.workers_info[rank] = node_id
+
+    def get_worker_info(self) -> Dict[int, str]:
+        return self.workers_info
+

 def create_worker_group_register_center(name, info):
     return WorkerGroupRegisterCenter.options(name=name).remote(info)
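
The rank -> node_id map feeds the "node affinity scheduling" mentioned in worker.py below: a consumer can place a Ray actor on the same node as a given worker. A hedged sketch of such a consumer (the actor names and placement target are illustrative; NodeAffinitySchedulingStrategy is real Ray API):

import ray
from ray.util.scheduling_strategies import NodeAffinitySchedulingStrategy

register_center = ray.get_actor("my_worker_group_register_center")  # name is illustrative
workers_info = ray.get(register_center.get_worker_info.remote())
# Pin a new actor to the node hosting worker rank 0.
strategy = NodeAffinitySchedulingStrategy(node_id=workers_info[0], soft=False)
server = MyServerActor.options(scheduling_strategy=strategy).remote()  # hypothetical actor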

verl/single_controller/base/worker.py

Lines changed: 7 additions & 0 deletions
@@ -19,6 +19,8 @@
 import socket
 from dataclasses import dataclass

+import ray
+
 from .decorator import Dispatch, Execute, register


@@ -125,6 +127,11 @@ def _configure_before_init(self, register_center_name: str, rank: int):
             )

             os.environ.update(rank_zero_info)
+        else:
+            self.register_center = ray.get_actor(register_center_name)
+
+        # set worker info for node affinity scheduling
+        ray.get(self.register_center.set_worker_info.remote(rank, ray.get_runtime_context().get_node_id()))

     def __init__(self, cuda_visible_devices=None) -> None:
         # construct a meta from environment variable. Note that the import must be inside the class because it is executed remotely
