
Commit 736ca1f

add multi turn rollout test
1 parent 084553e commit 736ca1f

File tree

7 files changed: +174 / -15 lines


recipe/dapo/src/dapo_ray_trainer.py

Lines changed: 1 addition & 1 deletion
@@ -35,7 +35,7 @@ class RayDAPOTrainer(RayPPOTrainer):
     Note that this trainer runs on the driver process on a single CPU/GPU node.
     """

-    def fit(self):
+    async def fit(self):
         """
         The training loop of PPO.
         The driver process only need to call the compute functions of the worker group through RPC to construct the PPO dataflow.

recipe/dapo/src/main_dapo.py

Lines changed: 2 additions & 2 deletions
@@ -75,7 +75,7 @@ def run_ppo(config) -> None:
 @ray.remote(num_cpus=1)  # please make sure main_task is not scheduled on head
 class TaskRunner:

-    def run(self, config):
+    async def run(self, config):
         from verl.utils.fs import copy_to_local
         # print initial config
         from pprint import pprint
@@ -186,7 +186,7 @@ def run(self, config):
                              reward_fn=reward_fn,
                              val_reward_fn=val_reward_fn)
         trainer.init_workers()
-        trainer.fit()
+        await trainer.fit()


 if __name__ == '__main__':
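
TaskRunner.run becoming a coroutine works because Ray actors support async def methods natively: Ray drives them on an event loop inside the actor process, and the driver keeps calling .remote() and ray.get() unchanged. A minimal sketch of that pattern, with toy names in place of the verl classes:

    import asyncio

    import ray


    @ray.remote(num_cpus=1)
    class ToyRunner:
        # Ray detects the coroutine method and runs it on the actor's event loop.
        async def run(self, steps: int) -> int:
            done = 0
            for _ in range(steps):
                await asyncio.sleep(0)  # stand-in for `await trainer.fit()` yielding control
                done += 1
            return done


    if __name__ == "__main__":
        ray.init()
        runner = ToyRunner.remote()
        assert ray.get(runner.run.remote(3)) == 3  # caller side is unchanged
        ray.shutdown()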

recipe/prime/main_prime.py

Lines changed: 6 additions & 1 deletion
@@ -30,6 +30,7 @@
 """
 from .prime_ray_trainer import RayPRIMETrainer

+import asyncio
 import ray
 import hydra

@@ -54,6 +55,10 @@ def run_prime(config, compute_score=None):

 @ray.remote(num_cpus=1)  # please make sure main_task is not scheduled on head
 def main_task(config, compute_score=None):
+    asyncio.run(_main_task(config, compute_score))
+
+
+async def _main_task(config, compute_score=None):
     from verl.utils.fs import copy_local_path_from_hdfs
     # print initial config
     from pprint import pprint
@@ -132,7 +137,7 @@ def main_task(config, compute_score=None):
                          reward_fn=reward_fn,
                          val_reward_fn=val_reward_fn)
     trainer.init_workers()
-    trainer.fit()
+    await trainer.fit()


 if __name__ == '__main__':
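
PRIME's entry point is a Ray task rather than an actor, so the diff keeps main_task synchronous and hands off to an async helper through asyncio.run, which starts an event loop and runs the coroutine to completion. A small sketch of the same wrapper shape, with illustrative names only:

    import asyncio

    import ray


    async def _toy_main(x: int) -> int:
        await asyncio.sleep(0)  # stand-in for `await trainer.fit()`
        return x * 2


    @ray.remote(num_cpus=1)
    def toy_main(x: int) -> int:
        # synchronous wrapper: run the async body to completion inside the task
        return asyncio.run(_toy_main(x))


    if __name__ == "__main__":
        ray.init()
        assert ray.get(toy_main.remote(21)) == 42
        ray.shutdown()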

recipe/prime/prime_ray_trainer.py

Lines changed: 1 addition & 1 deletion
@@ -308,7 +308,7 @@ def _load_checkpoint(self):
         if isinstance(self.train_dataloader.dataset, RLHFDataset):
             self.train_dataloader.dataset.resume_dataset_state()

-    def fit(self):
+    async def fit(self):
         """
         The training loop of PPO.
         The driver process only need to call the compute functions of the worker group through RPC to construct the PPO dataflow.

tests/rollout/test_vllm_multi_turn.py

Lines changed: 149 additions & 0 deletions
@@ -0,0 +1,149 @@
+# Copyright 2024 Bytedance Ltd. and/or its affiliates
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import asyncio
+from typing import Any, Dict
+
+import ray
+from omegaconf import OmegaConf
+from openai.types.chat.chat_completion import ChatCompletion
+
+from verl.workers.rollout.chat_scheduler import ChatCompletionScheduler
+from verl.trainer.ppo.ray_trainer import ResourcePoolManager, Role
+from verl.workers.fsdp_async_workers import AsyncActorRolloutRefWorker, AsyncLLMManager
+from verl.single_controller.ray import RayWorkerGroup, RayClassWithInitArgs
+from verl.single_controller.ray.base import Worker, create_colocated_worker_cls
+
+
+async def test_vllm_multi_turn():
+    config = OmegaConf.load("verl/trainer/config/ppo_trainer.yaml")
+    model_path = "Qwen/Qwen2-7B-Instruct"
+    model_name = "/".join(model_path.split("/")[-2:])
+    config.actor_rollout_ref.model.path = model_path
+    config.actor_rollout_ref.rollout.mode = "async"
+    config.actor_rollout_ref.rollout.prompt_length = 4096
+    config.actor_rollout_ref.rollout.response_length = 4096
+
+    # =========================== 1. Create hybrid ActorRollout workers ===========================
+    ray.init(
+        runtime_env={
+            'env_vars': {
+                'TOKENIZERS_PARALLELISM': 'true',
+                'NCCL_DEBUG': 'WARN',
+                'VLLM_LOGGING_LEVEL': 'WARN',
+                'VLLM_USE_V1': '1',
+            }
+        })
+    role_worker_mapping = {
+        Role.ActorRollout: ray.remote(AsyncActorRolloutRefWorker),
+    }
+    global_pool_id = 'global_pool'
+    resource_pool_spec = {
+        global_pool_id: [config.trainer.n_gpus_per_node] * config.trainer.nnodes,
+    }
+    mapping = {
+        Role.ActorRollout: global_pool_id,
+    }
+    resource_pool_manager = ResourcePoolManager(resource_pool_spec=resource_pool_spec, mapping=mapping)
+    resource_pool_manager.create_resource_pool()
+    resource_pool_to_cls = {pool: {} for pool in resource_pool_manager.resource_pool_dict.values()}
+
+    # create actor and rollout
+    resource_pool = resource_pool_manager.get_resource_pool(Role.ActorRollout)
+    actor_rollout_cls = RayClassWithInitArgs(cls=role_worker_mapping[Role.ActorRollout],
+                                             config=config.actor_rollout_ref,
+                                             role='actor_rollout')
+    resource_pool_to_cls[resource_pool]['actor_rollout'] = actor_rollout_cls
+
+    all_wg = {}
+    wg_dicts = []
+    for resource_pool, class_dict in resource_pool_to_cls.items():
+        worker_dict_cls = create_colocated_worker_cls(class_dict=class_dict, worker_cls=Worker)
+        wg_dict = RayWorkerGroup(resource_pool=resource_pool, ray_cls_with_init=worker_dict_cls)
+        spawn_wg = wg_dict.spawn(prefix_set=class_dict.keys())
+        all_wg.update(spawn_wg)
+        wg_dicts.append(wg_dict)
+    actor_rollout_wg = all_wg['actor_rollout']
+    actor_rollout_wg.init_model()
+
+    # =========================== 2. Create AsyncLLMManager&ChatScheduler ===========================
+    async_rollout_manager = AsyncLLMManager(
+        config=config.actor_rollout_ref,
+        worker_group=actor_rollout_wg,
+    )
+
+    async_chat_scheduler = ChatCompletionScheduler(
+        config=config.actor_rollout_ref.rollout,
+        model_path=config.actor_rollout_ref.model.path,
+        server_addresses=async_rollout_manager.server_addresses,
+    )
+
+    # =========================== 3. Multi turn rollout ===========================
+    async def callback(completions: ChatCompletion, info: Dict[str, Any]):
+        messages, round = info["messages"], info["round"]
+        message = completions.choices[0].message
+        messages.append({"role": message.role, "content": message.content})
+        print(f"[round={round}] role: {message.role}, content: {message.content}")
+
+        extra_headers = {"x-request-id": completions.id}
+        if round == 0:
+            messages.append({"role": "user", "content": "What is your name?"})
+            await async_chat_scheduler.submit_chat_completions(
+                callback=callback,
+                callback_additional_info={
+                    "messages": messages,
+                    "round": 1
+                },
+                model=model_name,
+                messages=messages,
+                extra_headers=extra_headers,
+            )
+        elif round == 1:
+            messages.append({"role": "user", "content": "What is your favorite color?"})
+            await async_chat_scheduler.submit_chat_completions(
+                callback=callback,
+                callback_additional_info={
+                    "messages": messages,
+                    "round": 2
+                },
+                model=model_name,
+                messages=messages,
+                extra_headers=extra_headers,
+            )
+        else:
+            print("Done!")
+
+    messages = [{
+        "role": "user",
+        "content": "Let's play a role playing game. Your name is Bob, your favorite color is red."
+    }]
+    await async_chat_scheduler.submit_chat_completions(
+        callback=callback,
+        callback_additional_info={
+            "messages": messages,
+            "round": 0
+        },
+        model=model_name,
+        messages=messages,
+    )
+    assert len(messages) == 6
+    for round, message in enumerate(messages):
+        if round % 2 == 0:
+            assert message["role"] == "user"
+        else:
+            assert message["role"] == "assistant"
+
+
+if __name__ == "__main__":
+    asyncio.run(test_vllm_multi_turn())
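
The test drives the whole dialogue through callbacks: each completed turn appends the assistant message, and the callback itself submits the next request until the final round. A toy sketch of that control flow (not the verl API; an echo reply stands in for the model):

    import asyncio
    from typing import Any, Callable, Dict, List


    class ToyScheduler:
        async def submit(self, callback: Callable, info: Dict[str, Any], messages: List[dict]):
            # pretend the server answered, then hand control back to the caller's callback
            reply = {"role": "assistant", "content": f"echo: {messages[-1]['content']}"}
            await callback(reply, info)


    async def main():
        scheduler = ToyScheduler()
        messages = [{"role": "user", "content": "hello"}]

        async def callback(reply: dict, info: Dict[str, Any]):
            messages.append(reply)
            if info["round"] == 0:
                # chain the next turn from inside the callback, as the test above does
                messages.append({"role": "user", "content": "bye"})
                await scheduler.submit(callback, {"round": 1}, messages)

        await scheduler.submit(callback, {"round": 0}, messages)
        assert [m["role"] for m in messages] == ["user", "assistant", "user", "assistant"]


    asyncio.run(main())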

verl/single_controller/ray/base.py

Lines changed: 5 additions & 4 deletions
@@ -475,7 +475,7 @@ def _unwrap_ray_remote(cls):

 def _nearest_common_base(mros: List):
     last_common = object
-    min_len = min([len(mro) for mro in mros])
+    min_len = min([len(mro) for mro in mros]) - 1  # exclude final derived class

     for i in range(min_len):
         mro = mros[0][i]
@@ -487,15 +487,16 @@ def _nearest_common_base(mros: List):
     return last_common


-def create_colocated_worker_cls(class_dict: dict[str, RayClassWithInitArgs]):
+def create_colocated_worker_cls(class_dict: dict[str, RayClassWithInitArgs], worker_cls: type = None):
     """
     This function should return a class instance that delegates the calls to every
     cls in cls_dict
     """
     cls_dict = {}
     init_args_dict = {}
-    worker_cls = _nearest_common_base(
-        [list(reversed(cls.cls.__ray_actor_class__.__mro__)) for cls in class_dict.values()])
+    if worker_cls is None:
+        worker_cls = _nearest_common_base(
+            [list(reversed(cls.cls.__ray_actor_class__.__mro__)) for cls in class_dict.values()])
     assert issubclass(worker_cls, Worker), f"worker_cls {worker_cls} should be a subclass of Worker"
     print(f"find nearest common base class {worker_cls}")
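
The "- 1" stops the lockstep walk over the reversed MROs one entry short of the shortest MRO, so an input class is never returned as its own common base (a group holding a single worker class now resolves to one of its base classes instead); the new test sidesteps the heuristic entirely by passing worker_cls=Worker. A hedged, standalone restatement of the lookup with toy classes:

    # Toy classes, not verl's actual Worker hierarchy.
    class Worker: ...
    class ActorWorker(Worker): ...
    class RolloutWorker(Worker): ...


    def nearest_common_base(mros):
        last_common = object
        min_len = min(len(mro) for mro in mros) - 1  # exclude final derived class
        for i in range(min_len):
            candidate = mros[0][i]
            if any(mro[i] is not candidate for mro in mros):
                break
            last_common = candidate
        return last_common


    # reversed MROs run object -> ... -> most derived class
    mros = [list(reversed(c.__mro__)) for c in (ActorWorker, RolloutWorker)]
    assert nearest_common_base(mros) is Worker

    # single-class group: the `- 1` keeps the class from being its own answer
    assert nearest_common_base([list(reversed(ActorWorker.__mro__))]) is Worker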

verl/workers/rollout/chat_scheduler.py

Lines changed: 10 additions & 6 deletions
@@ -1,5 +1,4 @@
 import heapq
-from abc import ABC, abstractmethod
 from uuid import uuid4
 from typing import Any, Callable, Dict, List

@@ -12,7 +11,7 @@
 from verl.protocol import DataProto


-class ChatCompletionScheduler(ABC):
+class ChatCompletionScheduler:

     def __init__(self, config: DictConfig, model_path: str, server_addresses: List[str], max_cache_size: int = 10000):
         """
@@ -52,8 +51,16 @@ async def submit_chat_completions(
             **chat_complete_request: dict, request parameters same as OpenAI AsyncCompletions.create.
             OpenAI API reference: https://platform.openai.com/docs/api-reference/chat/create
         """
-        request_id = chat_complete_request.get("extra_headers", {}).get("x-request-id", None)
+        if "extra_headers" not in chat_complete_request:
+            chat_complete_request["extra_headers"] = {}
+
+        extra_headers = chat_complete_request["extra_headers"]
+        request_id = extra_headers.get("x-request-id", None)
         if request_id:
+            if request_id.startswith("chatcmpl-"):
+                request_id = request_id[len("chatcmpl-"):]
+                extra_headers["x-request-id"] = request_id
+
             address = self.request_id_to_address[request_id]
         else:
             address = self.weighted_addresses[0][1]
@@ -62,8 +69,6 @@ async def submit_chat_completions(

             request_id = uuid4().hex
             self.request_id_to_address[request_id] = address
-            if "extra_headers" not in chat_complete_request:
-                chat_complete_request["extra_headers"] = {}
             chat_complete_request["extra_headers"]["x-request-id"] = request_id

         # TODO: OpenAI client uses httpx, seems to have performance issue in high concurrency requests.
@@ -91,6 +96,5 @@ async def _chat_completions_aiohttp(self, address: str, **chat_complete_request)
         finally:
             await session.close()

-    @abstractmethod
     async def generate_sequences(self, prompts: DataProto, **sampling_params) -> DataProto:
         raise NotImplementedError
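
The reworked header handling is what makes multi-turn requests sticky: a follow-up turn sends the previous completion's id back as x-request-id (as the test does with completions.id), and OpenAI-style completion ids carry a "chatcmpl-" prefix, so stripping it recovers the id the scheduler originally issued and the next turn is routed to the same server. A hedged, standalone sketch of that routing decision (hypothetical helper, not the class method):

    from uuid import uuid4


    def resolve_address(extra_headers: dict, request_id_to_address: dict, default_address: str) -> str:
        request_id = extra_headers.get("x-request-id")
        if request_id:
            # follow-up turn: strip the OpenAI-style prefix to recover the issued id
            if request_id.startswith("chatcmpl-"):
                request_id = request_id[len("chatcmpl-"):]
                extra_headers["x-request-id"] = request_id
            return request_id_to_address[request_id]
        # first turn: pick a server and mint a fresh id for later turns to reuse
        request_id = uuid4().hex
        request_id_to_address[request_id] = default_address
        extra_headers["x-request-id"] = request_id
        return default_address


    table = {}
    first = resolve_address({}, table, "server-0")
    rid = next(iter(table))
    second = resolve_address({"x-request-id": f"chatcmpl-{rid}"}, table, "server-1")
    assert first == second == "server-0"  # the follow-up turn sticks to the same server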
