Note that the `sleep` and `wake_up` operations for managing CUDA memory in vLLM are only available when both `VLLM_SERVER_DEV_MODE=1` and `enable_sleep_mode` are set. This capability is particularly useful when the model server shares a machine with a training workload: the reward model service can be temporarily offloaded to free GPU memory during memory-intensive training phases and reloaded when the service is needed again. The relevant vLLM implementation is linked below:
[sleep and wake_up mode](https://github.com/vllm-project/vllm/blob/5a19a6c6705fe83db2e3517a2d2f473586901743/vllm/entrypoints/openai/api_server.py#L994-L1003)
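
As a minimal sketch of how a training loop might drive these dev-mode endpoints over HTTP: the `/sleep` and `/wake_up` routes are exposed by the linked vLLM code when `VLLM_SERVER_DEV_MODE=1`, but the base URL, port, and helper names below are placeholders, not part of this repo:

```python
# Sketch: toggling vLLM's sleep/wake_up dev-mode endpoints around a training step.
# Assumes the server was launched with VLLM_SERVER_DEV_MODE=1 and
# enable_sleep_mode; adjust BASE_URL to your deployment.
import requests

BASE_URL = "http://localhost:8000"  # placeholder server address

def offload_reward_model() -> None:
    # level=1 offloads model weights to CPU and discards the KV cache;
    # level=2 discards the weights as well (they are restored on wake_up).
    resp = requests.post(f"{BASE_URL}/sleep", params={"level": 1})
    resp.raise_for_status()

def reload_reward_model() -> None:
    resp = requests.post(f"{BASE_URL}/wake_up")
    resp.raise_for_status()

if __name__ == "__main__":
    offload_reward_model()   # free GPU memory before the training phase
    # ... run the memory-intensive training step here ...
    reload_reward_model()    # bring the reward model back for scoring
```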
When the backend is configured as `SERVER_BACKEND="VLLM"`, the `USE_OFFLOAD` flag can be toggled between `True` and `False` (see `reward_function.py`), as illustrated below.
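
As a hypothetical illustration of how `USE_OFFLOAD` could gate the offload behavior, the flag might only take effect under the vLLM backend, since only vLLM exposes the sleep/wake_up endpoints. The environment-variable names and defaults below are assumptions for illustration, not the repo's actual implementation in `reward_function.py`:

```python
# Hypothetical sketch: gating offload on the backend and the USE_OFFLOAD flag.
import os

import requests

SERVER_BACKEND = os.environ.get("SERVER_BACKEND", "VLLM")
USE_OFFLOAD = os.environ.get("USE_OFFLOAD", "False") == "True"
BASE_URL = "http://localhost:8000"  # placeholder server address

def maybe_offload() -> None:
    # Offloading is a no-op for non-vLLM backends, which lack the
    # dev-mode /sleep endpoint.
    if SERVER_BACKEND == "VLLM" and USE_OFFLOAD:
        resp = requests.post(f"{BASE_URL}/sleep", params={"level": 1})
        resp.raise_for_status()
```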