
Commit 778e318: resolve comment
1 parent: 36fc97b

File tree

6 files changed: +67 -74 lines


examples/sglang_multiturn/run_qwen3_4b_dapo_multiturn.sh

Lines changed: 5 additions & 4 deletions
@@ -49,7 +49,7 @@ python3 -m verl.trainer.main_ppo \
 data.train_files=$HOME/data/BytedTsinghua-SIA/DAPO-Math-17k \
 data.val_files=$HOME/data/Maxwell-Jia/AIME_2024 \
 data.return_raw_chat=True \
-data.train_batch_size=32 \
+data.train_batch_size=16 \
 data.max_prompt_length=2048 \
 data.max_response_length=16384 \
 data.filter_overlong_prompts=True \
@@ -68,7 +68,7 @@ python3 -m verl.trainer.main_ppo \
 actor_rollout_ref.actor.clip_ratio_c=10.0 \
 actor_rollout_ref.actor.optim.lr=1e-6 \
 actor_rollout_ref.actor.use_dynamic_bsz=False \
-actor_rollout_ref.actor.ppo_mini_batch_size=32 \
+actor_rollout_ref.actor.ppo_mini_batch_size=16 \
 actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=8 \
 actor_rollout_ref.actor.ppo_max_token_len_per_gpu=32768 \
 actor_rollout_ref.rollout.name=sglang \
@@ -84,17 +84,18 @@ python3 -m verl.trainer.main_ppo \
 actor_rollout_ref.rollout.multi_turn.max_assistant_turns=16 \
 actor_rollout_ref.rollout.multi_turn.tool_config_path=$PROJECT_DIR/recipe/retool/sandbox_fusion_tool_config.yaml \
 actor_rollout_ref.rollout.multi_turn.format=hermes \
-actor_rollout_ref.rollout.n=8 \
+actor_rollout_ref.rollout.n=4 \
 actor_rollout_ref.rollout.val_kwargs.top_p=0.6 \
 actor_rollout_ref.rollout.val_kwargs.temperature=1.0 \
 actor_rollout_ref.rollout.val_kwargs.n=30 \
 algorithm.filter_groups.enable=True \
+algorithm.filter_groups.max_num_gen_batches=2 \
 trainer.logger=['console','wandb'] \
 trainer.project_name=sglang-dapo-multiturn \
 trainer.experiment_name=qwen3_4b_sft_dapo_multiturn \
 trainer.n_gpus_per_node=8 \
 trainer.log_val_generations=20 \
-trainer.val_before_train=True \
+trainer.val_before_train=False \
 trainer.nnodes=1 \
 trainer.save_freq=-1 \
 trainer.test_freq=20 \
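
For orientation, these knobs set how many trajectories feed one training step. A hedged back-of-the-envelope in plain Python (not part of the script), using the traj_bsz = train_batch_size * rollout_n relation from the filtering code later in this commit:

# Hedged arithmetic implied by the settings above; not part of the script.
train_batch_size = 16        # data.train_batch_size: prompts per training step
rollout_n = 4                # actor_rollout_ref.rollout.n: responses sampled per prompt
max_num_gen_batches = 2      # algorithm.filter_groups.max_num_gen_batches

traj_bsz = train_batch_size * rollout_n       # 64 trajectories optimized per step
max_sampled = traj_bsz * max_num_gen_batches  # at most 128 trajectories sampled per step
print(traj_bsz, max_sampled)                  # 64 128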

verl/trainer/ppo/ray_trainer.py

Lines changed: 12 additions & 14 deletions
@@ -50,20 +50,18 @@
     compute_timing_metrics,
     process_validation_metrics,
 )
-from verl.trainer.ppo.reward import compute_reward, compute_reward_async
+from verl.trainer.ppo.reward import compute_reward, compute_reward_async, extract_reward_extra_infos
 from verl.trainer.ppo.utils import (
     Role,
     WorkerType,
-    extract_reward_extra_infos,
     need_critic,
     need_reference_policy,
     need_reward_model,
 )
 from verl.utils.checkpoint.checkpoint_manager import find_latest_ckpt_path, should_save_ckpt_esi
 from verl.utils.config import omega_conf_to_dataclass
 from verl.utils.debug import marked_timer
-from verl.utils.filtering import DynamicFilterState
-from verl.utils.filtering.dynamic_filtering import DynamicFilterManager
+from verl.utils.filtering.dynamic_filtering import DynamicFilter
 from verl.utils.metric import reduce_metrics
 from verl.utils.rollout_skip import RolloutSkip
 from verl.utils.seqlen_balancing import get_seqlen_balanced_partitions, log_seqlen_unbalance
@@ -362,9 +360,8 @@ def __init__(
         if self.config.algorithm.use_kl_in_reward:
             self.kl_ctrl_in_reward = core_algos.get_kl_controller(self.config.algorithm.kl_ctrl)
 
-        # initialize dynamic filter manager
-        self.dynamic_filter_manager = (
-            DynamicFilterManager(config=self.config)
+        self.dynamic_filter = (
+            DynamicFilter(config=self.config)
             if self.config.algorithm.filter_groups and self.config.algorithm.filter_groups.enable
             else None
         )
@@ -959,15 +956,16 @@ def fit(self):
             else False
         )
         next_step_profile = False
-        dynamic_filter_state = DynamicFilterState()
+
 
         for epoch in range(self.config.trainer.total_epochs):
             for batch_dict in self.train_dataloader:
                 metrics = {}
                 timing_raw = {}
 
                 # dynamic filter
-                dynamic_filter_state.increment_gen_batches()
+                if self.dynamic_filter:
+                    self.dynamic_filter.increment_gen_batches()
 
                 with marked_timer("start_profile", timing_raw):
                     self._start_profiling(
@@ -1051,16 +1049,15 @@ def fit(self):
                 )
 
                 # Compute reward metrics
-                if dynamic_filter_state.increment_reward_step(self.global_steps):
+                if self.dynamic_filter and self.dynamic_filter.increment_reward_step(self.global_steps):
                     reward_metrics = compute_reward_metrics(batch)
                     metrics.update(reward_metrics)
 
                 # Apply dynamic filtering after reward computation
-                if self.dynamic_filter_manager:
+                if self.dynamic_filter:
                     # Apply dynamic filtering and handle batch accumulation
-                    processed_batch, should_continue = self.dynamic_filter_manager.process_batch_with_filtering(
+                    processed_batch, should_continue = self.dynamic_filter.process_batch_with_filtering(
                         batch,
-                        dynamic_filter_state,
                         self.config,
                     )
 
@@ -1279,7 +1276,8 @@ def fit(self):
                 self.global_steps += 1
 
                 # Reset dynamic filter state for next training step
-                dynamic_filter_state.clear()
+                if self.dynamic_filter:
+                    self.dynamic_filter.clear()
 
                 if (
                     hasattr(self.config.actor_rollout_ref.actor, "profiler")
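
The increment_reward_step() guard above logs reward metrics at most once per global step even when dynamic filtering consumes several generation batches. A minimal runnable sketch of just that bookkeeping (stub class; it mirrors only the method shown in dynamic_filtering.py below, nothing else):

# Stub that mirrors only the reward_step bookkeeping from DynamicFilter.
class _RewardStepGuard:
    def __init__(self) -> None:
        self.reward_step = 0

    def increment_reward_step(self, global_step: int) -> bool:
        """Return True only for the first call at a given global step."""
        if self.reward_step < global_step:
            self.reward_step += 1
            return True
        return False

guard = _RewardStepGuard()
print(guard.increment_reward_step(1))  # True: first generation batch of step 1, log metrics
print(guard.increment_reward_step(1))  # False: extra generation batch, skip duplicate metrics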

verl/trainer/ppo/reward.py

Lines changed: 8 additions & 0 deletions
@@ -186,3 +186,11 @@ def compute_reward_async(data: DataProto, config=None, tokenizer=None, reward_fn
     )
 
     return compute_reward(data, reward_fn)
+
+def extract_reward_extra_infos(batch: DataProto, reward_extra_info_keys: list[str]) -> dict[str, list]:
+    """Extract reward extra info from batch.non_tensor_batch for dump_generations."""
+    reward_extra_infos_dict = {}
+    for key in reward_extra_info_keys:
+        reward_extra_infos_dict[key] = batch.non_tensor_batch[key]
+
+    return reward_extra_infos_dict
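
A hedged usage sketch: the moved helper only reads batch.non_tensor_batch[key], so a duck-typed stand-in is enough to show the shape of the result. The key names ("acc", "num_turns") are illustrative, not taken from this commit; a real call would pass a verl DataProto.

# Duck-typed stand-in for DataProto; only non_tensor_batch is needed here.
from dataclasses import dataclass, field

@dataclass
class _FakeBatch:
    non_tensor_batch: dict = field(default_factory=dict)

def extract_reward_extra_infos(batch, reward_extra_info_keys):
    """Same logic as the helper added above."""
    return {key: batch.non_tensor_batch[key] for key in reward_extra_info_keys}

# "acc" and "num_turns" are hypothetical example keys.
batch = _FakeBatch(non_tensor_batch={"acc": [1.0, 0.0], "num_turns": [3, 5]})
print(extract_reward_extra_infos(batch, ["acc", "num_turns"]))
# {'acc': [1.0, 0.0], 'num_turns': [3, 5]}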

verl/trainer/ppo/utils.py

Lines changed: 0 additions & 9 deletions
@@ -64,12 +64,3 @@ def need_critic(config: DictConfig) -> bool:
         stacklevel=2,
     )
     return False
-
-
-def extract_reward_extra_infos(batch: DataProto, reward_extra_info_keys: list[str]) -> dict[str, list]:
-    """Extract reward extra info from batch.non_tensor_batch for dump_generations."""
-    reward_extra_infos_dict = {}
-    for key in reward_extra_info_keys:
-        reward_extra_infos_dict[key] = batch.non_tensor_batch[key]
-
-    return reward_extra_infos_dict

verl/utils/filtering/__init__.py

Lines changed: 2 additions & 2 deletions
@@ -18,6 +18,6 @@
 # Paper: https://arxiv.org/abs/2503.14476
 # - This implementation references the ReTool implementation: recipe/retool/ in VERL codebase
 
-from .dynamic_filtering import DynamicFilterState
+from .dynamic_filtering import DynamicFilter, keep_mixed_reward
 
-__all__ = ["DynamicFilterState"]
+__all__ = ["DynamicFilter", "keep_mixed_reward"]
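
Downstream code can now import both public names from the package; a trivial check, assuming a verl install that includes this commit:

# Assumes verl (with this commit) is installed; exercises only the new exports.
from verl.utils.filtering import DynamicFilter, keep_mixed_reward

print(DynamicFilter.__name__, keep_mixed_reward.__name__)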

verl/utils/filtering/dynamic_filtering.py

Lines changed: 40 additions & 45 deletions
@@ -27,18 +27,39 @@
 from verl import DataProto
 
 
-@dataclass
-class DynamicFilterState:
-    """State tracking for dynamic filtering during batch processing."""
+class DynamicFilter:
+    """Unified class for handling dynamic filtering during training with state management."""
 
-    num_gen_batches: int = 0
-    num_prompt_in_batch: int = 0
-    accumulated_batch: Optional[DataProto] = None
-    reward_step: int = 0
+    def __init__(self, config):
+        """Initialize the dynamic filter.
+
+        Args:
+            config: configuration from ray_trainer
+        """
+        # Configuration attributes
+        self.metric = config.algorithm.filter_groups.metric
+        self.filter_kwargs = config.algorithm.filter_groups.filter_kwargs
+        self.custom_filter_func = None
+        self.filter_function = config.algorithm.filter_groups.filter_function
+
+        # State attributes
+        self.num_gen_batches: int = 0
+        self.num_prompt_in_batch: int = 0
+        self.accumulated_batch: Optional[DataProto] = None
+        self.reward_step: int = 0
+
+        assert not config.reward_model.launch_reward_fn_async, (
+            "Dynamic filter has not supported async reward function yet."
+        )
+
+        if self.filter_function:
+            # Import custom filter function
+            module_path, func_name = self.filter_function.rsplit(".", 1)
+            module = importlib.import_module(module_path)
+            self.custom_filter_func = getattr(module, func_name)
 
     def clear(self) -> None:
         """Reset all state variables for the next training step."""
-
         if self.num_gen_batches > 0:
             print(f"Dynamic Filter: Used {self.num_gen_batches} generation batches to complete this step")
 
@@ -48,6 +69,7 @@ def clear(self) -> None:
         self.reward_step = 0
 
     def increment_reward_step(self, global_step) -> bool:
+        """Increment the reward step if it's less than the global step."""
         if self.reward_step < global_step:
             self.reward_step += 1
             return True
@@ -67,40 +89,13 @@ def accumulate_batch(self, batch: DataProto) -> None:
             batch if self.accumulated_batch is None else DataProto.concat([self.accumulated_batch, batch])
         )
 
-
-@dataclass
-class DynamicFilterManager:
-    """Manager class for handling dynamic filtering during training."""
-
-    def __init__(self, config):
-        """Initialize the filter manager.
-
-        Args:
-            config: configuration from ray_trainer
-        """
-        self.metric = config.algorithm.filter_groups.metric
-        self.filter_kwargs = config.algorithm.filter_groups.filter_kwargs
-        self.custom_filter_func = None
-        self.filter_function = config.algorithm.filter_groups.filter_function
-
-        assert not config.reward_model.launch_reward_fn_async, (
-            "Dynamic filter has not supported async reward function yet."
-        )
-
-        if self.filter_function:
-            # Import custom filter function
-            module_path, func_name = self.filter_function.rsplit(".", 1)
-            module = importlib.import_module(module_path)
-            self.custom_filter_func = getattr(module, func_name)
-
     def process_batch_with_filtering(
-        self, batch: DataProto, dynamic_filter_state: "DynamicFilterState", config
+        self, batch: DataProto, config
     ) -> tuple[DataProto, bool]:
         """Process a batch with dynamic filtering and accumulation logic.
 
         Args:
             batch: The input batch to process
-            dynamic_filter_state: State object tracking filtering progress
             config: configuration from ray_trainer
 
         Returns:
@@ -151,24 +146,24 @@ def process_batch_with_filtering(
 
         # Filter the batch and update state
        filtered_batch = batch[kept_traj_idxs]
-        dynamic_filter_state.add_prompts(kept_prompts_this_batch)
-        dynamic_filter_state.accumulate_batch(filtered_batch)
+        self.add_prompts(kept_prompts_this_batch)
+        self.accumulate_batch(filtered_batch)
 
         # Check if we have enough prompts or reached max generation batches
         if (
-            dynamic_filter_state.num_prompt_in_batch < train_batch_size
-            and dynamic_filter_state.num_gen_batches < max_num_gen_batches
+            self.num_prompt_in_batch < train_batch_size
+            and self.num_gen_batches < max_num_gen_batches
         ):
             return None, True  # Continue collecting more batches
 
         # If we reached max generation batches but still don't have enough prompts,
         # repeat batch content to fill the deficit
-        if dynamic_filter_state.num_gen_batches >= max_num_gen_batches:
-            prompt_deficit = train_batch_size - dynamic_filter_state.num_prompt_in_batch
-            repeated_batch = dynamic_filter_state.accumulated_batch[: prompt_deficit * rollout_n]
-            final_batch = DataProto.concat([dynamic_filter_state.accumulated_batch, repeated_batch])
+        if self.num_gen_batches >= max_num_gen_batches:
+            prompt_deficit = train_batch_size - self.num_prompt_in_batch
+            repeated_batch = self.accumulated_batch[: prompt_deficit * rollout_n]
+            final_batch = DataProto.concat([self.accumulated_batch, repeated_batch])
         else:
-            final_batch = dynamic_filter_state.accumulated_batch
+            final_batch = self.accumulated_batch
 
         # Align the batch to the expected trajectory batch size
         traj_bsz = train_batch_size * rollout_n
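
To make the accumulation arithmetic above concrete, here is a minimal toy sketch using plain Python lists in place of DataProto; the final trim to traj_bsz is implied by the trailing comment of this hunk rather than shown in it.

# Toy illustration of the deficit-padding logic above, with plain lists
# standing in for DataProto. Numbers mirror the example script
# (train_batch_size=16, rollout.n=4, max_num_gen_batches=2).
train_batch_size, rollout_n, max_num_gen_batches = 16, 4, 2
traj_bsz = train_batch_size * rollout_n            # 64 trajectories per step

num_prompt_in_batch = 12                           # prompts kept by the filter so far
num_gen_batches = 2                                # generation rounds already spent
accumulated = list(range(num_prompt_in_batch * rollout_n))  # 48 kept trajectories

if num_gen_batches >= max_num_gen_batches:
    # Out of generation budget: repeat kept trajectories to fill the deficit.
    prompt_deficit = train_batch_size - num_prompt_in_batch   # 4 prompts short
    repeated = accumulated[: prompt_deficit * rollout_n]      # 16 repeated trajectories
    final = accumulated + repeated
else:
    final = accumulated

final = final[:traj_bsz]                           # align to the trajectory batch size
assert len(final) == traj_bsz                      # 64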
