Commits (78 total; changes shown from 69 commits)
cf00afa
Added dynamic filter
PrinsYin Jul 26, 2025
d800627
Addedrunner
PrinsYin Jul 26, 2025
9bb815f
dynamic filter
PrinsYin Jul 26, 2025
f5ffc02
dynamic filter no kl
PrinsYin Jul 26, 2025
20a822c
extra info logic update
PrinsYin Jul 26, 2025
78b44f7
logging and run script
PrinsYin Jul 27, 2025
ecf1c21
added metric train reward
PrinsYin Jul 28, 2025
72eb9a9
1
PrinsYin Aug 7, 2025
82fb1ae
1
PrinsYin Aug 7, 2025
1ce98dd
1
PrinsYin Aug 7, 2025
df7f92d
Merge mainline
Hecate0821 Aug 9, 2025
c33d341
Add script for Qwen3-4b dapo
Hecate0821 Aug 9, 2025
e57fb25
Clean Up
Hecate0821 Aug 9, 2025
9644809
Fix Pre-commit
Hecate0821 Aug 9, 2025
8a4ba8f
Fix: Comments
Hecate0821 Aug 12, 2025
a2bfbbf
Fix:naming
Hecate0821 Aug 17, 2025
e539fde
merge main
Hecate0821 Aug 17, 2025
2ac9fcb
Merge branch 'main' into ds_nokl
zhaochenyang20 Aug 25, 2025
8442353
Add filter for all negative and positive
PrinsYin Aug 11, 2025
a527b77
Fix pre-commit
Hecate0821 Aug 25, 2025
d26af35
Fix script
Hecate0821 Aug 25, 2025
d4eefac
Fix script
Hecate0821 Aug 25, 2025
55d87ad
Add config in megatron yaml
Hecate0821 Aug 25, 2025
4aa7087
Fix config
Hecate0821 Aug 25, 2025
ddfdd8f
Fix pre-commit
PrinsYin Aug 25, 2025
3f478c9
Fix CI
PrinsYin Aug 25, 2025
235c12d
1
PrinsYin Aug 26, 2025
09d60cb
1
PrinsYin Aug 26, 2025
4f46906
1
PrinsYin Aug 26, 2025
7df117b
refactor
PrinsYin Aug 26, 2025
a8692b2
refactor
PrinsYin Aug 26, 2025
1ae4dbe
refactor
PrinsYin Aug 26, 2025
cd14168
refactor
PrinsYin Aug 26, 2025
19c4fd3
Fix script
Hecate0821 Aug 26, 2025
6c96fe6
script
PrinsYin Aug 27, 2025
7ff23e9
script
PrinsYin Aug 27, 2025
a9ae9e6
Make filter a class
Hecate0821 Aug 27, 2025
703f783
Fix config
Hecate0821 Aug 27, 2025
d4aeb94
Merge branch 'main' into ds_nokl
zhaochenyang20 Aug 27, 2025
09a58a5
moved filter
PrinsYin Aug 28, 2025
d45d5b6
Fix traienr
Hecate0821 Aug 28, 2025
fc935ff
fixed improt
PrinsYin Aug 28, 2025
1adbf67
Fix add back init
Hecate0821 Aug 28, 2025
c582114
Fix naming
Hecate0821 Aug 28, 2025
ef493a5
Fix precommit
Hecate0821 Aug 28, 2025
5977e2a
Update verl/trainer/ppo/ray_trainer.py
Hecate0821 Aug 29, 2025
f65470b
Update verl/trainer/ppo/ray_trainer.py
Hecate0821 Aug 29, 2025
0ffa3bc
Fix: Centrelize settings
Hecate0821 Aug 29, 2025
1e9f49f
Merge branch 'ds_nokl' of https://github.com/PrinsYin/verl into ds_nokl
Hecate0821 Aug 29, 2025
23bdc68
Fix: concentrate the reward logic
Hecate0821 Aug 30, 2025
64ae27f
Fix: make _extract_reward_extra_infos utility function
Hecate0821 Aug 30, 2025
8a72b98
Fix: extract utility function
Hecate0821 Aug 30, 2025
f1e1de2
Fix: Clean
Hecate0821 Aug 30, 2025
9af93ef
Fix: Clean
Hecate0821 Aug 30, 2025
40cd018
Fix: Add credit
Hecate0821 Aug 30, 2025
fbb5020
Fix: Clean
Hecate0821 Aug 30, 2025
d371c97
Fix: Pre-commit
Hecate0821 Aug 30, 2025
ed1520d
Fix: response mask
Hecate0821 Aug 30, 2025
cdc4eb0
Fix: Licence
Hecate0821 Aug 31, 2025
aca70ba
1
PrinsYin Aug 31, 2025
42ac13a
1
PrinsYin Aug 31, 2025
7409e30
fix
PrinsYin Aug 31, 2025
36fc97b
Update verl/trainer/ppo/ray_trainer.py
PrinsYin Aug 31, 2025
da4ab8b
resolve comment
PrinsYin Aug 31, 2025
cc34f16
fix extra info
PrinsYin Aug 31, 2025
08f4c04
fix extra info
PrinsYin Aug 31, 2025
3ab4752
Fix pre-commit
Hecate0821 Aug 31, 2025
31887c4
Fix pre-commit
Hecate0821 Aug 31, 2025
2115f70
Merge branch 'main' into ds_nokl
zhaochenyang20 Sep 3, 2025
8c74805
Fix naming
Hecate0821 Sep 4, 2025
cd5093b
Fix comments
Hecate0821 Sep 10, 2025
c5778c1
Fix repeat traj bug and zip issue
Hecate0821 Sep 12, 2025
809d37a
Fix redundant if
Hecate0821 Sep 12, 2025
1f928e4
1
PrinsYin Sep 12, 2025
e603155
1
PrinsYin Sep 12, 2025
c1ca9c1
1
PrinsYin Sep 12, 2025
8346efb
Merge branch 'volcengine:main' into ds_nokl
zhaochenyang20 Sep 17, 2025
9617d09
Merge branch 'main' into ds_nokl
zhaochenyang20 Oct 1, 2025
1 change: 1 addition & 0 deletions examples/sglang_multiturn/run_qwen3_4b_dapo_multiturn.sh
@@ -88,6 +88,7 @@ python3 -m verl.trainer.main_ppo \
actor_rollout_ref.rollout.val_kwargs.top_p=0.6 \
actor_rollout_ref.rollout.val_kwargs.temperature=1.0 \
actor_rollout_ref.rollout.val_kwargs.n=30 \
algorithm.filter_groups.enable=True \
trainer.logger=['console','wandb'] \
trainer.project_name=sglang-dapo-multiturn \
trainer.experiment_name=qwen3_4b_sft_dapo_multiturn \
13 changes: 10 additions & 3 deletions verl/trainer/config/_generated_ppo_megatron_trainer.yaml
@@ -432,10 +432,14 @@ reward_model:
override_transformer_config: ${oc.select:actor_rollout_ref.actor.megatron.override_transformer_config,{}}
use_mbridge: ${oc.select:actor_rollout_ref.actor.megatron.use_mbridge,False}
load_weight: true
custom_reward_function:
path: null
name: compute_score
algorithm:
filter_groups:
_target_: verl.trainer.config.FilterGroupsConfig
enable: false
metric: seq_reward
max_num_gen_batches: 10
filter_function: verl.utils.filtering.dynamic_filtering.keep_mixed_reward
filter_kwargs: {}
_target_: verl.trainer.config.AlgoConfig
gamma: 1.0
lam: 1.0
@@ -453,6 +457,9 @@ algorithm:
pf_ppo:
reweight_method: pow
weight_pow: 2.0
custom_reward_function:
path: null
name: compute_score
trainer:
balance_batch: true
total_epochs: 30
13 changes: 10 additions & 3 deletions verl/trainer/config/_generated_ppo_trainer.yaml
@@ -394,10 +394,14 @@ reward_model:
save_path: ${oc.select:global_profiler.save_path,null}
tool_config: ${oc.select:actor_rollout_ref.actor.profiler.tool_config,null}
ulysses_sequence_parallel_size: 1
custom_reward_function:
path: null
name: compute_score
algorithm:
filter_groups:
_target_: verl.trainer.config.FilterGroupsConfig
enable: false
metric: seq_reward
max_num_gen_batches: 10
filter_function: verl.utils.filtering.dynamic_filtering.keep_mixed_reward
filter_kwargs: {}
_target_: verl.trainer.config.AlgoConfig
gamma: 1.0
lam: 1.0
@@ -415,6 +419,9 @@ algorithm:
pf_ppo:
reweight_method: pow
weight_pow: 2.0
custom_reward_function:
path: null
name: compute_score
trainer:
balance_batch: true
total_epochs: 30
11 changes: 9 additions & 2 deletions verl/trainer/config/algorithm.py
@@ -48,12 +48,19 @@ class FilterGroupsConfig(BaseConfig):
Args:
enable (bool): Whether to enable filter groups.
metric (Optional[str]): Metric to use for filtering: "acc", "score", "seq_reward", "seq_final_reward", etc.
max_num_gen_batches (int): Non-positive values mean no upper limit.
max_num_gen_batches (int): Maximum number of backfill attempts when collecting diverse responses.
Non-positive values mean no upper limit (use with caution).
filter_function (Optional[str]): Path to filter function (e.g., "my_module.my_filter_func").
Required when filter_groups is enabled. For the original mixed rewards
filter, use "verl.utils.filtering.dynamic_filtering.keep_mixed_reward".
filter_kwargs (Optional[dict]): Additional arguments for the filter function.
"""

enable: bool = False
metric: Optional[str] = None
max_num_gen_batches: int = 0
filter_function: Optional[str] = "verl.utils.filtering.dynamic_filtering.keep_mixed_reward"
filter_kwargs: Optional[dict] = field(default_factory=dict)


@dataclass
@@ -72,7 +79,7 @@ class AlgoConfig(BaseConfig):
kl_ctrl (KLControlConfig): KL control configuration.
use_pf_ppo (bool): Whether to enable preference feedback PPO.
pf_ppo (dict[str, Any]): Preference feedback PPO settings.
filter_groups (Optional[FilterGroupsConfig]): Filter groups configuration, used in DAPO and Entropy
filter_groups (Optional[FilterGroupsConfig]): Dynamic filter configuration, used in DAPO and Entropy
"""

gamma: float = 1.0
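The docstring above requires filter_function to be an importable path and forwards filter_kwargs to it, but the body of keep_mixed_reward, and the exact call signature DynamicFilter expects, is not part of this diff. The following is a hedged sketch only: the argument layout and both function names are assumptions, meant to illustrate the shape such a group-level filter could take.

from collections.abc import Sequence


def keep_when_rewards_disagree(group_rewards: Sequence[float], **filter_kwargs) -> bool:
    """Keep a prompt group only if its rollouts received non-identical rewards
    (the idea behind the default keep_mixed_reward filter)."""
    return len(set(group_rewards)) > 1


def keep_when_reward_std_above(group_rewards: Sequence[float], min_std: float = 0.1, **filter_kwargs) -> bool:
    """Custom variant: keep groups whose reward spread is at least min_std;
    min_std would be supplied via algorithm.filter_groups.filter_kwargs."""
    mean = sum(group_rewards) / len(group_rewards)
    std = (sum((r - mean) ** 2 for r in group_rewards) / len(group_rewards)) ** 0.5
    return std >= min_std

A custom filter of this shape would then be selected with algorithm.filter_groups.filter_function=my_module.keep_when_reward_std_above (my_module is a placeholder) and tuned through algorithm.filter_groups.filter_kwargs.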
26 changes: 26 additions & 0 deletions verl/trainer/config/algorithm/filter_groups.yaml
@@ -0,0 +1,26 @@
# Format checks enforced on CI:
# 1. Comments must appear above each field.
# 2. There must be a blank line between each field.
# 3. Inline comments (after a field on the same line) are not allowed.
# 4. Indentation level is respected for nested fields.

# Dynamic filter for DAPO: filters out homogeneous groups, keeps diverse responses

# Required when using verl.utils.omega_conf_to_dataclass to instantiate dataclass configs
_target_: verl.trainer.config.FilterGroupsConfig

# Whether to enable dynamic filter
enable: False

# Metric to use for dynamic filter: currently only "seq_reward" is supported
metric: seq_reward

# Maximum number of backfill attempts when collecting diverse responses
# If set to 0 or negative, allows unlimited backfill attempts (use with caution)
max_num_gen_batches: 10

# Default filter function for mixed reward filtering
filter_function: verl.utils.filtering.dynamic_filtering.keep_mixed_reward

# Additional arguments for the filter function
filter_kwargs: {}
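The _target_ comment above points at verl's omega_conf_to_dataclass helper, which this PR also imports in ray_trainer.py (from verl.utils.config). Below is a minimal sketch of how such a block could be materialized into a FilterGroupsConfig, assuming the helper resolves _target_ when no explicit dataclass type is passed; that call pattern is not shown in this diff.

from omegaconf import OmegaConf

from verl.utils.config import omega_conf_to_dataclass

# In-memory mirror of the YAML above, built only for illustration.
filter_cfg = OmegaConf.create(
    {
        "_target_": "verl.trainer.config.FilterGroupsConfig",
        "enable": True,
        "metric": "seq_reward",
        "max_num_gen_batches": 10,
        "filter_function": "verl.utils.filtering.dynamic_filtering.keep_mixed_reward",
        "filter_kwargs": {},
    }
)

# Instantiate the dataclass named by _target_.
filter_groups = omega_conf_to_dataclass(filter_cfg)
print(filter_groups.enable, filter_groups.metric, filter_groups.max_num_gen_batches)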
2 changes: 2 additions & 0 deletions verl/trainer/config/ppo_megatron_trainer.yaml
@@ -16,6 +16,8 @@ defaults:
- critic@critic: megatron_critic
# Reward model config.
- reward_model@reward_model: megatron_reward_model
# Algorithm filter groups config.
- algorithm/[email protected]_groups
- _self_

actor_rollout_ref:
3 changes: 3 additions & 0 deletions verl/trainer/config/ppo_trainer.yaml
@@ -27,6 +27,9 @@ defaults:
# Reward model config.
- reward_model@reward_model: dp_reward_model

# Algorithm filter groups config.
- algorithm/[email protected]_groups

# load the reference default config, then apply the fields in the current yaml
# self config override anything above
- _self_
33 changes: 33 additions & 0 deletions verl/trainer/ppo/metric_utils.py
@@ -488,3 +488,36 @@ def process_validation_metrics(
data_src2var2metric2val[data_source][var_name][metric_name] = np.mean(uid_vals)

return data_src2var2metric2val


def compute_reward_metrics(batch: DataProto) -> dict[str, Any]:
"""
Computes reward-related metrics from a batch of data for PPO training.

This function computes metrics from the RAW batch BEFORE any dynamic filtering
is applied. When using dynamic filtering (DAPO), this captures the reward distribution
of ALL generated responses, including those that will be filtered out for being too
homogeneous. This provides insight into the raw reward signal quality before diversity
filtering removes low-variance response groups.

This function calculates statistics (mean, std, max, min) for sequence-level rewards
derived from token-level scores.

Args:
batch: A DataProto object containing batch data with token-level scores

Returns:
A dictionary of reward metrics including:
- before_filtering/reward/mean: Mean sequence reward (pre-filtering)
- before_filtering/reward/std: Standard deviation of sequence rewards (pre-filtering)
- before_filtering/reward/max: Maximum sequence reward (pre-filtering)
- before_filtering/reward/min: Minimum sequence reward (pre-filtering)
"""
seq_reward_tensor = batch.batch["token_level_scores"].sum(-1)

return {
"before_filtering/reward/mean": seq_reward_tensor.mean().detach().item(),
"before_filtering/reward/std": seq_reward_tensor.std().detach().item(),
"before_filtering/reward/max": seq_reward_tensor.max().detach().item(),
"before_filtering/reward/min": seq_reward_tensor.min().detach().item(),
}
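For intuition, the toy tensor below (not verl code) reproduces the sequence-level reduction used above: token_level_scores is summed over the token dimension to give one scalar reward per sequence, and the four metrics are plain statistics of that vector.

import torch

# Three sequences, three token positions; only a few tokens carry score mass.
token_level_scores = torch.tensor(
    [
        [0.0, 0.0, 1.0],  # sequence 0 -> reward 1.0
        [0.0, 0.0, 0.0],  # sequence 1 -> reward 0.0
        [0.0, 1.0, 1.0],  # sequence 2 -> reward 2.0
    ]
)

seq_reward = token_level_scores.sum(-1)
print(seq_reward.mean().item(), seq_reward.std().item(), seq_reward.max().item(), seq_reward.min().item())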
96 changes: 75 additions & 21 deletions verl/trainer/ppo/ray_trainer.py
@@ -45,20 +45,28 @@
from verl.trainer.ppo.core_algos import AdvantageEstimator, agg_loss
from verl.trainer.ppo.metric_utils import (
compute_data_metrics,
compute_reward_metrics,
compute_throughout_metrics,
compute_timing_metrics,
process_validation_metrics,
)
from verl.trainer.ppo.reward import compute_reward, compute_reward_async
from verl.trainer.ppo.utils import Role, WorkerType, need_critic, need_reference_policy, need_reward_model
from verl.trainer.ppo.reward import compute_reward, compute_reward_async, extract_reward_extra_infos
from verl.trainer.ppo.utils import (
Role,
WorkerType,
need_critic,
need_reference_policy,
need_reward_model,
)
from verl.utils.checkpoint.checkpoint_manager import find_latest_ckpt_path, should_save_ckpt_esi
from verl.utils.config import omega_conf_to_dataclass
from verl.utils.debug import marked_timer
from verl.utils.filtering.dynamic_filtering import DynamicFilter
from verl.utils.metric import reduce_metrics
from verl.utils.rollout_skip import RolloutSkip
from verl.utils.seqlen_balancing import get_seqlen_balanced_partitions, log_seqlen_unbalance
from verl.utils.torch_functional import masked_mean
from verl.utils.tracking import ValidationGenerationsLogger
from verl.utils.tracking import Tracking, ValidationGenerationsLogger


@dataclass
@@ -352,6 +360,12 @@ def __init__(
if self.config.algorithm.use_kl_in_reward:
self.kl_ctrl_in_reward = core_algos.get_kl_controller(self.config.algorithm.kl_ctrl)

self.dynamic_filter = (
DynamicFilter(config=self.config)
if self.config.algorithm.filter_groups and self.config.algorithm.filter_groups.enable
else None
)

self._create_dataloader(train_dataset, val_dataset, collate_fn, train_sampler)

def _create_dataloader(self, train_dataset, val_dataset, collate_fn, train_sampler: Optional[Sampler]):
@@ -909,15 +923,12 @@ def fit(self):
"""
from omegaconf import OmegaConf

from verl.utils.tracking import Tracking

logger = Tracking(
project_name=self.config.trainer.project_name,
experiment_name=self.config.trainer.experiment_name,
default_backend=self.config.trainer.logger,
config=OmegaConf.to_container(self.config, resolve=True),
)

self.global_steps = 0

# load checkpoint before doing anything
@@ -958,6 +969,10 @@ def fit(self):
metrics = {}
timing_raw = {}

# dynamic filter
if self.dynamic_filter:
self.dynamic_filter.increment_gen_batches()

with marked_timer("start_profile", timing_raw):
self._start_profiling(
not prev_step_profile and curr_step_profile
@@ -1017,27 +1032,56 @@

if "response_mask" not in batch.batch.keys():
batch.batch["response_mask"] = compute_response_mask(batch)
# Balance the number of valid tokens across DP ranks.
# NOTE: This usually changes the order of data in the `batch`,
# which won't affect the advantage calculation (since it's based on uid),
# but might affect the loss calculation (due to the change of mini-batching).
# TODO: Decouple the DP balancing and mini-batching.
if self.config.trainer.balance_batch:
self._balance_batch(batch, metrics=metrics)

# compute global_valid tokens
batch.meta_info["global_token_num"] = torch.sum(batch.batch["attention_mask"], dim=-1).tolist()

# Compute all reward scores in one consolidated block
with marked_timer("reward", timing_raw, color="yellow"):
# compute reward model score
if self.use_rm and "rm_scores" not in batch.batch.keys():
reward_tensor = self.rm_wg.compute_rm_score(batch)
batch = batch.union(reward_tensor)

# compute reward function score
reward_extra_infos_dict = {}
if self.config.reward_model.launch_reward_fn_async:
future_reward = compute_reward_async.remote(data=batch, reward_fn=self.reward_fn)
else:
reward_tensor, reward_extra_infos_dict = compute_reward(batch, self.reward_fn)
# Set token_level_scores immediately for sync case (needed for compute_reward_metrics)
batch.batch["token_level_scores"] = reward_tensor

if reward_extra_infos_dict:
batch.non_tensor_batch.update(
{k: np.array(v) for k, v in reward_extra_infos_dict.items()}
)

# Compute reward metrics
if self.dynamic_filter and self.dynamic_filter.increment_reward_step(self.global_steps):
reward_metrics = compute_reward_metrics(batch)
metrics.update(reward_metrics)

# Apply dynamic filtering after reward computation
if self.dynamic_filter:
# Apply dynamic filtering and handle batch accumulation
processed_batch, should_continue = self.dynamic_filter.process_batch_with_filtering(
batch,
self.config,
)

if should_continue:
continue

batch = processed_batch

# Balance the number of valid tokens across DP ranks.
# NOTE: This usually changes the order of data in the `batch`,
# which won't affect the advantage calculation (since it's based on uid),
# but might affect the loss calculation (due to the change of mini-batching).
# TODO: Decouple the DP balancing and mini-batching.
if self.config.trainer.balance_batch:
self._balance_batch(batch, metrics=metrics)

# compute global_valid tokens
batch.meta_info["global_token_num"] = torch.sum(batch.batch["attention_mask"], dim=-1).tolist()

# recompute old_log_probs
with marked_timer("old_log_prob", timing_raw, color="blue"):
@@ -1074,14 +1118,16 @@

with marked_timer("adv", timing_raw, color="brown"):
# we combine with rule-based rm
reward_extra_infos_dict: dict[str, list]
if self.config.reward_model.launch_reward_fn_async:
reward_tensor, reward_extra_infos_dict = ray.get(future_reward)
batch.batch["token_level_scores"] = reward_tensor

if reward_extra_infos_dict:
batch.non_tensor_batch.update({k: np.array(v) for k, v in reward_extra_infos_dict.items()})
# Set token_level_scores for async case
batch.batch["token_level_scores"] = reward_tensor

if reward_extra_infos_dict:
batch.non_tensor_batch.update(
{k: np.array(v) for k, v in reward_extra_infos_dict.items()}
)
# For sync case, token_level_scores and extra_infos are already set above
# compute rewards. apply_kl_penalty if available
if self.config.algorithm.use_kl_in_reward:
batch, kl_metrics = apply_kl_penalty(
@@ -1135,6 +1181,10 @@ def fit(self):
for item in batch
]

reward_extra_infos_dict = extract_reward_extra_infos(
batch, set(reward_extra_infos_dict.keys())
)

if "request_id" in batch.non_tensor_batch:
reward_extra_infos_dict.setdefault(
"request_id",
@@ -1223,6 +1273,10 @@ def fit(self):
progress_bar.update(1)
self.global_steps += 1

# Reset dynamic filter state for next training step
if self.dynamic_filter:
self.dynamic_filter.clear()

if (
hasattr(self.config.actor_rollout_ref.actor, "profiler")
and self.config.actor_rollout_ref.actor.profiler.tool == "torch_memory"
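DynamicFilter itself is defined in verl/utils/filtering/dynamic_filtering.py and its implementation is not part of this diff. From the trainer call sites above, its surface looks roughly like the sketch below; the method names come from the trainer code, while the state fields, bodies, and comments are assumptions.

from typing import Any


class DynamicFilterSketch:
    """Rough interface inferred from ray_trainer.py; not the real implementation."""

    def __init__(self, config: Any):
        self.config = config
        self.num_gen_batches = 0       # backfill attempts in the current training step
        self.accumulated_batch = None  # kept trajectories waiting for a full batch

    def increment_gen_batches(self) -> None:
        # Called once per generation round; presumably checked against
        # algorithm.filter_groups.max_num_gen_batches.
        self.num_gen_batches += 1

    def increment_reward_step(self, global_step: int) -> bool:
        # Returns True when pre-filtering reward metrics should be logged this step.
        return True

    def process_batch_with_filtering(self, batch: Any, config: Any) -> tuple[Any, bool]:
        # Drops homogeneous groups, merges survivors with earlier leftovers, and
        # returns (batch, should_continue); should_continue=True asks the trainer
        # to generate another rollout batch before training on an undersized batch.
        raise NotImplementedError

    def clear(self) -> None:
        # Resets accumulation state once a training step completes.
        self.num_gen_batches = 0
        self.accumulated_batch = None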
9 changes: 9 additions & 0 deletions verl/trainer/ppo/reward.py
@@ -186,3 +186,12 @@ def compute_reward_async(data: DataProto, config=None, tokenizer=None, reward_fn
)

return compute_reward(data, reward_fn)


def extract_reward_extra_infos(batch: DataProto, reward_extra_info_keys: list[str]) -> dict[str, list]:
"""Extract reward extra info from batch.non_tensor_batch for dump_generations."""
reward_extra_infos_dict = {}
for key in reward_extra_info_keys:
reward_extra_infos_dict[key] = batch.non_tensor_batch[key]

return reward_extra_infos_dict
23 changes: 23 additions & 0 deletions verl/utils/filtering/__init__.py
@@ -0,0 +1,23 @@
# Copyright 2024 Bytedance Ltd. and/or its affiliates
# Copyright 2023-2024 SGLang Team
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# Reference:
# - DAPO: An Open-Source LLM Reinforcement Learning System at Scale
# Paper: https://arxiv.org/abs/2503.14476
# - This implementation references the ReTool implementation: recipe/retool/ in VERL codebase

from .dynamic_filtering import DynamicFilter, keep_mixed_reward

__all__ = ["DynamicFilter", "keep_mixed_reward"]