Skip to content

Commit 8ffa750

Browse files
authored
FEAT: Decoupled CLIP ratio (DAPO Trick-I) (#285)
* FEAT: add CLIP_higher (DAPO Trick-I) * Change default value for eps_clip_higher * rewrite logic in functional (CLIP higher) * try formatting * try formatting * modify formula
1 parent d1b297a commit 8ffa750

File tree

3 files changed

+17
-1
lines changed

3 files changed

+17
-1
lines changed

areal/api/cli_args.py

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -258,6 +258,12 @@ class PPOActorConfig(TrainEngineConfig):
258258
eps_clip: float = field(
259259
default=0.2, metadata={"help": "Clipping factor for policy ratio"}
260260
)
261+
eps_clip_higher: Optional[float] = field(
262+
default=None,
263+
metadata={
264+
"help": "Clipping factor (higher value) for policy ratio. Default is None. When eps_clip_higher is set (decoupled), eps_clip will be used as the lower value."
265+
},
266+
)
261267
c_clip: Optional[float] = field(
262268
default=None,
263269
metadata={

areal/engine/ppo/actor.py

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -226,6 +226,7 @@ def ppo_update(self, data: TensorDict) -> List[Dict[str, float]]:
226226
grpo_loss_fn,
227227
temperature=self.temperature,
228228
eps_clip=self.config.eps_clip,
229+
eps_clip_higher=self.config.eps_clip_higher,
229230
c_clip=self.config.c_clip,
230231
behav_imp_weight_cap=self.config.behav_imp_weight_cap,
231232
),
@@ -262,6 +263,7 @@ def grpo_loss_fn(
262263
input_data: Dict,
263264
temperature: float,
264265
eps_clip: float,
266+
eps_clip_higher: float | None,
265267
c_clip: float | None,
266268
behav_imp_weight_cap: float | None,
267269
):
@@ -282,6 +284,7 @@ def grpo_loss_fn(
282284
old_logprobs=old_logp,
283285
advantages=advantages,
284286
eps_clip=eps_clip,
287+
eps_clip_higher=eps_clip_higher,
285288
loss_mask=loss_mask,
286289
c_clip=c_clip,
287290
proximal_logprobs=prox_logp,

areal/utils/functional.py

Lines changed: 8 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -126,6 +126,7 @@ def ppo_actor_loss_fn(
126126
advantages: torch.Tensor,
127127
eps_clip: float,
128128
loss_mask: torch.Tensor,
129+
eps_clip_higher: Optional[float] = None,
129130
c_clip: Optional[float] = None,
130131
behav_imp_weight_cap: Optional[float] = None,
131132
) -> Tuple[torch.Tensor, Dict]:
@@ -139,7 +140,13 @@ def ppo_actor_loss_fn(
139140
"""
140141
loss_mask_count = loss_mask.count_nonzero() or 1
141142
ratio = torch.where(loss_mask, torch.exp(logprobs - proximal_logprobs), 0)
142-
clipped_ratio = torch.clamp(ratio, 1.0 - eps_clip, 1.0 + eps_clip)
143+
144+
clipped_ratio = torch.clamp(
145+
ratio,
146+
1.0 - eps_clip,
147+
1.0 + (eps_clip if eps_clip_higher is None else eps_clip_higher),
148+
)
149+
143150
pg_loss1 = -advantages * ratio
144151
pg_loss2 = -advantages * clipped_ratio
145152
clip_mask = pg_loss1.detach() < pg_loss2.detach()

0 commit comments

Comments
 (0)