Skip to content

Commit 880688e

Browse files
committed
add comments
1 parent 7432a21 commit 880688e

File tree

2 files changed

+15
-4
lines changed

2 files changed

+15
-4
lines changed

verl/trainer/ppo/ray_trainer.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -56,7 +56,7 @@
5656
from verl.utils.debug import marked_timer
5757
from verl.utils.metric import reduce_metrics
5858
from verl.utils.rollout_skip import RolloutSkip
59-
from verl.utils.seqlen_balancing import get_seqlen_balanced_partitions, log_seqlen_unbalance
59+
from verl.utils.seqlen_balancing import calculate_workload, get_seqlen_balanced_partitions, log_seqlen_unbalance
6060
from verl.utils.torch_functional import masked_mean
6161
from verl.utils.tracking import ValidationGenerationsLogger
6262

@@ -905,7 +905,7 @@ def _balance_batch(self, batch: DataProto, metrics, logging_prefix="global_seqle
905905
attention_mask = batch.batch["attention_mask"]
906906
batch_size = attention_mask.shape[0]
907907
global_seqlen_lst = batch.batch["attention_mask"].view(batch_size, -1).sum(-1) # (train_batch_size,)
908-
global_seqlen_lst = global_seqlen_lst**2 + global_seqlen_lst * 33024
908+
global_seqlen_lst = calculate_workload(global_seqlen_lst)
909909
world_size = self.actor_rollout_wg.world_size
910910
global_partition_lst = get_seqlen_balanced_partitions(
911911
global_seqlen_lst, k_partitions=world_size, equal_size=True

verl/utils/seqlen_balancing.py

Lines changed: 13 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -24,6 +24,16 @@
2424
from verl.utils.device import get_device_name
2525

2626

27+
def calculate_workload(seqlen_list: list[int]):
28+
"""
29+
Calculate the workload for a dense transformer block based on sequence length.
30+
FLOPs = 12 * hidden_size^2 * seqlen + 2 * hidden_size * seqlen^2
31+
Hardcodes the constants based on a 7B model (hidden_size=4096),
32+
so the FLOPs are proportional to (6 * 4096 * seqlen + seqlen^2).
33+
"""
34+
return 24576 * seqlen_list + seqlen_list**2
35+
36+
2737
def karmarkar_karp(seqlen_list: list[int], k_partitions: int, equal_size: bool):
2838
# see: https://en.wikipedia.org/wiki/Largest_differencing_method
2939
class Set:
@@ -300,8 +310,8 @@ def rearrange_micro_batches(
300310

301311
assert num_micro_batches <= len(seq_len_effective)
302312

303-
# approximate the workload by Attention and MLP FLOPs
304-
workloads = seq_len_effective**2 + seq_len_effective * 33024
313+
# Approximate workload using transformer FLOPs model
314+
workloads = calculate_workload(seq_len_effective)
305315
micro_bsz_idx = get_seqlen_balanced_partitions(workloads, num_micro_batches, equal_size=False)
306316

307317
if use_dynamic_bsz_balance:
@@ -313,6 +323,7 @@ def rearrange_micro_batches(
313323
),
314324
reverse=True,
315325
)
326+
# Place smaller micro-batches at both ends to reduce the bubbles exposed during the warm-up and cool-down.
316327
micro_bsz_idx = micro_bsz_idx[::2][::-1] + micro_bsz_idx[1::2]
317328

318329
micro_batches = []

0 commit comments

Comments
 (0)