Skip to content

Commit 599eabe

Browse files
chenhaiq and vermouth1992
authored and committed
[fsdp] fix: set _set_allocator_settings to True to avoid fsdp2 oom (volcengine#3020)
### What does this PR do? Enable expandable_segments to avoid the increasing memory fragmentation caused by temporary variables during the training process of fsdp2, which may trigger probabilistic out-of-memory (OOM) errors. Since neither sglang nor vllm can work with expandable_segments:True, it has to be turned off during rollout. ### Test Without this fix, memory reserved could be very high after compute_log_prob or update_actor. ``` (WorkerDict pid=339320) [2025-08-11 17:43:01] dp actor After compute_log_prob, memory allocated (GB): 5.53, memory reserved (GB): 73.59, device memory used/total (GB): 77.47/79.15 ``` With this fix, it stays low during training. ``` (WorkerDict pid=396879) [2025-08-12 07:39:42] dp actor After compute_log_prob, memory allocated (GB): 4.95, memory reserved (GB): 14.20, device memory used/total (GB): 17.72/79.15 ``` --------- Co-authored-by: narutolhy <[email protected]> Co-authored-by: Chi Zhang <[email protected]>
1 parent ef49344 commit 599eabe

File tree

3 files changed

+28
-2
lines changed

3 files changed

+28
-2
lines changed

verl/utils/device.py

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -84,3 +84,12 @@ def get_nccl_backend() -> str:
8484
return "hccl"
8585
else:
8686
raise RuntimeError(f"No available nccl backend found on device type {get_device_name()}.")
87+
88+
89+
def set_expandable_segments(enable: bool) -> None:
    """Enable or disable expandable segments for cuda.

    Expandable segments reduce allocator fragmentation (used here to avoid
    FSDP2 training OOMs); callers turn it off during rollout because the
    inference engines do not support it.

    Args:
        enable (bool): Whether to enable expandable segments. Used to avoid OOM.
    """
    # No-op on non-CUDA devices; the allocator knob only exists for CUDA.
    if not is_cuda_available:
        return
    # NOTE: private torch API — behaves like PYTORCH_CUDA_ALLOC_CONF at runtime.
    torch.cuda.memory._set_allocator_settings(f"expandable_segments:{enable}")

verl/workers/sharding_manager/fsdp_sglang.py

Lines changed: 10 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -26,7 +26,7 @@
2626

2727
from verl import DataProto
2828
from verl.protocol import all_gather_data_proto
29-
from verl.utils.device import get_device_id, get_torch_device
29+
from verl.utils.device import get_device_id, get_torch_device, set_expandable_segments
3030
from verl.utils.fsdp_utils import fsdp_version, load_fsdp_model_to_gpu, offload_fsdp_model_to_cpu
3131
from verl.utils.model import convert_weight_keys
3232
from verl.utils.profiler import GPUMemoryLogger, log_gpu_memory_usage, simple_timer
@@ -144,6 +144,10 @@ async def wake_up(self):
144144

145145
log_gpu_memory_usage("After offload_param in sharding manager memory", logger=logger)
146146

147+
# sglang needs _set_allocator_settings to be set to False
148+
logger.debug("fsdp sglang sharding_manager _set_allocator_settings to False")
149+
set_expandable_segments(False)
150+
147151
if self.device_mesh["infer_tp"].get_local_rank() == 0 and self.rollout_config.free_cache_engine:
148152
if self.multi_stage_wake_up:
149153
await self.inference_engine.resume_memory_occupation(tags=["weights"])
@@ -185,6 +189,11 @@ async def sleep(self):
185189
# add empty cache after each compute
186190
get_torch_device().empty_cache()
187191

192+
# always set _set_allocator_settings to True when using sglang
193+
# it is required by fsdp2 to avoid oom
194+
logger.debug("fsdp sglang sharding_manager _set_allocator_settings to True")
195+
set_expandable_segments(True)
196+
188197
# restore random states
189198
if self.device_mesh is not None:
190199
self.gen_random_states = get_torch_device().get_rng_state()

verl/workers/sharding_manager/fsdp_vllm.py

Lines changed: 9 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -34,7 +34,7 @@
3434
from verl.protocol import all_gather_data_proto
3535
from verl.third_party.vllm import LLM
3636
from verl.third_party.vllm import parallel_state as vllm_ps
37-
from verl.utils.device import get_device_id, get_device_name, get_torch_device
37+
from verl.utils.device import get_device_id, get_device_name, get_torch_device, set_expandable_segments
3838
from verl.utils.fsdp_utils import (
3939
fsdp_version,
4040
layered_summon_lora_params,
@@ -210,6 +210,10 @@ def __collect_lora_params() -> OrderedDict:
210210
offload_fsdp_model_to_cpu(self.module)
211211
log_gpu_memory_usage("After state_dict() in sharding manager memory", logger=logger)
212212

213+
# vllm needs _set_allocator_settings to be set to False
214+
logger.debug("fsdp vllm sharding_manager _set_allocator_settings to False")
215+
set_expandable_segments(False)
216+
213217
if self.rollout_config.free_cache_engine:
214218
if "tags" in inspect.signature(self.inference_engine.wake_up).parameters:
215219
self.inference_engine.wake_up(tags=["weights"])
@@ -245,6 +249,10 @@ def __exit__(self, exc_type, exc_value, traceback):
245249
# add empty cache after each compute
246250
get_torch_device().empty_cache()
247251

252+
# _set_allocator_settings to True is required by fsdp2 to avoid oom
253+
logger.debug("fsdp vllm sharding_manager _set_allocator_settings to True")
254+
set_expandable_segments(True)
255+
248256
# restore random states
249257
if self.device_mesh is not None:
250258
self.gen_random_states = get_torch_device().get_rng_state()

0 commit comments

Comments
 (0)