-
Notifications
You must be signed in to change notification settings - Fork 2.3k
Open
Description
I encountered the following CUDA error while using async SGLang for multiturn GRPO RL.
[2025-05-20 18:43:20 TP3] Scheduler hit an exception: Traceback (most recent call last):
File "/home/jobuser/.local/lib/python3.10/site-packages/sglang/srt/managers/scheduler.py", line 2269, in run_scheduler_process
scheduler.event_loop_overlap()
File "/home/jobuser/.local/lib/python3.10/site-packages/torch/utils/_contextlib.py", line 116, in decorate_context
return func(*args, **kwargs)
File "/home/jobuser/.local/lib/python3.10/site-packages/sglang/srt/managers/scheduler.py", line 658, in event_loop_overlap
batch = self.get_next_batch_to_run()
File "/home/jobuser/.local/lib/python3.10/site-packages/sglang/srt/managers/scheduler.py", line 1304, in get_next_batch_to_run
self.running_batch = self.update_running_batch(self.running_batch)
File "/home/jobuser/.local/lib/python3.10/site-packages/sglang/srt/managers/scheduler.py", line 1502, in update_running_batch
batch.prepare_for_decode()
File "/home/jobuser/.local/lib/python3.10/site-packages/sglang/srt/managers/schedule_batch.py", line 1461, in prepare_for_decode
locs = self.seq_lens.clone()
RuntimeError: CUDA error: an illegal memory access was encountered
CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect.
For debugging consider passing CUDA_LAUNCH_BLOCKING=1
Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions.
I consistently see this error in the validation step before training when I use `QwQ-32b` or `Qwen3-32b`, but everything works well when training with `Qwen2.5-32b-instruct`. To verify whether this is actually the OOM issue mentioned in the SGLang docs, I reduced the validation dataset to only 1 data point, but I still hit the same error when the TP size is 4 or 8. Here is an overview of the GPU memory usage when the error happened, which does not look like an OOM to me (only ~65% GPU memory utilization before the failure):
I am using up-to-date Verl and have the following dependencies installed:
torch==2.6.0+cu124
torch_memory_saver==0.0.6
transformers==4.51.1
sgl-kernel==0.1.2.post1
sglang==0.4.6.post4
ray==2.46.0
numpy==1.26.4
nvidia-cublas-cu12==12.4.5.8
nvidia-cuda-cupti-cu12==12.4.127
nvidia-cuda-nvrtc-cu12==12.4.127
nvidia-cuda-runtime-cu12==12.4.127
nvidia-cudnn-cu12==9.1.0.70
nvidia-cufft-cu12==11.2.1.3
nvidia-curand-cu12==10.3.5.147
nvidia-cusolver-cu12==11.6.1.9
nvidia-cusparse-cu12==12.3.1.170
nvidia-cusparselt-cu12==0.6.2
nvidia-ml-py==12.570.86
nvidia-nccl-cu12==2.21.5
nvidia-nvjitlink-cu12==12.4.127
nvidia-nvtx-cu12==12.4.127
flash_attn==2.7.4.post1+cu12torch2.6cxx11abiFALSE
Here is my full training config:
{
"actor_rollout_ref": {
"actor": {
"checkpoint": {
"contents": ["model", "optimizer", "extra"]
},
"clip_ratio": 0.2,
"clip_ratio_c": 3.0,
"clip_ratio_high": 0.2,
"clip_ratio_low": 0.2,
"entropy_coeff": 0,
"fsdp_config": {
"fsdp_size": -1,
"offload_policy": false,
"optimizer_offload": true,
"param_offload": true,
"reshard_after_forward": true,
"wrap_policy": {"min_num_params": 0}
},
"grad_clip": 1.0,
"kl_loss_coef": 0.001,
"kl_loss_type": "low_var_kl",
"loss_agg_mode": "token-mean",
"optim": {
"lr": 1e-6,
"lr_warmup_steps": -1,
"lr_warmup_steps_ratio": 0.0,
"min_lr_ratio": 0.0,
"num_cycles": 0.5,
"total_training_steps": -1,
"warmup_style": "constant",
"weight_decay": 0.01
},
"ppo_epochs": 1,
"ppo_max_token_len_per_gpu": 16384,
"ppo_micro_batch_size": null,
"ppo_micro_batch_size_per_gpu": 1,
"ppo_mini_batch_size": 8,
"shuffle": false,
"strategy": "fsdp",
"ulysses_sequence_parallel_size": 1,
"use_dynamic_bsz": false,
"use_kl_loss": true,
"use_torch_compile": true
},
"hybrid_engine": true,
"model": {
"enable_gradient_checkpointing": true,
"external_lib": null,
"override_config": {},
"path": "/shared/public/elr-models/Qwen/QwQ-32B/976055f8c83f394f35dbd3ab09a285a984907bd0",
"trust_remote_code": false,
"use_fused_kernels": false,
"use_liger": false,
"use_remove_padding": true
},
"ref": {
"fsdp_config": {
"param_offload": true,
"reshard_after_forward": true,
"wrap_policy": {"min_num_params": 0}
},
"log_prob_max_token_len_per_gpu": 16384,
"log_prob_micro_batch_size": null,
"log_prob_micro_batch_size_per_gpu": 32,
"log_prob_use_dynamic_bsz": false,
"strategy": "fsdp",
"ulysses_sequence_parallel_size": 1,
"use_torch_compile": true
},
"rollout": {
"chat_scheduler": null,
"disable_log_stats": true,
"do_sample": true,
"dtype": "bfloat16",
"enable_chunked_prefill": true,
"enforce_eager": true,
"engine_kwargs": {"swap_space": null},
"free_cache_engine": true,
"gpu_memory_utilization": 0.6,
"ignore_eos": false,
"load_format": "dummy_dtensor",
"log_prob_max_token_len_per_gpu": 16384,
"log_prob_micro_batch_size": null,
"log_prob_micro_batch_size_per_gpu": 32,
"log_prob_use_dynamic_bsz": false,
"max_model_len": null,
"max_num_batched_tokens": 8192,
"max_num_seqs": 1024,
"mode": "sync",
"multi_turn": {
"enable": true,
"format": "qwen",
"max_turns": 5,
"tool_config_path": null,
"tool_list": "tools.tool_list"
},
"n": 8,
"name": "sglang_async",
"prompt_length": 4096,
"response_length": 1024,
"temperature": 1.0,
"tensor_model_parallel_size": 8,
"top_k": -1,
"top_p": 1,
"use_fire_sampling": false,
"val_kwargs": {
"do_sample": false,
"n": 1,
"temperature": 0,
"top_k": -1,
"top_p": 1.0
}
}
},
"algorithm": {
"adv_estimator": "grpo",
"gamma": 1.0,
"kl_ctrl": {
"horizon": 10000,
"kl_coef": 0.001,
"target_kl": 0.1,
"type": "fixed"
},
"kl_penalty": "kl",
"lam": 1.0,
"norm_adv_by_std_in_grpo": true,
"use_kl_in_reward": false
},
"critic": {
"checkpoint": {"contents": ["model", "optimizer", "extra"]},
"cliprange_value": 0.5,
"forward_max_token_len_per_gpu": 32768,
"forward_micro_batch_size": null,
"forward_micro_batch_size_per_gpu": null,
"grad_clip": 1.0,
"model": {
"enable_gradient_checkpointing": true,
"external_lib": null,
"fsdp_config": {
"fsdp_size": -1,
"offload_policy": false,
"optimizer_offload": false,
"param_offload": false,
"reshard_after_forward": true,
"wrap_policy": {"min_num_params": 0}
},
"override_config": {},
"path": "~/models/deepseek-llm-7b-chat",
"tokenizer_path": "/shared/public/elr-models/Qwen/QwQ-32B/976055f8c83f394f35dbd3ab09a285a984907bd0",
"trust_remote_code": false,
"use_remove_padding": false
},
"optim": {
"lr": 1e-5,
"lr_warmup_steps_ratio": 0.0,
"min_lr_ratio": null,
"total_training_steps": -1,
"warmup_style": "constant",
"weight_decay": 0.01
},
"ppo_epochs": 1,
"ppo_max_token_len_per_gpu": 32768,
"ppo_micro_batch_size": null,
"ppo_micro_batch_size_per_gpu": null,
"ppo_mini_batch_size": 8,
"rollout_n": 8,
"shuffle": false,
"strategy": "fsdp",
"ulysses_sequence_parallel_size": 1,
"use_dynamic_bsz": false
},
"custom_reward_function": {
"name": "compute_score",
"path": "...",
"reward_kwargs": {}
},
"data": {
"custom_cls": {"name": null, "path": null},
"filter_overlong_prompts": true,
"filter_overlong_prompts_workers": 1,
"image_key": "images",
"max_prompt_length": 4096,
"max_response_length": 1024,
"prompt_key": "prompt",
"return_full_prompt": false,
"return_raw_chat": true,
"return_raw_input_ids": false,
"reward_fn_key": "data_source",
"shuffle": true,
"tokenizer": null,
"train_batch_size": 8,
"train_files": "...",
"truncation": "error",
"val_batch_size": 1,
"val_files": "...",
"video_key": "videos"
},
"ray_init": {
"num_cpus": null
},
"reward_manager": "naive",
"sandbox_fusion": {
"max_concurrent": 64,
"url": null
},
"strategy": "fsdp",
"ulysses_sequence_parallel_size": 1,
"use_dynamic_bsz": false,
"trainer": {
"balance_batch": true,
"critic_warmup": 0,
"default_hdfs_dir": null,
"default_local_dir": "...",
"del_local_ckpt_after_load": false,
"experiment_name": "qwq-32b",
"log_val_generations": 0,
"logger": ["mlflow"],
"max_actor_ckpt_to_keep": null,
"max_critic_ckpt_to_keep": null,
"n_gpus_per_node": 8,
"nnodes": 1,
"project_name": "yajiang-project/multiturn",
"ray_wait_register_center_timeout": 300,
"resume_from_path": null,
"resume_mode": "disable",
"rollout_data_dir": null,
"save_freq": 10,
"test_freq": 10,
"total_epochs": 5,
"total_training_steps": null,
"val_before_train": true,
"validation_data_dir": null
},
"reward_model": {
"enable": false,
"forward_max_token_len_per_gpu": 32768,
"launch_reward_fn_async": false,
"max_length": null,
"micro_batch_size": null,
"micro_batch_size_per_gpu": null,
"model": {
"external_lib": null,
"fsdp_config": {
"fsdp_size": -1,
"param_offload": false,
"reshard_after_forward": true,
"wrap_policy": {"min_num_params": 0}
},
"input_tokenizer": "/shared/public/elr-models/Qwen/QwQ-32B/976055f8c83f394f35dbd3ab09a285a984907bd0",
"path": "~/models/FsfairX-LLaMA3-RM-v0.1",
"trust_remote_code": false,
"use_remove_padding": false
}
}
}
eranhirs, gau-nernst and ligang-cs
Metadata
Metadata
Assignees
Labels
No labels