Skip to content

[SGLang Async Rollout] CUDA error: an illegal memory access was encountered #1611

@jybsuper

Description

@jybsuper

I encountered the following CUDA error while using async SGLang rollout for multi-turn GRPO RL.

[2025-05-20 18:43:20 TP3] Scheduler hit an exception: Traceback (most recent call last):
  File "/home/jobuser/.local/lib/python3.10/site-packages/sglang/srt/managers/scheduler.py", line 2269, in run_scheduler_process
    scheduler.event_loop_overlap()
  File "/home/jobuser/.local/lib/python3.10/site-packages/torch/utils/_contextlib.py", line 116, in decorate_context
    return func(*args, **kwargs)
  File "/home/jobuser/.local/lib/python3.10/site-packages/sglang/srt/managers/scheduler.py", line 658, in event_loop_overlap
    batch = self.get_next_batch_to_run()
  File "/home/jobuser/.local/lib/python3.10/site-packages/sglang/srt/managers/scheduler.py", line 1304, in get_next_batch_to_run
    self.running_batch = self.update_running_batch(self.running_batch)
  File "/home/jobuser/.local/lib/python3.10/site-packages/sglang/srt/managers/scheduler.py", line 1502, in update_running_batch
    batch.prepare_for_decode()
  File "/home/jobuser/.local/lib/python3.10/site-packages/sglang/srt/managers/schedule_batch.py", line 1461, in prepare_for_decode
    locs = self.seq_lens.clone()
RuntimeError: CUDA error: an illegal memory access was encountered
CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect.
For debugging consider passing CUDA_LAUNCH_BLOCKING=1
Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions.

I consistently see this error in the validation step before training when I use QwQ-32b or Qwen3-32b, but everything works well when training with Qwen2.5-32b-instruct. To check whether this is actually an OOM issue, as mentioned in the SGLang docs, I reduced the validation dataset to only 1 data point but still hit the same error when the TP size is 4 or 8. Here is an overview of the GPU memory usage when the error happened, which does not look like an OOM to me (only ~65% GPU memory utilization before the failure):

Image

I am using up-to-date Verl and have the following dependencies installed:

torch==2.6.0+cu124
torch_memory_saver==0.0.6
transformers==4.51.1
sgl-kernel==0.1.2.post1
sglang==0.4.6.post4
ray==2.46.0
numpy==1.26.4
nvidia-cublas-cu12==12.4.5.8
nvidia-cuda-cupti-cu12==12.4.127
nvidia-cuda-nvrtc-cu12==12.4.127
nvidia-cuda-runtime-cu12==12.4.127
nvidia-cudnn-cu12==9.1.0.70
nvidia-cufft-cu12==11.2.1.3
nvidia-curand-cu12==10.3.5.147
nvidia-cusolver-cu12==11.6.1.9
nvidia-cusparse-cu12==12.3.1.170
nvidia-cusparselt-cu12==0.6.2
nvidia-ml-py==12.570.86
nvidia-nccl-cu12==2.21.5
nvidia-nvjitlink-cu12==12.4.127
nvidia-nvtx-cu12==12.4.127
flash_attn==2.7.4.post1+cu12torch2.6cxx11abiFALSE

Here is my full training config:

{
  "actor_rollout_ref": {
    "actor": {
      "checkpoint": {
        "contents": ["model", "optimizer", "extra"]
      },
      "clip_ratio": 0.2,
      "clip_ratio_c": 3.0,
      "clip_ratio_high": 0.2,
      "clip_ratio_low": 0.2,
      "entropy_coeff": 0,
      "fsdp_config": {
        "fsdp_size": -1,
        "offload_policy": false,
        "optimizer_offload": true,
        "param_offload": true,
        "reshard_after_forward": true,
        "wrap_policy": {"min_num_params": 0}
      },
      "grad_clip": 1.0,
      "kl_loss_coef": 0.001,
      "kl_loss_type": "low_var_kl",
      "loss_agg_mode": "token-mean",
      "optim": {
        "lr": 1e-6,
        "lr_warmup_steps": -1,
        "lr_warmup_steps_ratio": 0.0,
        "min_lr_ratio": 0.0,
        "num_cycles": 0.5,
        "total_training_steps": -1,
        "warmup_style": "constant",
        "weight_decay": 0.01
      },
      "ppo_epochs": 1,
      "ppo_max_token_len_per_gpu": 16384,
      "ppo_micro_batch_size": null,
      "ppo_micro_batch_size_per_gpu": 1,
      "ppo_mini_batch_size": 8,
      "shuffle": false,
      "strategy": "fsdp",
      "ulysses_sequence_parallel_size": 1,
      "use_dynamic_bsz": false,
      "use_kl_loss": true,
      "use_torch_compile": true
    },
    "hybrid_engine": true,
    "model": {
      "enable_gradient_checkpointing": true,
      "external_lib": null,
      "override_config": {},
      "path": "/shared/public/elr-models/Qwen/QwQ-32B/976055f8c83f394f35dbd3ab09a285a984907bd0",
      "trust_remote_code": false,
      "use_fused_kernels": false,
      "use_liger": false,
      "use_remove_padding": true
    },
    "ref": {
      "fsdp_config": {
        "param_offload": true,
        "reshard_after_forward": true,
        "wrap_policy": {"min_num_params": 0}
      },
      "log_prob_max_token_len_per_gpu": 16384,
      "log_prob_micro_batch_size": null,
      "log_prob_micro_batch_size_per_gpu": 32,
      "log_prob_use_dynamic_bsz": false,
      "strategy": "fsdp",
      "ulysses_sequence_parallel_size": 1,
      "use_torch_compile": true
    },
    "rollout": {
      "chat_scheduler": null,
      "disable_log_stats": true,
      "do_sample": true,
      "dtype": "bfloat16",
      "enable_chunked_prefill": true,
      "enforce_eager": true,
      "engine_kwargs": {"swap_space": null},
      "free_cache_engine": true,
      "gpu_memory_utilization": 0.6,
      "ignore_eos": false,
      "load_format": "dummy_dtensor",
      "log_prob_max_token_len_per_gpu": 16384,
      "log_prob_micro_batch_size": null,
      "log_prob_micro_batch_size_per_gpu": 32,
      "log_prob_use_dynamic_bsz": false,
      "max_model_len": null,
      "max_num_batched_tokens": 8192,
      "max_num_seqs": 1024,
      "mode": "sync",
      "multi_turn": {
        "enable": true,
        "format": "qwen",
        "max_turns": 5,
        "tool_config_path": null,
        "tool_list": "tools.tool_list"
      },
      "n": 8,
      "name": "sglang_async",
      "prompt_length": 4096,
      "response_length": 1024,
      "temperature": 1.0,
      "tensor_model_parallel_size": 8,
      "top_k": -1,
      "top_p": 1,
      "use_fire_sampling": false,
      "val_kwargs": {
        "do_sample": false,
        "n": 1,
        "temperature": 0,
        "top_k": -1,
        "top_p": 1.0
      }
    }
  },
  "algorithm": {
    "adv_estimator": "grpo",
    "gamma": 1.0,
    "kl_ctrl": {
      "horizon": 10000,
      "kl_coef": 0.001,
      "target_kl": 0.1,
      "type": "fixed"
    },
    "kl_penalty": "kl",
    "lam": 1.0,
    "norm_adv_by_std_in_grpo": true,
    "use_kl_in_reward": false
  },
  "critic": {
    "checkpoint": {"contents": ["model", "optimizer", "extra"]},
    "cliprange_value": 0.5,
    "forward_max_token_len_per_gpu": 32768,
    "forward_micro_batch_size": null,
    "forward_micro_batch_size_per_gpu": null,
    "grad_clip": 1.0,
    "model": {
      "enable_gradient_checkpointing": true,
      "external_lib": null,
      "fsdp_config": {
        "fsdp_size": -1,
        "offload_policy": false,
        "optimizer_offload": false,
        "param_offload": false,
        "reshard_after_forward": true,
        "wrap_policy": {"min_num_params": 0}
      },
      "override_config": {},
      "path": "~/models/deepseek-llm-7b-chat",
      "tokenizer_path": "/shared/public/elr-models/Qwen/QwQ-32B/976055f8c83f394f35dbd3ab09a285a984907bd0",
      "trust_remote_code": false,
      "use_remove_padding": false
    },
    "optim": {
      "lr": 1e-5,
      "lr_warmup_steps_ratio": 0.0,
      "min_lr_ratio": null,
      "total_training_steps": -1,
      "warmup_style": "constant",
      "weight_decay": 0.01
    },
    "ppo_epochs": 1,
    "ppo_max_token_len_per_gpu": 32768,
    "ppo_micro_batch_size": null,
    "ppo_micro_batch_size_per_gpu": null,
    "ppo_mini_batch_size": 8,
    "rollout_n": 8,
    "shuffle": false,
    "strategy": "fsdp",
    "ulysses_sequence_parallel_size": 1,
    "use_dynamic_bsz": false
  },
  "custom_reward_function": {
    "name": "compute_score",
    "path": "...",
    "reward_kwargs": {}
  },
  "data": {
    "custom_cls": {"name": null, "path": null},
    "filter_overlong_prompts": true,
    "filter_overlong_prompts_workers": 1,
    "image_key": "images",
    "max_prompt_length": 4096,
    "max_response_length": 1024,
    "prompt_key": "prompt",
    "return_full_prompt": false,
    "return_raw_chat": true,
    "return_raw_input_ids": false,
    "reward_fn_key": "data_source",
    "shuffle": true,
    "tokenizer": null,
    "train_batch_size": 8,
    "train_files": "...",
    "truncation": "error",
    "val_batch_size": 1,
    "val_files": "...",
    "video_key": "videos"
  },
  "ray_init": {
    "num_cpus": null
  },
  "reward_manager": "naive",
  "sandbox_fusion": {
    "max_concurrent": 64,
    "url": null
  },
  "strategy": "fsdp",
  "ulysses_sequence_parallel_size": 1,
  "use_dynamic_bsz": false,
  "trainer": {
    "balance_batch": true,
    "critic_warmup": 0,
    "default_hdfs_dir": null,
    "default_local_dir": "...",
    "del_local_ckpt_after_load": false,
    "experiment_name": "qwq-32b",
    "log_val_generations": 0,
    "logger": ["mlflow"],
    "max_actor_ckpt_to_keep": null,
    "max_critic_ckpt_to_keep": null,
    "n_gpus_per_node": 8,
    "nnodes": 1,
    "project_name": "yajiang-project/multiturn",
    "ray_wait_register_center_timeout": 300,
    "resume_from_path": null,
    "resume_mode": "disable",
    "rollout_data_dir": null,
    "save_freq": 10,
    "test_freq": 10,
    "total_epochs": 5,
    "total_training_steps": null,
    "val_before_train": true,
    "validation_data_dir": null
  },
  "reward_model": {
    "enable": false,
    "forward_max_token_len_per_gpu": 32768,
    "launch_reward_fn_async": false,
    "max_length": null,
    "micro_batch_size": null,
    "micro_batch_size_per_gpu": null,
    "model": {
      "external_lib": null,
      "fsdp_config": {
        "fsdp_size": -1,
        "param_offload": false,
        "reshard_after_forward": true,
        "wrap_policy": {"min_num_params": 0}
      },
      "input_tokenizer": "/shared/public/elr-models/Qwen/QwQ-32B/976055f8c83f394f35dbd3ab09a285a984907bd0",
      "path": "~/models/FsfairX-LLaMA3-RM-v0.1",
      "trust_remote_code": false,
      "use_remove_padding": false
    }
  }
}

Metadata

Metadata

Assignees

Labels

No labels
No labels

Type

No type

Projects

No projects

Milestone

No milestone

Relationships

None yet

Development

No branches or pull requests

Issue actions