-
Notifications
You must be signed in to change notification settings - Fork 2.3k
Description
System Info
💡 VERSION: both verl4.1 and verl5
python3 -m verl.trainer.main_ppo
algorithm.adv_estimator=grpo
data.train_files=/home/user/wjy/multi-agent-rl/verl/verl/data/gsm8k/train.parquet
data.val_files=/home/user/wjy/multi-agent-rl/verl/verl/data/gsm8k/test.parquet
data.train_batch_size=32
data.max_prompt_length=512
data.max_response_length=512
data.filter_overlong_prompts=True
data.truncation='error'
data.shuffle=False
actor_rollout_ref.model.path=/home/user/wjy/multi-agent-rl/verl/verl/model/Qwen/Qwen2.5-3B-Instruct
actor_rollout_ref.model.use_shm=False
actor_rollout_ref.model.lora_rank=8
actor_rollout_ref.model.lora_alpha=32
actor_rollout_ref.actor.optim.lr=3e-6
actor_rollout_ref.model.use_remove_padding=True
actor_rollout_ref.actor.ppo_mini_batch_size=4
actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=2
actor_rollout_ref.actor.use_kl_loss=True
actor_rollout_ref.actor.kl_loss_coef=0.001
actor_rollout_ref.actor.kl_loss_type=low_var_kl
actor_rollout_ref.actor.entropy_coeff=0
actor_rollout_ref.model.enable_gradient_checkpointing=True
actor_rollout_ref.actor.fsdp_config.param_offload=False
actor_rollout_ref.actor.fsdp_config.optimizer_offload=False
actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=2
actor_rollout_ref.rollout.tensor_model_parallel_size=2
actor_rollout_ref.rollout.name=vllm
actor_rollout_ref.rollout.gpu_memory_utilization=0.6
actor_rollout_ref.rollout.n=4
actor_rollout_ref.rollout.load_format=safetensors
actor_rollout_ref.rollout.layered_summon=True
actor_rollout_ref.ref.log_prob_micro_batch_size_per_gpu=2
actor_rollout_ref.ref.fsdp_config.param_offload=True
algorithm.use_kl_in_reward=False
trainer.critic_warmup=0
trainer.logger=['console','tensorboard']
trainer.project_name='verl_grpo_example_gsm8k'
trainer.experiment_name='qwen2.5_3b_grpo_lora'
trainer.n_gpus_per_node=4
trainer.nnodes=1
trainer.save_freq=50
trainer.test_freq=20
trainer.total_epochs=1 $@
🪄Conda list
packages in environment at /home/user/miniconda3/envs/verl:
Name Version Build Channel
_libgcc_mutex 0.1 main
_openmp_mutex 5.1 1_gnu
absl-py 2.3.1 pypi_0 pypi
accelerate 1.10.1 pypi_0 pypi
addict 2.4.0 pypi_0 pypi
aiohappyeyeballs 2.6.1 pypi_0 pypi
aiohttp 3.12.15 pypi_0 pypi
aiohttp-cors 0.8.1 pypi_0 pypi
aiosignal 1.4.0 pypi_0 pypi
airportsdata 20250909 pypi_0 pypi
annotated-types 0.7.0 pypi_0 pypi
anthropic 0.66.0 pypi_0 pypi
antlr4-python3-runtime 4.9.3 pypi_0 pypi
anyio 4.10.0 pypi_0 pypi
argon2-cffi 25.1.0 pypi_0 pypi
argon2-cffi-bindings 25.1.0 pypi_0 pypi
arrow 1.3.0 pypi_0 pypi
astor 0.8.1 pypi_0 pypi
asttokens 3.0.0 pypi_0 pypi
async-lru 2.0.5 pypi_0 pypi
async-timeout 5.0.1 pypi_0 pypi
attrs 25.3.0 pypi_0 pypi
av 15.1.0 pypi_0 pypi
babel 2.17.0 pypi_0 pypi
beautifulsoup4 4.13.5 pypi_0 pypi
blake3 1.0.5 pypi_0 pypi
bleach 6.2.0 pypi_0 pypi
bzip2 1.0.8 h5eee18b_6
ca-certificates 2025.7.15 h06a4308_0
cachetools 5.5.2 pypi_0 pypi
certifi 2025.8.3 pypi_0 pypi
cffi 2.0.0 pypi_0 pypi
cfgv 3.4.0 pypi_0 pypi
charset-normalizer 3.4.3 pypi_0 pypi
click 8.2.1 pypi_0 pypi
cloudpickle 3.1.1 pypi_0 pypi
codetiming 1.4.0 pypi_0 pypi
colorful 0.5.7 pypi_0 pypi
comm 0.2.3 pypi_0 pypi
compressed-tensors 0.9.3 pypi_0 pypi
cuda-bindings 13.0.1 pypi_0 pypi
cuda-pathfinder 1.2.2 pypi_0 pypi
cuda-python 13.0.1 pypi_0 pypi
cupy-cuda12x 13.6.0 pypi_0 pypi
datasets 4.0.0 pypi_0 pypi
debugpy 1.8.16 pypi_0 pypi
decorator 5.2.1 pypi_0 pypi
decord 0.6.0 pypi_0 pypi
defusedxml 0.7.1 pypi_0 pypi
deprecated 1.2.18 pypi_0 pypi
depyf 0.18.0 pypi_0 pypi
dill 0.3.8 pypi_0 pypi
diskcache 5.6.3 pypi_0 pypi
distlib 0.4.0 pypi_0 pypi
distro 1.9.0 pypi_0 pypi
dnspython 2.8.0 pypi_0 pypi
einops 0.8.1 pypi_0 pypi
email-validator 2.3.0 pypi_0 pypi
exceptiongroup 1.3.0 pypi_0 pypi
executing 2.2.1 pypi_0 pypi
fastapi 0.116.1 pypi_0 pypi
fastapi-cli 0.0.10 pypi_0 pypi
fastapi-cloud-cli 0.1.5 pypi_0 pypi
fastjsonschema 2.21.2 pypi_0 pypi
fastrlock 0.8.3 pypi_0 pypi
fastuuid 0.12.0 pypi_0 pypi
filelock 3.19.1 pypi_0 pypi
flash-attn 2.7.4.post1 pypi_0 pypi
flashinfer-python 0.2.2.post1+cu124torch2.6 pypi_0 pypi
fqdn 1.5.1 pypi_0 pypi
frozendict 2.4.6 pypi_0 pypi
frozenlist 1.7.0 pypi_0 pypi
fsspec 2025.3.0 pypi_0 pypi
gguf 0.17.1 pypi_0 pypi
gitdb 4.0.12 pypi_0 pypi
gitpython 3.1.45 pypi_0 pypi
google-api-core 2.25.1 pypi_0 pypi
google-auth 2.40.3 pypi_0 pypi
googleapis-common-protos 1.70.0 pypi_0 pypi
grpcio 1.74.0 pypi_0 pypi
h11 0.16.0 pypi_0 pypi
hf-transfer 0.1.9 pypi_0 pypi
hf-xet 1.1.9 pypi_0 pypi
httpcore 1.0.9 pypi_0 pypi
httptools 0.6.4 pypi_0 pypi
httpx 0.28.1 pypi_0 pypi
huggingface-hub 0.34.4 pypi_0 pypi
hydra-core 1.3.2 pypi_0 pypi
identify 2.6.14 pypi_0 pypi
idna 3.10 pypi_0 pypi
importlib-metadata 8.0.0 pypi_0 pypi
iniconfig 2.1.0 pypi_0 pypi
interegular 0.3.3 pypi_0 pypi
ipdb 0.13.13 pypi_0 pypi
ipykernel 6.30.1 pypi_0 pypi
ipython 8.37.0 pypi_0 pypi
ipywidgets 8.1.7 pypi_0 pypi
isoduration 20.11.0 pypi_0 pypi
jedi 0.19.2 pypi_0 pypi
jinja2 3.1.6 pypi_0 pypi
jiter 0.10.0 pypi_0 pypi
json5 0.12.1 pypi_0 pypi
jsonpointer 3.0.0 pypi_0 pypi
jsonschema 4.25.1 pypi_0 pypi
jsonschema-specifications 2025.9.1 pypi_0 pypi
jupyter 1.1.1 pypi_0 pypi
jupyter-client 8.6.3 pypi_0 pypi
jupyter-console 6.6.3 pypi_0 pypi
jupyter-core 5.8.1 pypi_0 pypi
jupyter-events 0.12.0 pypi_0 pypi
jupyter-lsp 2.3.0 pypi_0 pypi
jupyter-server 2.17.0 pypi_0 pypi
jupyter-server-terminals 0.5.3 pypi_0 pypi
jupyterlab 4.4.7 pypi_0 pypi
jupyterlab-pygments 0.3.0 pypi_0 pypi
jupyterlab-server 2.27.3 pypi_0 pypi
jupyterlab-widgets 3.0.15 pypi_0 pypi
lark 1.2.2 pypi_0 pypi
ld_impl_linux-64 2.40 h12ee557_0
libffi 3.3 he6710b0_2
libgcc-ng 11.2.0 h1234567_1
libgomp 11.2.0 h1234567_1
libstdcxx-ng 11.2.0 h1234567_1
libuuid 1.41.5 h5eee18b_0
libxcb 1.17.0 h9b100fa_0
liger-kernel 0.6.2 pypi_0 pypi
litellm 1.76.3 pypi_0 pypi
llguidance 0.7.30 pypi_0 pypi
llvmlite 0.44.0 pypi_0 pypi
lm-format-enforcer 0.10.12 pypi_0 pypi
markdown 3.9 pypi_0 pypi
markdown-it-py 4.0.0 pypi_0 pypi
markupsafe 3.0.2 pypi_0 pypi
mathruler 0.1.0 pypi_0 pypi
matplotlib-inline 0.1.7 pypi_0 pypi
mdurl 0.1.2 pypi_0 pypi
megatron-core 0.12.2 pypi_0 pypi
mistral-common 1.8.4 pypi_0 pypi
mistune 3.1.4 pypi_0 pypi
modelscope 1.29.2 pypi_0 pypi
mpmath 1.3.0 pypi_0 pypi
msgpack 1.1.1 pypi_0 pypi
msgspec 0.19.0 pypi_0 pypi
multidict 6.6.4 pypi_0 pypi
multiprocess 0.70.16 pypi_0 pypi
nanobind 2.9.2 pypi_0 pypi
nbclient 0.10.2 pypi_0 pypi
nbconvert 7.16.6 pypi_0 pypi
nbformat 5.10.4 pypi_0 pypi
ncurses 6.5 h7934f7d_0
nest-asyncio 1.6.0 pypi_0 pypi
networkx 3.4.2 pypi_0 pypi
ninja 1.13.0 pypi_0 pypi
nodeenv 1.9.1 pypi_0 pypi
notebook 7.4.5 pypi_0 pypi
notebook-shim 0.2.4 pypi_0 pypi
numba 0.61.2 pypi_0 pypi
numpy 2.2.6 pypi_0 pypi
nvidia-cublas-cu12 12.4.5.8 pypi_0 pypi
nvidia-cuda-cupti-cu12 12.4.127 pypi_0 pypi
nvidia-cuda-nvrtc-cu12 12.4.127 pypi_0 pypi
nvidia-cuda-runtime-cu12 12.4.127 pypi_0 pypi
nvidia-cudnn-cu12 9.8.0.87 pypi_0 pypi
nvidia-cufft-cu12 11.2.1.3 pypi_0 pypi
nvidia-curand-cu12 10.3.5.147 pypi_0 pypi
nvidia-cusolver-cu12 11.6.1.9 pypi_0 pypi
nvidia-cusparse-cu12 12.3.1.170 pypi_0 pypi
nvidia-cusparselt-cu12 0.6.2 pypi_0 pypi
nvidia-ml-py 13.580.65 pypi_0 pypi
nvidia-nccl-cu12 2.21.5 pypi_0 pypi
nvidia-nvjitlink-cu12 12.4.127 pypi_0 pypi
nvidia-nvtx-cu12 12.4.127 pypi_0 pypi
nvitop 1.5.3 pypi_0 pypi
omegaconf 2.3.0 pypi_0 pypi
openai 1.107.0 pypi_0 pypi
opencensus 0.11.4 pypi_0 pypi
opencensus-context 0.1.3 pypi_0 pypi
opencv-fixer 0.2.5 pypi_0 pypi
opencv-python 4.12.0.88 pypi_0 pypi
opencv-python-headless 4.12.0.88 pypi_0 pypi
openssl 1.1.1w h7f8727e_0
opentelemetry-api 1.36.0 pypi_0 pypi
opentelemetry-exporter-otlp 1.26.0 pypi_0 pypi
opentelemetry-exporter-otlp-proto-common 1.26.0 pypi_0 pypi
opentelemetry-exporter-otlp-proto-grpc 1.26.0 pypi_0 pypi
opentelemetry-exporter-otlp-proto-http 1.26.0 pypi_0 pypi
opentelemetry-exporter-prometheus 0.57b0 pypi_0 pypi
opentelemetry-proto 1.26.0 pypi_0 pypi
opentelemetry-sdk 1.36.0 pypi_0 pypi
opentelemetry-semantic-conventions 0.57b0 pypi_0 pypi
opentelemetry-semantic-conventions-ai 0.4.13 pypi_0 pypi
optree 0.17.0 pypi_0 pypi
orjson 3.11.3 pypi_0 pypi
outlines 0.1.11 pypi_0 pypi
outlines-core 0.1.26 pypi_0 pypi
overrides 7.7.0 pypi_0 pypi
packaging 25.0 pypi_0 pypi
pandas 2.3.2 pypi_0 pypi
pandocfilters 1.5.1 pypi_0 pypi
parso 0.8.5 pypi_0 pypi
partial-json-parser 0.2.1.1.post6 pypi_0 pypi
peft 0.17.1 pypi_0 pypi
pexpect 4.9.0 pypi_0 pypi
pillow 11.3.0 pypi_0 pypi
pip 25.2 pyhc872135_0
platformdirs 4.4.0 pypi_0 pypi
pluggy 1.6.0 pypi_0 pypi
pre-commit 4.3.0 pypi_0 pypi
prometheus-client 0.22.1 pypi_0 pypi
prometheus-fastapi-instrumentator 7.1.0 pypi_0 pypi
prompt-toolkit 3.0.52 pypi_0 pypi
propcache 0.3.2 pypi_0 pypi
proto-plus 1.26.1 pypi_0 pypi
protobuf 4.25.8 pypi_0 pypi
psutil 7.0.0 pypi_0 pypi
pthread-stubs 0.3 h0ce48e5_1
ptyprocess 0.7.0 pypi_0 pypi
pure-eval 0.2.3 pypi_0 pypi
py-cpuinfo 9.0.0 pypi_0 pypi
py-spy 0.4.1 pypi_0 pypi
pyarrow 21.0.0 pypi_0 pypi
pyasn1 0.6.1 pypi_0 pypi
pyasn1-modules 0.4.2 pypi_0 pypi
pybind11 3.0.1 pypi_0 pypi
pycountry 24.6.1 pypi_0 pypi
pycparser 2.22 pypi_0 pypi
pydantic 2.11.7 pypi_0 pypi
pydantic-core 2.33.2 pypi_0 pypi
pydantic-extra-types 2.10.5 pypi_0 pypi
pyext 0.7 pypi_0 pypi
pygments 2.19.2 pypi_0 pypi
pylatexenc 2.10 pypi_0 pypi
pynvml 13.0.1 pypi_0 pypi
pytest 8.4.2 pypi_0 pypi
python 3.10.0 h12debd9_5
python-dateutil 2.9.0.post0 pypi_0 pypi
python-dotenv 1.1.1 pypi_0 pypi
python-json-logger 3.3.0 pypi_0 pypi
python-multipart 0.0.20 pypi_0 pypi
pytz 2025.2 pypi_0 pypi
pyyaml 6.0.2 pypi_0 pypi
pyzmq 27.1.0 pypi_0 pypi
qwen-vl-utils 0.0.11 pypi_0 pypi
ray 2.49.1 pypi_0 pypi
readline 8.3 hc2a1206_0
referencing 0.36.2 pypi_0 pypi
regex 2025.9.1 pypi_0 pypi
requests 2.32.5 pypi_0 pypi
rfc3339-validator 0.1.4 pypi_0 pypi
rfc3986-validator 0.1.1 pypi_0 pypi
rfc3987-syntax 1.1.0 pypi_0 pypi
rich 14.1.0 pypi_0 pypi
rich-toolkit 0.15.1 pypi_0 pypi
rignore 0.6.4 pypi_0 pypi
rpds-py 0.27.1 pypi_0 pypi
rsa 4.9.1 pypi_0 pypi
ruff 0.12.12 pypi_0 pypi
safetensors 0.6.2 pypi_0 pypi
scipy 1.15.3 pypi_0 pypi
send2trash 1.8.3 pypi_0 pypi
sentencepiece 0.2.1 pypi_0 pypi
sentry-sdk 2.37.0 pypi_0 pypi
setproctitle 1.3.7 pypi_0 pypi
setuptools 78.1.1 py310h06a4308_0
sgl-kernel 0.1.0 pypi_0 pypi
sglang 0.4.6.post1 pypi_0 pypi
shellingham 1.5.4 pypi_0 pypi
six 1.17.0 pypi_0 pypi
smart-open 7.3.1 pypi_0 pypi
smmap 5.0.2 pypi_0 pypi
sniffio 1.3.1 pypi_0 pypi
soundfile 0.13.1 pypi_0 pypi
soupsieve 2.8 pypi_0 pypi
sqlite 3.50.2 hb25bd0a_1
stack-data 0.6.3 pypi_0 pypi
starlette 0.47.3 pypi_0 pypi
sympy 1.13.1 pypi_0 pypi
tensorboard 2.20.0 pypi_0 pypi
tensorboard-data-server 0.7.2 pypi_0 pypi
tensordict 0.6.2 pypi_0 pypi
terminado 0.18.1 pypi_0 pypi
tiktoken 0.11.0 pypi_0 pypi
tinycss2 1.4.0 pypi_0 pypi
tk 8.6.15 h54e0aa7_0
tokenizers 0.21.4 pypi_0 pypi
tomli 2.2.1 pypi_0 pypi
torch 2.6.0 pypi_0 pypi
torch-memory-saver 0.0.8 pypi_0 pypi
torchao 0.13.0 pypi_0 pypi
torchaudio 2.6.0 pypi_0 pypi
torchdata 0.11.0 pypi_0 pypi
torchvision 0.21.0 pypi_0 pypi
tornado 6.5.2 pypi_0 pypi
tqdm 4.67.1 pypi_0 pypi
traitlets 5.14.3 pypi_0 pypi
transformers 4.51.1 pypi_0 pypi
triton 3.2.0 pypi_0 pypi
typer 0.17.4 pypi_0 pypi
types-python-dateutil 2.9.0.20250822 pypi_0 pypi
typing-extensions 4.15.0 pypi_0 pypi
typing-inspection 0.4.1 pypi_0 pypi
tzdata 2025.2 pypi_0 pypi
uri-template 1.3.0 pypi_0 pypi
urllib3 2.5.0 pypi_0 pypi
uvicorn 0.35.0 pypi_0 pypi
uvloop 0.21.0 pypi_0 pypi
verl 0.5.0.dev0 pypi_0 pypi
virtualenv 20.34.0 pypi_0 pypi
vllm 0.8.5.post1 pypi_0 pypi
wandb 0.21.3 pypi_0 pypi
watchfiles 1.1.0 pypi_0 pypi
wcwidth 0.2.13 pypi_0 pypi
webcolors 24.11.1 pypi_0 pypi
webencodings 0.5.1 pypi_0 pypi
websocket-client 1.8.0 pypi_0 pypi
websockets 15.0.1 pypi_0 pypi
werkzeug 3.1.3 pypi_0 pypi
wheel 0.45.1 py310h06a4308_0
widgetsnbextension 4.0.14 pypi_0 pypi
wrapt 1.17.3 pypi_0 pypi
xformers 0.0.29.post2 pypi_0 pypi
xgrammar 0.1.18 pypi_0 pypi
xorg-libx11 1.8.12 h9b100fa_1
xorg-libxau 1.0.12 h9b100fa_0
xorg-libxdmcp 1.1.5 h9b100fa_0
xorg-xorgproto 2024.1 h5eee18b_1
xxhash 3.5.0 pypi_0 pypi
xz 5.6.4 h5eee18b_1
yarl 1.20.1 pypi_0 pypi
zipp 3.23.0 pypi_0 pypi
zlib 1.2.13 h5eee18b_1
Information
- The official example scripts
- My own modified scripts
Tasks
- An officially supported task in the `examples` folder (such as GLUE/SQuAD, ...)
- My own task or dataset (give details below)
Reproduction
Training Progress: 3%|▎ | 7/233 [11:56<6:25:43, 102.41s/it]
(WorkerDict pid=2862157) [rank3]:[E923 11:14:11.615370309 ProcessGroupNCCL.cpp:1895] [PG ID 0 PG GUID 0(default_pg) Rank 3] Process group watchdog thread terminated with exception: CUDA error: misaligned address
(WorkerDict pid=2862157) CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect.
(WorkerDict pid=2862157) For debugging consider passing CUDA_LAUNCH_BLOCKING=1
(WorkerDict pid=2862157) Compile with TORCH_USE_CUDA_DSA to enable device-side assertions.
(WorkerDict pid=2862157)
(WorkerDict pid=2862157) Exception raised from c10_cuda_check_implementation at /pytorch/c10/cuda/CUDAException.cpp:43 (most recent call first):
(WorkerDict pid=2862157) frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x96 (0x75e24436c1b6 in /home/user/miniconda3/envs/verl/lib/python3.10/site-packages/torch/lib/libc10.so)
(WorkerDict pid=2862157) frame #1: c10::detail::torchCheckFail(char const*, char const*, unsigned int, std::string const&) + 0x64 (0x75e244315a76 in /home/user/miniconda3/envs/verl/lib/python3.10/site-packages/torch/lib/libc10.so)
(WorkerDict pid=2862157) frame #2: c10::cuda::c10_cuda_check_implementation(int, char const*, char const*, int, bool) + 0x118 (0x75e24469c918 in /home/user/miniconda3/envs/verl/lib/python3.10/site-packages/torch/lib/libc10_cuda.so)
(WorkerDict pid=2862157) frame #3: c10d::ProcessGroupNCCL::WorkNCCL::finishedGPUExecutionInternal() const + 0x56 (0x75cf45bf1556 in /home/user/miniconda3/envs/verl/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so)
(WorkerDict pid=2862157) frame #4: c10d::ProcessGroupNCCL::WorkNCCL::isCompleted() + 0xa0 (0x75cf45bfe8c0 in /home/user/miniconda3/envs/verl/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so)
(WorkerDict pid=2862157) frame #5: c10d::ProcessGroupNCCL::watchdogHandler() + 0x617 (0x75cf45c00557 in /home/user/miniconda3/envs/verl/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so)
(WorkerDict pid=2862157) frame #6: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x14d (0x75cf45c016ed in /home/user/miniconda3/envs/verl/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so)
(WorkerDict pid=2862157) frame #7: + 0xdbbf4 (0x75e2710dbbf4 in /home/user/miniconda3/envs/verl/bin/../lib/libstdc++.so.6)
(WorkerDict pid=2862157) frame #8: + 0x94ac3 (0x75e273c94ac3 in /lib/x86_64-linux-gnu/libc.so.6)
(WorkerDict pid=2862157) frame #9: + 0x126850 (0x75e273d26850 in /lib/x86_64-linux-gnu/libc.so.6)
(WorkerDict pid=2862157)
(WorkerDict pid=2862157) [2025-09-23 11:14:11,483 E 2862157 2863292] logging.cc:118: Unhandled exception: N3c1016DistBackendErrorE. what(): [PG ID 0 PG GUID 0(default_pg) Rank 3] Process group watchdog thread terminated with exception: CUDA error: misaligned address
(WorkerDict pid=2862157) CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect.
(WorkerDict pid=2862157) For debugging consider passing CUDA_LAUNCH_BLOCKING=1
(WorkerDict pid=2862157) Compile with TORCH_USE_CUDA_DSA to enable device-side assertions.
(WorkerDict pid=2862157)
(WorkerDict pid=2862157) Exception raised from c10_cuda_check_implementation at /pytorch/c10/cuda/CUDAException.cpp:43 (most recent call first):
(WorkerDict pid=2862157) frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x96 (0x75e24436c1b6 in /home/user/miniconda3/envs/verl/lib/python3.10/site-packages/torch/lib/libc10.so)
(WorkerDict pid=2862157) frame #1: c10::detail::torchCheckFail(char const*, char const*, unsigned int, std::string const&) + 0x64 (0x75e244315a76 in /home/user/miniconda3/envs/verl/lib/python3.10/site-packages/torch/lib/libc10.so)
(WorkerDict pid=2862157) frame #2: c10::cuda::c10_cuda_check_implementation(int, char const*, char const*, int, bool) + 0x118 (0x75e24469c918 in /home/user/miniconda3/envs/verl/lib/python3.10/site-packages/torch/lib/libc10_cuda.so)
(WorkerDict pid=2862157) frame #3: c10d::ProcessGroupNCCL::WorkNCCL::finishedGPUExecutionInternal() const + 0x56 (0x75cf45bf1556 in /home/user/miniconda3/envs/verl/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so)
(WorkerDict pid=2862157) frame #4: c10d::ProcessGroupNCCL::WorkNCCL::isCompleted() + 0xa0 (0x75cf45bfe8c0 in /home/user/miniconda3/envs/verl/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so)
(WorkerDict pid=2862157) frame #5: c10d::ProcessGroupNCCL::watchdogHandler() + 0x617 (0x75cf45c00557 in /home/user/miniconda3/envs/verl/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so)
(WorkerDict pid=2862157) frame #6: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x14d (0x75cf45c016ed in /home/user/miniconda3/envs/verl/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so)
(WorkerDict pid=2862157) frame #7: + 0xdbbf4 (0x75e2710dbbf4 in /home/user/miniconda3/envs/verl/bin/../lib/libstdc++.so.6)
(WorkerDict pid=2862157) frame #8: + 0x94ac3 (0x75e273c94ac3 in /lib/x86_64-linux-gnu/libc.so.6)
(WorkerDict pid=2862157) frame #9: + 0x126850 (0x75e273d26850 in /lib/x86_64-linux-gnu/libc.so.6)
(WorkerDict pid=2862157)
(WorkerDict pid=2862157) Exception raised from ncclCommWatchdog at /pytorch/torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:1901 (most recent call first):
(WorkerDict pid=2862157) frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x96 (0x75e24436c1b6 in /home/user/miniconda3/envs/verl/lib/python3.10/site-packages/torch/lib/libc10.so)
(WorkerDict pid=2862157) frame #1: + 0xe5c6fc (0x75cf4585c6fc in /home/user/miniconda3/envs/verl/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so)
(WorkerDict pid=2862157) frame #2: + 0xdbbf4 (0x75e2710dbbf4 in /home/user/miniconda3/envs/verl/bin/../lib/libstdc++.so.6)
(WorkerDict pid=2862157) frame #3: + 0x94ac3 (0x75e273c94ac3 in /lib/x86_64-linux-gnu/libc.so.6)
(WorkerDict pid=2862157) frame #4: + 0x126850 (0x75e273d26850 in /lib/x86_64-linux-gnu/libc.so.6)
(WorkerDict pid=2862157)
(WorkerDict pid=2862157) [2025-09-23 11:14:11,519 E 2862157 2863292] logging.cc:125: Stack trace:
(WorkerDict pid=2862157) /home/user/miniconda3/envs/verl/lib/python3.10/site-packages/ray/_raylet.so(+0x150e11a) [0x75e27290e11a] ray::operator<<()
(WorkerDict pid=2862157) /home/user/miniconda3/envs/verl/lib/python3.10/site-packages/ray/_raylet.so(+0x1511022) [0x75e272911022] ray::TerminateHandler()
(WorkerDict pid=2862157) /home/user/miniconda3/envs/verl/bin/../lib/libstdc++.so.6(+0xb135a) [0x75e2710b135a] __cxxabiv1::__terminate()
(WorkerDict pid=2862157) /home/user/miniconda3/envs/verl/bin/../lib/libstdc++.so.6(+0xb13c5) [0x75e2710b13c5]
(WorkerDict pid=2862157) /home/user/miniconda3/envs/verl/bin/../lib/libstdc++.so.6(+0xb134f) [0x75e2710b134f]
(WorkerDict pid=2862157) /home/user/miniconda3/envs/verl/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so(+0xe5c7aa) [0x75cf4585c7aa] c10d::ProcessGroupNCCL::ncclCommWatchdog()
(WorkerDict pid=2862157) /home/user/miniconda3/envs/verl/bin/../lib/libstdc++.so.6(+0xdbbf4) [0x75e2710dbbf4] execute_native_thread_routine
(WorkerDict pid=2862157) /lib/x86_64-linux-gnu/libc.so.6(+0x94ac3) [0x75e273c94ac3]
(WorkerDict pid=2862157) /lib/x86_64-linux-gnu/libc.so.6(+0x126850) [0x75e273d26850]
(WorkerDict pid=2862157)
(WorkerDict pid=2862157) *** SIGABRT received at time=1758597251 on cpu 124 ***
(WorkerDict pid=2862157) PC: @ 0x75e273c969fc (unknown) pthread_kill
(WorkerDict pid=2862157) @ 0x75e273c42520 (unknown) (unknown)
(WorkerDict pid=2862157) [2025-09-23 11:14:11,519 E 2862157 2863292] logging.cc:474: *** SIGABRT received at time=1758597251 on cpu 124 ***
(WorkerDict pid=2862157) [2025-09-23 11:14:11,519 E 2862157 2863292] logging.cc:474: PC: @ 0x75e273c969fc (unknown) pthread_kill
(WorkerDict pid=2862157) [2025-09-23 11:14:11,519 E 2862157 2863292] logging.cc:474: @ 0x75e273c42520 (unknown) (unknown)
(WorkerDict pid=2862157) Fatal Python error: Aborted
Traceback (most recent call last):
File "/home/user/wjy/multi-agent-rl/verl/verl/verl/trainer/main_ppo.py", line 42, in main
run_ppo(config)
File "/home/user/wjy/multi-agent-rl/verl/verl/verl/trainer/main_ppo.py", line 85, in run_ppo
ray.get(runner.run.remote(config))
File "/home/user/miniconda3/envs/verl/lib/python3.10/site-packages/ray/_private/auto_init_hook.py", line 22, in auto_init_wrapper
return fn(*args, **kwargs)
File "/home/user/miniconda3/envs/verl/lib/python3.10/site-packages/ray/_private/client_mode_hook.py", line 104, in wrapper
return func(*args, **kwargs)
File "/home/user/miniconda3/envs/verl/lib/python3.10/site-packages/ray/_private/worker.py", line 2882, in get
values, debugger_breakpoint = worker.get_objects(object_refs, timeout=timeout)
File "/home/user/miniconda3/envs/verl/lib/python3.10/site-packages/ray/_private/worker.py", line 968, in get_objects
raise value.as_instanceof_cause()
ray.exceptions.RayTaskError(RuntimeError): ray::TaskRunner.run() (pid=2854646, ip=10.254.30.2, actor_id=9a301de0224cf11a0ab44e5101000000, repr=<main_ppo.TaskRunner object at 0x78ef43dcaad0>)
File "/home/user/wjy/multi-agent-rl/verl/verl/verl/trainer/main_ppo.py", line 316, in run
trainer.fit()
File "/home/user/wjy/multi-agent-rl/verl/verl/verl/trainer/ppo/ray_trainer.py", line 1066, in fit
ref_log_prob = self.actor_rollout_wg.compute_ref_log_prob(batch)
File "/home/user/wjy/multi-agent-rl/verl/verl/verl/single_controller/ray/base.py", line 48, in call
output = ray.get(output)
ray.exceptions.RayTaskError(RuntimeError): ray::WorkerDict.actor_rollout_compute_ref_log_prob() (pid=2862157, ip=10.254.30.2, actor_id=ff149bfe6823f581370adc9a01000000, repr=<verl.single_controller.ray.base.WorkerDict object at 0x75ce89da8040>)
File "/home/user/wjy/multi-agent-rl/verl/verl/verl/single_controller/ray/base.py", line 701, in func
return getattr(self.worker_dict[key], name)(*args, **kwargs)
File "/home/user/wjy/multi-agent-rl/verl/verl/verl/single_controller/base/decorator.py", line 430, in inner
return func(*args, **kwargs)
File "/home/user/wjy/multi-agent-rl/verl/verl/verl/utils/profiler/profile.py", line 256, in wrapper
return func(self_instance, *args, **kwargs_inner)
File "/home/user/wjy/multi-agent-rl/verl/verl/verl/workers/fsdp_workers.py", line 921, in compute_ref_log_prob
data = self.compute_log_prob(data)
File "/home/user/wjy/multi-agent-rl/verl/verl/verl/single_controller/base/decorator.py", line 430, in inner
return func(*args, **kwargs)
File "/home/user/wjy/multi-agent-rl/verl/verl/verl/utils/profiler/profile.py", line 256, in wrapper
return func(self_instance, *args, **kwargs_inner)
File "/home/user/wjy/multi-agent-rl/verl/verl/verl/workers/fsdp_workers.py", line 896, in compute_log_prob
output, entropys = self.actor.compute_log_prob(data=data, calculate_entropy=True)
File "/home/user/wjy/multi-agent-rl/verl/verl/verl/utils/profiler/performance.py", line 105, in f
return self.log(decorated_function, *args, **kwargs)
File "/home/user/wjy/multi-agent-rl/verl/verl/verl/utils/profiler/performance.py", line 118, in log
output = func(*args, **kwargs)
File "/home/user/wjy/multi-agent-rl/verl/verl/verl/workers/actor/dp_actor.py", line 343, in compute_log_prob
entropy, log_probs = self._forward_micro_batch(
File "/home/user/wjy/multi-agent-rl/verl/verl/verl/workers/actor/dp_actor.py", line 177, in _forward_micro_batch
output = self.actor_module(
File "/home/user/miniconda3/envs/verl/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1739, in _wrapped_call_impl
return self._call_impl(*args, **kwargs)
File "/home/user/miniconda3/envs/verl/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1750, in _call_impl
return forward_call(*args, **kwargs)
File "/home/user/miniconda3/envs/verl/lib/python3.10/site-packages/torch/distributed/fsdp/fully_sharded_data_parallel.py", line 864, in forward
output = self._fsdp_wrapped_module(*args, **kwargs)
File "/home/user/miniconda3/envs/verl/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1739, in _wrapped_call_impl
return self._call_impl(*args, **kwargs)
File "/home/user/miniconda3/envs/verl/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1750, in _call_impl
return forward_call(*args, **kwargs)
File "/home/user/miniconda3/envs/verl/lib/python3.10/site-packages/peft/peft_model.py", line 1850, in forward
return self.base_model(
File "/home/user/miniconda3/envs/verl/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1739, in _wrapped_call_impl
return self._call_impl(*args, **kwargs)
File "/home/user/miniconda3/envs/verl/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1750, in _call_impl
return forward_call(*args, **kwargs)
File "/home/user/miniconda3/envs/verl/lib/python3.10/site-packages/peft/tuners/tuners_utils.py", line 222, in forward
return self.model.forward(*args, **kwargs)
File "/home/user/miniconda3/envs/verl/lib/python3.10/site-packages/transformers/utils/generic.py", line 965, in wrapper
output = func(self, *args, **kwargs)
File "/home/user/miniconda3/envs/verl/lib/python3.10/site-packages/transformers/utils/deprecation.py", line 172, in wrapped_func
return func(*args, **kwargs)
File "/home/user/miniconda3/envs/verl/lib/python3.10/site-packages/transformers/models/qwen2/modeling_qwen2.py", line 823, in forward
outputs: BaseModelOutputWithPast = self.model(
File "/home/user/miniconda3/envs/verl/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1739, in _wrapped_call_impl
return self._call_impl(*args, **kwargs)
File "/home/user/miniconda3/envs/verl/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1750, in _call_impl
return forward_call(*args, **kwargs)
File "/home/user/miniconda3/envs/verl/lib/python3.10/site-packages/transformers/utils/generic.py", line 965, in wrapper
output = func(self, *args, **kwargs)
File "/home/user/miniconda3/envs/verl/lib/python3.10/site-packages/transformers/models/qwen2/modeling_qwen2.py", line 549, in forward
layer_outputs = decoder_layer(
File "/home/user/miniconda3/envs/verl/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1739, in _wrapped_call_impl
return self._call_impl(*args, **kwargs)
File "/home/user/miniconda3/envs/verl/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1750, in _call_impl
return forward_call(*args, **kwargs)
File "/home/user/miniconda3/envs/verl/lib/python3.10/site-packages/torch/distributed/fsdp/fully_sharded_data_parallel.py", line 864, in forward
output = self._fsdp_wrapped_module(*args, **kwargs)
File "/home/user/miniconda3/envs/verl/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1739, in _wrapped_call_impl
return self._call_impl(*args, **kwargs)
File "/home/user/miniconda3/envs/verl/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1750, in _call_impl
return forward_call(*args, **kwargs)
File "/home/user/miniconda3/envs/verl/lib/python3.10/site-packages/transformers/models/qwen2/modeling_qwen2.py", line 262, in forward
hidden_states, self_attn_weights = self.self_attn(
File "/home/user/miniconda3/envs/verl/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1739, in _wrapped_call_impl
return self._call_impl(*args, **kwargs)
File "/home/user/miniconda3/envs/verl/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1750, in _call_impl
return forward_call(*args, **kwargs)
File "/home/user/miniconda3/envs/verl/lib/python3.10/site-packages/transformers/models/qwen2/modeling_qwen2.py", line 194, in forward
attn_output, attn_weights = attention_interface(
File "/home/user/miniconda3/envs/verl/lib/python3.10/site-packages/transformers/integrations/flash_attention.py", line 49, in flash_attention_forward
attn_output = _flash_attention_forward(
File "/home/user/wjy/multi-agent-rl/verl/verl/verl/models/transformers/monkey_patch.py", line 101, in _ulysses_flash_attention_forward
attn_output = _flash_attention_forward(
File "/home/user/miniconda3/envs/verl/lib/python3.10/site-packages/transformers/modeling_flash_attention_utils.py", line 377, in _flash_attention_forward
elif position_ids is not None and (
RuntimeError: CUDA error: misaligned address
CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect.
For debugging consider passing CUDA_LAUNCH_BLOCKING=1
Compile with TORCH_USE_CUDA_DSA to enable device-side assertions.
While running, it sometimes also prints a warning: WARN: rank 0 grad_norm is not finite: inf
WARN: rank 3 grad_norm is not finite: inf
Expected behavior
Regardless of whether I use verl4.1 or verl5, this issue occurs. I have tried setting all offload parameters to true, but the problem still persists. I am using four 3090 GPUs, and this issue has been troubling me for two weeks😭. Could someone please help solve this? Is this a problem with the graphics cards?