Commit 9617d09

Merge branch 'main' into ds_nokl
2 parents: 8346efb + f50e5c2

File tree: 189 files changed (+8257 −2401 lines)


.github/CODEOWNERS

Lines changed: 8 additions & 1 deletion
@@ -1,22 +1,29 @@
 /docs @eric-haibin-lin @zhaochenyang20 @hongpeng-guo
 /docs/amd_tutorial @yushengsu-thu
 /docs/slang_multiturn @zhaochenyang20 @SwordFaith
+/docs/ascend_tutorial @FightingZhen
 
 /recipe/dapo @tongyx361 @PeterSH6 @vermouth1992 @tardis-key @FightingZhen @ji-huazhong
 /recipe/spin @zhaochenyang20
 /recipe/sppo @zhaochenyang20
 
 /third_party/sglang @zhaochenyang20 @SwordFaith
 /third_party/vllm @PeterSH6 @wuxibin89
+
 /examples/grpo_trainer @vermouth1992 @PeterSH6 @tardis-key @FightingZhen @ji-huazhong
+
 /verl/single_controller @zw0610 @wuxibin89 @hongpeng-guo
 /verl/trainer @eric-haibin-lin @vermouth1992 @tongyx361 @PeterSH6
+/verl/models/mcore @ISEEKYAN @vermouth1992
+/verl/models/transformers @vermouth1992 @PeterSH6 @tardis-key @FightingZhen @ji-huazhong
 /verl/workers/engine @eric-haibin-lin @vermouth1992 @ZihengJiang
 /verl/workers/roles @eric-haibin-lin @vermouth1992 @ZihengJiang
 /verl/workers/engine/fsdp @eric-haibin-lin @vermouth1992 @ZihengJiang
 /verl/workers/rollout/vllm_rollout @wuxibin89 @PeterSH6 @chenhaiq
 /verl/workers/rollout/sglang_rollout @zhaochenyang20 @SwordFaith @chenhaiq
-/verl/models/transformers @vermouth1992 @PeterSH6 @tardis-key @FightingZhen @ji-huazhong
+/verl/workers/actor/megatron_actor.py @ISEEKYAN @vermouth1992
+/verl/workers/critic/megatron_critic.py @ISEEKYAN @vermouth1992
+/verl/workers/megatron_workers.py @ISEEKYAN @vermouth1992
 
 /tests/single_controller @zw0610 @wuxibin89
 /tests/trainer @eric-haibin-lin @vermouth1992 @tongyx361 @PeterSH6

.github/workflows/e2e_eval_aime24.yml renamed to .github/workflows/.deprecate/e2e_eval_aime24.yml

Lines changed: 1 addition & 1 deletion
@@ -124,7 +124,7 @@ jobs:
       - name: Install the current repository
         run: |
           pip3 install --no-deps -e .[test,gpu,math]
-          pip3 install math-verify
+          pip3 install math-verify transformers==4.56.2
       - name: Prepare aime24 dataset
         run: |
           ray stop --force

.github/workflows/.deprecate/e2e_ppo_trainer.yml

Lines changed: 2 additions & 2 deletions
@@ -77,7 +77,7 @@ jobs:
       HF_ENDPOINT: "https://hf-mirror.com"
       HF_HUB_ENABLE_HF_TRANSFER: "0" # This is more stable
     container:
-      image: verlai/verl:app-verl0.5-transformers4.55.4-sglang0.4.10.post2-mcore0.13.0-te2.2
+      image: verlai/verl:app-verl0.6-transformers4.56.1-sglang0.5.2-mcore0.13.0-te2.2
       options: --gpus all --shm-size=10g
     steps:
       - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
@@ -110,7 +110,7 @@ jobs:
       HF_ENDPOINT: "https://hf-mirror.com"
       HF_HUB_ENABLE_HF_TRANSFER: "0" # This is more stable
     container:
-      image: verlai/verl:app-verl0.5-transformers4.55.4-sglang0.4.10.post2-mcore0.13.0-te2.2
+      image: verlai/verl:app-verl0.6-transformers4.56.1-sglang0.5.2-mcore0.13.0-te2.2
       options: --gpus all --shm-size=10g
     steps:
       - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2

.github/workflows/.deprecate/e2e_ppo_trainer_megatron_sglang.yml

Lines changed: 1 addition & 1 deletion
@@ -75,7 +75,7 @@ permissions:
   contents: read
 
 env:
-  IMAGE: "verl-ci-cn-beijing.cr.volces.com/verlai/verl:app-verl0.5-transformers4.55.4-sglang0.4.10.post2-mcore0.13.0-te2.2"
+  IMAGE: "verl-ci-cn-beijing.cr.volces.com/verlai/verl:app-verl0.6-transformers4.56.1-sglang0.5.2-mcore0.13.0-te2.2"
   DYNAMIC_RUNNER_ENDPOINT: "https://sd10g3clalm04ug7alq90.apigateway-cn-beijing.volceapi.com/runner"
 
 jobs:

.github/workflows/e2e_spin.yml renamed to .github/workflows/.deprecate/e2e_spin.yml

Lines changed: 38 additions & 5 deletions
@@ -52,24 +52,41 @@ on:
 permissions:
   contents: read
 
+env:
+  IMAGE: "verl-ci-cn-beijing.cr.volces.com/verlai/verl:app-verl0.6-transformers4.56.1-sglang0.5.2-mcore0.13.0-te2.2"
+  DYNAMIC_RUNNER_ENDPOINT: "https://sd10g3clalm04ug7alq90.apigateway-cn-beijing.volceapi.com/runner"
+
 # Cancel jobs on the same ref if a new one is triggered
 concurrency:
   group: ${{ github.workflow }}-${{ github.ref }}
   cancel-in-progress: ${{ github.ref != 'refs/heads/main' }}
 
 jobs:
+  setup:
+    if: github.repository_owner == 'volcengine'
+    runs-on: ubuntu-latest
+    outputs:
+      runner-label: ${{ steps.create-runner.outputs.runner-label }}
+      mlp-task-id: ${{ steps.create-runner.outputs.mlp-task-id }}
+    steps:
+      - uses: actions/checkout@v4
+      - id: create-runner
+        uses: volcengine/vemlp-github-runner@v1
+        with:
+          mode: "create"
+          faas-url: "${{ env.DYNAMIC_RUNNER_ENDPOINT }}"
+          mlp-image: "${{ env.IMAGE }}"
+
   e2e_spin:
-    runs-on: [L20x8]
+    needs: setup
+    runs-on: [ "${{ needs.setup.outputs.runner-label || 'L20x8' }}" ]
     timeout-minutes: 40 # Increase this timeout value as needed
     env:
       HTTP_PROXY: ${{ secrets.PROXY_HTTP }}
       HTTPS_PROXY: ${{ secrets.PROXY_HTTPS }}
       NO_PROXY: "localhost,127.0.0.1,hf-mirror.com"
       HF_ENDPOINT: "https://hf-mirror.com"
       HF_HUB_ENABLE_HF_TRANSFER: "0" # This is more stable
-    container:
-      image: verlai/verl:app-verl0.5-transformers4.55.4-sglang0.4.10.post2-mcore0.13.0-te2.2
-      options: --gpus all --shm-size=10g
     steps:
       - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
         with:
@@ -79,8 +96,24 @@ jobs:
           pip3 install -e .[test,gpu,sglang]
       - name: Prepare GSM8K dataset
         run: |
-          python3 examples/data_preprocess/gsm8k.py
+          python3 examples/data_preprocess/gsm8k.py --local_dataset_path ${HOME}/models/hf_data/gsm8k
       - name: Running the E2E test with the spin algorithm
         run: |
           ray stop --force
           bash tests/special_e2e/run_spin.sh
+
+  cleanup:
+    runs-on: ubuntu-latest
+    needs:
+      [
+        setup,
+        e2e_spin
+      ]
+    if: always()
+    steps:
+      - id: destroy-runner
+        uses: volcengine/vemlp-github-runner@v1
+        with:
+          mode: "destroy"
+          faas-url: "${{ env.DYNAMIC_RUNNER_ENDPOINT }}"
+          mlp-task-id: "${{ needs.setup.outputs.mlp-task-id }}"
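The spin workflow above (and the sppo workflow below) now follows the dynamic-runner lifecycle described in .github/workflows/README.md: a setup job creates a short-lived runner, the GPU job runs on it with the static L20x8 label as a fallback, and a cleanup job destroys it afterwards. A minimal sketch of that lifecycle, assuming a placeholder test job (the job name `test` and its echo step are illustrative; the env values mirror the diff above):

```yaml
# Sketch of the create -> use -> destroy runner lifecycle adopted by these workflows.
# The `test` job is a placeholder; IMAGE and DYNAMIC_RUNNER_ENDPOINT mirror the env block above.
env:
  IMAGE: "verl-ci-cn-beijing.cr.volces.com/verlai/verl:app-verl0.6-transformers4.56.1-sglang0.5.2-mcore0.13.0-te2.2"
  DYNAMIC_RUNNER_ENDPOINT: "https://sd10g3clalm04ug7alq90.apigateway-cn-beijing.volceapi.com/runner"

jobs:
  setup:
    runs-on: ubuntu-latest
    outputs:
      runner-label: ${{ steps.create-runner.outputs.runner-label }}
      mlp-task-id: ${{ steps.create-runner.outputs.mlp-task-id }}
    steps:
      - id: create-runner            # create a dynamic runner backed by the CI image
        uses: volcengine/vemlp-github-runner@v1
        with:
          mode: "create"
          faas-url: "${{ env.DYNAMIC_RUNNER_ENDPOINT }}"
          mlp-image: "${{ env.IMAGE }}"

  test:                              # placeholder for the actual e2e job
    needs: setup
    runs-on: [ "${{ needs.setup.outputs.runner-label || 'L20x8' }}" ]  # fall back to the static runner
    steps:
      - run: echo "run the e2e test here"

  cleanup:
    needs: [setup, test]
    if: always()                     # release the runner even if the test job failed
    runs-on: ubuntu-latest
    steps:
      - uses: volcengine/vemlp-github-runner@v1
        with:
          mode: "destroy"
          faas-url: "${{ env.DYNAMIC_RUNNER_ENDPOINT }}"
          mlp-task-id: "${{ needs.setup.outputs.mlp-task-id }}"
```

Keeping `cleanup` behind `if: always()` is what ensures the dynamic runner is destroyed even when the test job fails.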

.github/workflows/e2e_sppo.yml renamed to .github/workflows/.deprecate/e2e_sppo.yml

Lines changed: 39 additions & 5 deletions
@@ -55,19 +55,37 @@ concurrency:
   group: ${{ github.workflow }}-${{ github.ref }}
   cancel-in-progress: ${{ github.ref != 'refs/heads/main' }}
 
+env:
+  IMAGE: "verl-ci-cn-beijing.cr.volces.com/verlai/verl:app-verl0.6-transformers4.56.1-sglang0.5.2-mcore0.13.0-te2.2"
+  DYNAMIC_RUNNER_ENDPOINT: "https://sd10g3clalm04ug7alq90.apigateway-cn-beijing.volceapi.com/runner"
+  TRANSFORMERS_VERSION: "4.56.2"
+
 jobs:
+  setup:
+    if: github.repository_owner == 'volcengine'
+    runs-on: ubuntu-latest
+    outputs:
+      runner-label: ${{ steps.create-runner.outputs.runner-label }}
+      mlp-task-id: ${{ steps.create-runner.outputs.mlp-task-id }}
+    steps:
+      - uses: actions/checkout@v4
+      - id: create-runner
+        uses: volcengine/vemlp-github-runner@v1
+        with:
+          mode: "create"
+          faas-url: "${{ env.DYNAMIC_RUNNER_ENDPOINT }}"
+          mlp-image: "${{ env.IMAGE }}"
+
   e2e_sppo:
-    runs-on: [L20x8]
+    needs: setup
+    runs-on: [ "${{ needs.setup.outputs.runner-label || 'L20x8' }}" ]
     timeout-minutes: 40 # Increase this timeout value as needed
     env:
       HTTP_PROXY: ${{ secrets.PROXY_HTTP }}
       HTTPS_PROXY: ${{ secrets.PROXY_HTTPS }}
       NO_PROXY: "localhost,127.0.0.1,hf-mirror.com"
       HF_ENDPOINT: "https://hf-mirror.com"
       HF_HUB_ENABLE_HF_TRANSFER: "0" # This is more stable
-    container:
-      image: verlai/verl:app-verl0.5-transformers4.55.4-sglang0.4.10.post2-mcore0.13.0-te2.2
-      options: --gpus all --shm-size=10g
     steps:
       - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
         with:
@@ -77,8 +95,24 @@ jobs:
           pip3 install -e .[test,gpu,sglang]
       - name: Prepare MATH dataset
         run: |
-          python3 examples/data_preprocess/math_dataset.py
+          python3 examples/data_preprocess/math_dataset.py --local_dataset_path $HOME/models/hf_data/DigitalLearningGmbH/MATH-lighteval
       - name: Running the E2E test with the SPPO algorithm
         run: |
           ray stop --force
           bash tests/special_e2e/run_sppo.sh
+
+  cleanup:
+    runs-on: ubuntu-latest
+    needs:
+      [
+        setup,
+        e2e_sppo
+      ]
+    if: always()
+    steps:
+      - id: destroy-runner
+        uses: volcengine/vemlp-github-runner@v1
+        with:
+          mode: "destroy"
+          faas-url: "${{ env.DYNAMIC_RUNNER_ENDPOINT }}"
+          mlp-task-id: "${{ needs.setup.outputs.mlp-task-id }}"

.github/workflows/README.md

Lines changed: 5 additions & 1 deletion
@@ -66,4 +66,8 @@ jobs:
         with:
           mode: "destroy"
           faas-url: "${{ env.DYNAMIC_RUNNER_URL }}"
-          task-id: "${{ needs.setup.outputs.task-id }}"
+          task-id: "${{ needs.setup.outputs.task-id }}"
+```
+
+### Model and Dataset
+To avoid CI relies on network, we pre-download dataset on a NFS on the CI machine. The path for models are \${HOME}/models and the path for dataset is \${HOME}/models/hf_data.
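The same offline convention runs through the workflow diffs in this commit: preprocessing scripts are pointed at the NFS copy via `--local_dataset_path`, and model downloads are replaced by `HF_HUB_OFFLINE=1` with the `huggingface-cli download` lines commented out. A hedged sketch of what such steps look like (the step names are invented for illustration; the commands and paths follow the diffs in this commit):

```yaml
# Sketch only: step names are illustrative, commands mirror the pattern used in this commit.
steps:
  - name: Prepare GSM8K dataset from the pre-downloaded NFS copy
    run: |
      python3 examples/data_preprocess/gsm8k.py --local_dataset_path ${HOME}/models/hf_data/gsm8k
  - name: Use pre-downloaded models instead of fetching them
    run: |
      # huggingface-cli download Qwen/Qwen2.5-0.5B --local-dir ${HOME}/models/Qwen/Qwen2.5-0.5B
      export HF_HUB_OFFLINE=1  # fail fast if a step still tries to reach the Hugging Face Hub
```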

.github/workflows/checkpoint_converter.yml

Lines changed: 5 additions & 5 deletions
@@ -81,7 +81,7 @@ jobs:
       NO_PROXY: "localhost,127.0.0.1"
       HF_HUB_ENABLE_HF_TRANSFER: "0" # This is more stable
     container:
-      image: verlai/verl:app-verl0.5-transformers4.55.4-sglang0.4.10.post2-mcore0.13.0-te2.2
+      image: verlai/verl:app-verl0.6-transformers4.56.1-sglang0.5.2-mcore0.13.0-te2.2
       options: --gpus all --shm-size=10g
     steps:
       - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
@@ -92,8 +92,8 @@ jobs:
           pip3 install -e .[test]
       - name: Download Model to Use
         run: |
-          huggingface-cli download Qwen/Qwen2.5-0.5B --local-dir ${HOME}/models/Qwen/Qwen2.5-0.5B
-          huggingface-cli download deepseek-ai/deepseek-coder-1.3b-instruct --local-dir ${HOME}/models/deepseek-ai/deepseek-coder-1.3b-instruct
+          # huggingface-cli download Qwen/Qwen2.5-0.5B --local-dir ${HOME}/models/Qwen/Qwen2.5-0.5B
+          # huggingface-cli download deepseek-ai/deepseek-coder-1.3b-instruct --local-dir ${HOME}/models/deepseek-ai/deepseek-coder-1.3b-instruct
           export HF_HUB_OFFLINE=1
       - name: Running Huggingface to Megatron dist_ckpt converter (Qwen/Qwen2.5-0.5B)
         run: |
@@ -116,7 +116,7 @@ jobs:
       HF_HUB_ENABLE_HF_TRANSFER: "0" # This is more stable
       HF_ENDPOINT: "https://hf-mirror.com"
     container:
-      image: verlai/verl:app-verl0.5-transformers4.55.4-sglang0.4.10.post2-mcore0.13.0-te2.2
+      image: verlai/verl:app-verl0.6-transformers4.56.1-sglang0.5.2-mcore0.13.0-te2.2
       options: --gpus all --shm-size=10g
     steps:
       - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
@@ -127,7 +127,7 @@ jobs:
           pip3 install -e .[test]
       - name: Download Model to Use
         run: |
-          huggingface-cli download Qwen/Qwen1.5-MoE-A2.7B-Chat --local-dir ${HOME}/models/Qwen/Qwen1.5-MoE-A2.7B-Chat
+          # huggingface-cli download Qwen/Qwen1.5-MoE-A2.7B-Chat --local-dir ${HOME}/models/Qwen/Qwen1.5-MoE-A2.7B-Chat
           export HF_HUB_OFFLINE=1
       - name: Running Huggingface to Megatron dist_ckpt CPU converter (Qwen/Qwen1.5-MoE-A2.7B-Chat)
         run: |

.github/workflows/e2e_ascend.yml

Lines changed: 21 additions & 26 deletions
@@ -43,22 +43,17 @@ on:
     branches:
       - main
     paths:
+      - ".github/workflows/e2e_ascend.yml"
       - "**/*.py"
+      - "docs/ascend_tutorial/**"
+      - "examples/**"
+      - "recipe/**"
+      - "tests/special_npu/**"
+      - "tests/special_sanity/**"
+      - "verl/**"
+      - "pyproject.toml"
       - "requirements-npu.txt"
-      # Other entrypoints
-      - "!examples/**"
-      - "!tests/**"
-      - "!verl/trainer/main_*.py"
-      - "!verl/trainer/fsdp_sft_trainer.py"
-      # Recipes
-      - "!recipe/**"
-      # Entrypoints
-      - ".github/workflows/e2e_ascend.yml"
-      - "examples/data_preprocess/gsm8k.py"
-      - "examples/data_preprocess/geo3k.py"
-      - "tests/special_e2e/ppo_trainer"
-      - "verl/trainer/main_ppo.py"
-      - "verl/trainer/config/ppo_trainer.yaml"
+      - "setup.py"
 
 # Cancel jobs on the same ref if a new one is triggered
 concurrency:
@@ -81,6 +76,8 @@ jobs:
         - /usr/local/Ascend/driver/lib64/:/usr/local/Ascend/driver/lib64/
         - /usr/local/Ascend/driver/version.info:/usr/local/Ascend/driver/version.info
         - /etc/ascend_install.info:/etc/ascend_install.info
+        - /data00/dataset:/github/home/dataset
+        - /data00/models:/github/home/models
         # Use self-host cache speed up pip and model download
         # - /home/action/actions-runner/_work/cache:/github/home/.cache/
       options: >-
@@ -109,20 +106,23 @@ jobs:
           pip3 install hf_transfer peft
           pip3 install -r requirements-npu.txt
           pip install -e .
-      - name: Install torchviison
+      - name: Install torchvision
         run: |
           pip install torchvision==0.20.1+cpu --index-url https://download.pytorch.org/whl/cpu
       - name: Uninstall Triton
         run: |
           pip uninstall -y triton
-      - name: Prepare gsm8k dataset
+      - name: Preprocess gsm8k dataset
         run: |
-          ray stop --force
-          python3 examples/data_preprocess/gsm8k.py
-      - name: Prepare geo3k dataset
+          python examples/data_preprocess/gsm8k.py --local_dataset_path ${HOME}/dataset/openai/gsm8k
+      - name: Preprocess geo3k dataset
+        run: |
+          python examples/data_preprocess/geo3k.py --local_dataset_path ${HOME}/dataset/hiyouga/geometry3k
+      - name: Running gsm8k e2e qwen3 training tests with PPO on ASCEND NPU
         run: |
           ray stop --force
-          python3 examples/data_preprocess/geo3k.py
+          bash tests/special_npu/run_qwen3_06b_ppo.sh
+          rm -rf $HOME/ckpts
       - name: Running gsm8k e2e training tests with peft sft on ASCEND NPU
         run: |
           ray stop --force
@@ -143,11 +143,6 @@ jobs:
           ray stop --force
           bash tests/special_npu/run_qwen2_5_05b_dapo.sh
           rm -rf $HOME/ckpts
-      - name: Running gsm8k e2e qwen3 training tests with GRPO on ASCEND NPU
-        run: |
-          ray stop --force
-          bash tests/special_npu/run_qwen3_06b_grpo.sh
-          rm -rf $HOME/ckpts
       - name: Running gsm8k e2e training tests with GRPO MindSpeed on ASCEND NPU
         run: |
           ray stop --force
@@ -157,4 +152,4 @@
       - name: Running NPU profiling unit tests
         run: |
           ray stop --force
-          pytest -s -x tests/utils/test_special_mstx_profile.py
+          pytest -s -x tests/utils/test_special_mstx_profile.py
