Changes from all commits (38 commits)
162aaca  BUMP FA aligned with SGL-KERNEL (johnnynunez, Oct 7, 2025)
4f281c7  BUILD CUDA 13 (johnnynunez, Oct 7, 2025)
0b34d78  BUILD CUDA 13 (johnnynunez, Oct 7, 2025)
28bbd2a  decord2 (johnnynunez, Oct 7, 2025)
3890637  fix typo (johnnynunez, Oct 7, 2025)
ba0294e  Merge branch 'main' into main (johnnynunez, Oct 7, 2025)
771a614  fix typo (johnnynunez, Oct 7, 2025)
4793f03  Merge remote-tracking branch 'origin/main' (johnnynunez, Oct 7, 2025)
6054344  Merge branch 'main' into main (johnnynunez, Oct 7, 2025)
b7a4f36  Merge branch 'main' into main (johnnynunez, Oct 8, 2025)
9a2c0b2  Merge branch 'main' into main (johnnynunez, Oct 8, 2025)
1d5cb44  build cu129 ok, now test pr test cu130 (johnnynunez, Oct 8, 2025)
1be9fa5  Merge branch 'main' into main (johnnynunez, Oct 8, 2025)
95d1a3e  Merge branch 'main' into main (johnnynunez, Oct 8, 2025)
c4f9a20  test bump fa (johnnynunez, Oct 8, 2025)
20c5a82  Update CMakeLists.txt (johnnynunez, Oct 8, 2025)
5419b85  Merge branch 'main' into main (johnnynunez, Oct 9, 2025)
800c6e3  fix (johnnynunez, Oct 9, 2025)
126acc6  Merge remote-tracking branch 'origin/main' (johnnynunez, Oct 9, 2025)
74d1416  fix (johnnynunez, Oct 9, 2025)
4048eab  fix tests (johnnynunez, Oct 9, 2025)
9ee6bac  bump docker (johnnynunez, Oct 9, 2025)
ed65fb1  bump torch (johnnynunez, Oct 9, 2025)
7b6d88f  Merge branch 'main' into main (johnnynunez, Oct 9, 2025)
6a53e65  bump gdrcopy compatible with Blackwell (johnnynunez, Oct 9, 2025)
1860c3f  bump gdrcopy compatible with Blackwell (johnnynunez, Oct 9, 2025)
11ca260  bump gdrcopy compatible with Blackwell (johnnynunez, Oct 9, 2025)
072add5  add support gdrcopy for GB200,GB300,Thor,Spark (johnnynunez, Oct 9, 2025)
0f94fc2  Merge branch 'main' into main (johnnynunez, Oct 9, 2025)
49acd56  upgrade cuda 13 (johnnynunez, Oct 9, 2025)
abfeb61  Update ci_install_deepep.sh (johnnynunez, Oct 9, 2025)
98682db  Merge branch 'main' into main (johnnynunez, Oct 9, 2025)
1982c4f  revert (johnnynunez, Oct 10, 2025)
41b0c19  Merge remote-tracking branch 'origin/main' (johnnynunez, Oct 10, 2025)
fb2b87c  Merge branch 'main' into main (johnnynunez, Oct 10, 2025)
f065618  remove thor in cu12 (johnnynunez, Oct 10, 2025)
de4d952  revert (johnnynunez, Oct 10, 2025)
e67a841  revert (johnnynunez, Oct 10, 2025)
168 changes: 168 additions & 0 deletions .github/workflows/release-whl-kernel.yml
@@ -184,3 +184,171 @@ jobs:
          git add -A
          git commit -m "update whl index"
          git push

  build-cu130:

Review comment (Contributor): can you make it a matrix to unify cu130 and cu130-aarch64?

Reply (Contributor Author): Yes, I can. I'm testing sgl-flash-attention at the moment; see the matrix sketch after this diff.

    if: github.repository == 'sgl-project/sglang'
    runs-on: x64-kernel-build-node
    strategy:
      matrix:
        python-version: [ "3.10" ]
        cuda-version: [ "13.0" ]
    steps:
      - uses: actions/checkout@v4
        with:
          submodules: "recursive"

      - name: Set up Python ${{ matrix.python-version }}
        uses: actions/setup-python@v5
        with:
          python-version: ${{ matrix.python-version }}

      - name: Build wheels
        run: |
          cd sgl-kernel
          chmod +x ./build.sh
          ./build.sh "${{ matrix.python-version }}" "${{ matrix.cuda-version }}"

      - name: Upload to PyPI
        working-directory: sgl-kernel
        run: |
          pip install twine
          python3 -m twine upload --skip-existing dist/* -u __token__ -p ${{ secrets.PYPI_TOKEN }}

      - name: Upload artifacts
        uses: actions/upload-artifact@v4
        with:
          name: wheel-python${{ matrix.python-version }}-cuda${{ matrix.cuda-version }}
          path: sgl-kernel/dist/*

  release-cu130:
    needs: build-cu130
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v4

      - name: Download artifacts
        uses: actions/download-artifact@v4
        with:
          path: sgl-kernel/dist/
          merge-multiple: true
          pattern: wheel-*

      - name: Set tag name
        id: set_tag_name
        run: |
          if [ -z "${{ inputs.tag_name }}" ]; then
            TAG_NAME="v$(cat sgl-kernel/python/sgl_kernel/version.py | cut -d'"' -f2)"
            echo "tag_name=$TAG_NAME" >> $GITHUB_OUTPUT
          else
            echo "tag_name=${{ inputs.tag_name }}" >> $GITHUB_OUTPUT
          fi

      - name: Release
        uses: softprops/action-gh-release@v2
        with:
          tag_name: ${{ steps.set_tag_name.outputs.tag_name }}
          repository: sgl-project/whl
          token: ${{ secrets.WHL_TOKEN }}
          files: |
            sgl-kernel/dist/*

      - name: Clone wheel index
        run: git clone https://oauth2:${WHL_TOKEN}@github.com/sgl-project/whl.git sgl-whl
        env:
          WHL_TOKEN: ${{ secrets.WHL_TOKEN }}

      - name: Update wheel index
        run: python3 scripts/update_kernel_whl_index.py --cuda 130

      - name: Push wheel index
        run: |
          cd sgl-whl
          git config --local user.name "sglang-bot"
          git config --local user.email "[email protected]"
          git add -A
          git commit -m "update whl index"
          git push

  build-cu130-aarch64:
    if: github.repository == 'sgl-project/sglang'
    runs-on: arm-kernel-build-node
    strategy:
      matrix:
        python-version: [ "3.10" ]
        cuda-version: [ "13.0" ]
    steps:
      - uses: actions/checkout@v4
        with:
          submodules: "recursive"

      - name: Set up Python ${{ matrix.python-version }}
        uses: actions/setup-python@v5
        with:
          python-version: ${{ matrix.python-version }}

      - name: Build wheels
        run: |
          cd sgl-kernel
          chmod +x ./build.sh
          ./build.sh "${{ matrix.python-version }}" "${{ matrix.cuda-version }}" aarch64

      - name: Upload to PyPI
        working-directory: sgl-kernel
        run: |
          pip install twine
          python3 -m twine upload --skip-existing dist/* -u __token__ -p ${{ secrets.PYPI_TOKEN }}

      - name: Upload artifacts
        uses: actions/upload-artifact@v4
        with:
          name: wheel-python${{ matrix.python-version }}-cuda${{ matrix.cuda-version }}-aarch64
          path: sgl-kernel/dist/*

  release-cu130-aarch64:
    needs: build-cu130-aarch64
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v4

      - name: Download artifacts
        uses: actions/download-artifact@v4
        with:
          path: sgl-kernel/dist/
          merge-multiple: true
          pattern: wheel-*

      - name: Set tag name
        id: set_tag_name
        run: |
          if [ -z "${{ inputs.tag_name }}" ]; then
            TAG_NAME="v$(cat sgl-kernel/python/sgl_kernel/version.py | cut -d'"' -f2)"
            echo "tag_name=$TAG_NAME" >> $GITHUB_OUTPUT
          else
            echo "tag_name=${{ inputs.tag_name }}" >> $GITHUB_OUTPUT
          fi

      - name: Release
        uses: softprops/action-gh-release@v2
        with:
          tag_name: ${{ steps.set_tag_name.outputs.tag_name }}
          repository: sgl-project/whl
          token: ${{ secrets.WHL_TOKEN }}
          files: |
            sgl-kernel/dist/*

      - name: Clone wheel index
        run: git clone https://oauth2:${WHL_TOKEN}@github.com/sgl-project/whl.git sgl-whl
        env:
          WHL_TOKEN: ${{ secrets.WHL_TOKEN }}

      - name: Update wheel index
        run: python3 scripts/update_kernel_whl_index.py --cuda 130

      - name: Push wheel index
        run: |
          cd sgl-whl
          git config --local user.name "sglang-bot"
          git config --local user.email "[email protected]"
          git add -A
          git commit -m "update whl index"
          git push
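As suggested in the review thread on build-cu130, the x64 and aarch64 CUDA 13 jobs could be folded into a single matrix job. A minimal sketch, reusing the runner labels above; the `arch` axis, the `runner` mapping via `include`, and an explicit arch argument to build.sh (which today only treats a trailing `aarch64` specially) are illustrative assumptions, not code from this PR:

```yaml
  # Sketch only: one matrix job covering both architectures.
  build-cu130-matrix:
    if: github.repository == 'sgl-project/sglang'
    runs-on: ${{ matrix.runner }}
    strategy:
      matrix:
        python-version: [ "3.10" ]
        cuda-version: [ "13.0" ]
        arch: [ x86_64, aarch64 ]
        include:
          # Pair each arch with its runner label (labels taken from the jobs above).
          - arch: x86_64
            runner: x64-kernel-build-node
          - arch: aarch64
            runner: arm-kernel-build-node
    steps:
      - uses: actions/checkout@v4
        with:
          submodules: "recursive"
      - name: Set up Python ${{ matrix.python-version }}
        uses: actions/setup-python@v5
        with:
          python-version: ${{ matrix.python-version }}
      - name: Build wheels
        run: |
          cd sgl-kernel
          chmod +x ./build.sh
          # Assumes build.sh learns to accept "x86_64" explicitly; currently
          # only a third argument of "aarch64" is recognized.
          ./build.sh "${{ matrix.python-version }}" "${{ matrix.cuda-version }}" "${{ matrix.arch }}"
      - name: Upload artifacts
        uses: actions/upload-artifact@v4
        with:
          name: wheel-python${{ matrix.python-version }}-cuda${{ matrix.cuda-version }}-${{ matrix.arch }}
          path: sgl-kernel/dist/*
```

The two release jobs could be unified the same way, or kept as one job that downloads all `wheel-*` artifacts, since both already push to the same index.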
7 changes: 4 additions & 3 deletions docker/Dockerfile
@@ -1,4 +1,4 @@
-ARG CUDA_VERSION=12.9.1
+ARG CUDA_VERSION=13.0.1
FROM nvidia/cuda:${CUDA_VERSION}-cudnn-devel-ubuntu22.04 AS base
ARG TARGETARCH

@@ -83,12 +83,13 @@ RUN python3 -m pip install --no-cache-dir --upgrade pip setuptools wheel html5li
        12.6.1) CUINDEX=126 ;; \
        12.8.1) CUINDEX=128 ;; \
        12.9.1) CUINDEX=129 ;; \
+        13.0.1) CUINDEX=130 ;; \
        *) echo "Unsupported CUDA version: $CUDA_VERSION" && exit 1 ;; \
    esac \
    && if [ "$CUDA_VERSION" = "12.6.1" ]; then \
        python3 -m pip install --no-cache-dir https://github.com/sgl-project/whl/releases/download/v${SGL_KERNEL_VERSION}/sgl_kernel-${SGL_KERNEL_VERSION}+cu124-cp310-abi3-manylinux2014_$(uname -m).whl --force-reinstall --no-deps ; \
    fi \
-    && if [ "$CUDA_VERSION" = "12.8.1" ] || [ "$CUDA_VERSION" = "12.9.1" ]; then \
+    && if [ "$CUDA_VERSION" = "12.8.1" ] || [ "$CUDA_VERSION" = "12.9.1" ] || [ "$CUDA_VERSION" = "13.0.1" ]; then \
        python3 -m pip install --no-cache-dir sgl-kernel==${SGL_KERNEL_VERSION} ; \
    fi \
    && python3 -m pip install --no-cache-dir -e "python[${BUILD_TYPE}]" --extra-index-url https://download.pytorch.org/whl/cu${CUINDEX} \
@@ -130,7 +131,7 @@ RUN cd /sgl-workspace/DeepEP && \
        12.6.1) \
            CHOSEN_TORCH_CUDA_ARCH_LIST='9.0' \
            ;; \
-        12.8.1|12.9.1) \
+        12.8.1|12.9.1|13.0.1) \
            CHOSEN_TORCH_CUDA_ARCH_LIST='9.0;10.0' \
            ;; \
        *) \
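With the case statement above extended, a CUDA 13 image is selected purely through the existing build argument. A hypothetical invocation from the repository root (the image tag is an assumption, not part of this PR):

```bash
# CUDA_VERSION picks the nvidia/cuda base image; inside the Dockerfile it
# also maps to CUINDEX=130, i.e. the cu130 PyTorch wheel index.
docker build -f docker/Dockerfile --build-arg CUDA_VERSION=13.0.1 -t sglang:cu130 .
```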
10 changes: 5 additions & 5 deletions python/pyproject.toml
@@ -46,7 +46,7 @@ dependencies = [
"py-spy",
"pybase64",
"pydantic",
"pynvml",
"nvidia-ml-py",
"python-multipart",
"pyzmq>=25.1.2",
"requests",
@@ -57,10 +57,10 @@ dependencies = [
"soundfile==0.13.1",
"tiktoken",
"timm==1.0.16",
"torch==2.8.0",
"torch>=2.8.0",
"torch_memory_saver==0.0.9rc2",
"torchao==0.9.0",
"torchaudio==2.8.0",
"torchao==0.13.0",
"torchaudio>=2.8.0",
"torchvision",
"tqdm",
"transformers==4.57.0",
@@ -72,7 +72,7 @@ dependencies = [
]

[project.optional-dependencies]
decord = ["decord"]
decord = ["decord2"]
test = [
    "accelerate",
    "expecttest",
2 changes: 1 addition & 1 deletion python/pyproject_other.toml
@@ -41,7 +41,7 @@ runtime_common = [
"psutil",
"pybase64",
"pydantic",
"pynvml",
"nvidia-ml-py",
"python-multipart",
"pyzmq>=25.1.2",
"scipy",
6 changes: 5 additions & 1 deletion python/sglang/srt/utils/common.py
@@ -409,7 +409,11 @@ def get_available_gpu_memory(

        if empty_cache:
            torch.cuda.empty_cache()
-        free_gpu_memory, _ = torch.cuda.mem_get_info(gpu_id)
+        shared_sysmem_device_mem_sms = (87, 110, 121)  # Orin, Thor, Spark
+        if get_device_sm() in shared_sysmem_device_mem_sms:
+            free_gpu_memory = psutil.virtual_memory().available
+        else:
+            free_gpu_memory, _ = torch.cuda.mem_get_info(gpu_id)

    elif device == "xpu":
        num_gpus = torch.xpu.device_count()
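The new branch above exists because Orin (SM 8.7), Thor (SM 11.0), and Spark (SM 12.1) are unified-memory devices: the GPU draws from the same physical pool as the CPU, so `torch.cuda.mem_get_info` understates what is actually allocatable. A self-contained sketch of the same logic, using the public capability API in place of the module's own `get_device_sm` helper (an assumption for portability):

```python
import psutil
import torch

# SM values of integrated devices whose "GPU memory" is system RAM.
SHARED_SYSMEM_DEVICE_SMS = (87, 110, 121)  # Orin, Thor, Spark

def available_gpu_memory_bytes(gpu_id: int = 0) -> int:
    major, minor = torch.cuda.get_device_capability(gpu_id)
    if major * 10 + minor in SHARED_SYSMEM_DEVICE_SMS:
        # Unified memory: CPU-side availability is the real budget.
        return psutil.virtual_memory().available
    free_bytes, _total = torch.cuda.mem_get_info(gpu_id)
    return free_bytes
```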
12 changes: 6 additions & 6 deletions scripts/ci/ci_install_deepep.sh
@@ -4,7 +4,7 @@ set -euxo pipefail

bash scripts/ci/ci_install_dependency.sh

-export GDRCOPY_HOME=/usr/src/gdrdrv-2.4.4/
+export GDRCOPY_HOME=/usr/src/gdrdrv-2.5.1/
export NVSHMEM_DIR=/opt/nvshmem/install
export LD_LIBRARY_PATH="${NVSHMEM_DIR}/lib:$LD_LIBRARY_PATH"
export PATH="${NVSHMEM_DIR}/bin:$PATH"
@@ -23,9 +23,9 @@ rm -rf /opt/gdrcopy && mkdir -p /opt/gdrcopy
rm -rf /opt/nvshmem && mkdir -p /opt/nvshmem
cd /opt/gdrcopy
git clone https://github.com/NVIDIA/gdrcopy.git .
-git checkout v2.4.4
+git checkout v2.5.1
apt update
-apt install -y nvidia-dkms-535
+apt install -y nvidia-dkms-580
apt install -y build-essential devscripts debhelper fakeroot pkg-config dkms
apt install -y check libsubunit0 libsubunit-dev python3-venv
cd packages
@@ -42,8 +42,8 @@ apt-get update && apt-get install -y libfabric-dev

# Install NVSHMEM
cd /opt/nvshmem
-wget https://developer.download.nvidia.com/compute/redist/nvshmem/3.3.9/source/nvshmem_src_cuda12-all-all-3.3.9.tar.gz
-tar -xf nvshmem_src_cuda12-all-all-3.3.9.tar.gz
+wget https://developer.download.nvidia.com/compute/redist/nvshmem/3.4.5/source/nvshmem_src_cuda12-all-all-3.4.5.tar.gz
+tar -xf nvshmem_src_cuda12-all-all-3.4.5.tar.gz
mv nvshmem_src nvshmem && cd nvshmem
NVSHMEM_SHMEM_SUPPORT=0 \
NVSHMEM_UCX_SUPPORT=0 \
@@ -53,7 +53,7 @@ NVSHMEM_IBGDA_SUPPORT=1 \
NVSHMEM_PMIX_SUPPORT=0 \
NVSHMEM_TIMEOUT_DEVICE_POLLING=0 \
NVSHMEM_USE_GDRCOPY=1 \
-cmake -S . -B build/ -DCMAKE_INSTALL_PREFIX=/opt/nvshmem/install -DCMAKE_CUDA_ARCHITECTURES=90
+cmake -S . -B build/ -DCMAKE_INSTALL_PREFIX=/opt/nvshmem/install -DCMAKE_CUDA_ARCHITECTURES="90;100;103;121"
cd build
make -j$(nproc) install

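One detail worth noting in the cmake line above: the quotes around the architecture list are load-bearing, because an unquoted `;` would end the shell command. For example:

```bash
# Quoted: CMake receives the full list 90;100;103;121 (Hopper plus the
# Blackwell family). Unquoted, the shell would stop the command at the
# first ';' and CMake would only see 90.
cmake -S . -B build/ -DCMAKE_INSTALL_PREFIX=/opt/nvshmem/install -DCMAKE_CUDA_ARCHITECTURES="90;100;103;121"
```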
4 changes: 2 additions & 2 deletions scripts/ci/ci_install_dependency.sh
@@ -3,7 +3,7 @@
set -euxo pipefail

IS_BLACKWELL=${IS_BLACKWELL:-0}
CU_VERSION="cu128"
CU_VERSION="cu129"

# Kill existing processes
SCRIPT_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )"
@@ -64,7 +64,7 @@ $PIP_CMD install mooncake-transfer-engine==0.3.6.post1 nvidia-cuda-nvrtc-cu12 py

if [ "$IS_BLACKWELL" != "1" ]; then
# For lmms_evals evaluating MMMU
git clone --branch v0.3.3 --depth 1 https://github.com/EvolvingLMMs-Lab/lmms-eval.git
git clone --branch v0.5 --depth 1 https://github.com/EvolvingLMMs-Lab/lmms-eval.git
$PIP_CMD install -e lmms-eval/ $PIP_INSTALL_SUFFIX

# Install xformers
27 changes: 19 additions & 8 deletions sgl-kernel/CMakeLists.txt
@@ -90,7 +90,7 @@ FetchContent_Populate(repo-flashinfer)
FetchContent_Declare(
  repo-flash-attention
  GIT_REPOSITORY https://github.com/sgl-project/sgl-attn
-  GIT_TAG f9af0c2a1d82ab1812e6987e9338363cc2bf0f8d
+  GIT_TAG sgl-kernel
  GIT_SHALLOW OFF
)
FetchContent_Populate(repo-flash-attention)
@@ -99,7 +99,7 @@ FetchContent_Populate(repo-flash-attention)
FetchContent_Declare(
  repo-flash-attention-origin
  GIT_REPOSITORY https://github.com/Dao-AILab/flash-attention.git
-  GIT_TAG 203b9b3dba39d5d08dffb49c09aa622984dff07d
+  GIT_TAG c485eeade0c3ec9ce186c3640c52c9f1ce090b81
  GIT_SHALLOW OFF
)
FetchContent_Populate(repo-flash-attention-origin)
@@ -218,26 +218,37 @@ if (ENABLE_BELOW_SM90)
"-gencode=arch=compute_80,code=sm_80"
"-gencode=arch=compute_89,code=sm_89"
)
if (CMAKE_SYSTEM_PROCESSOR STREQUAL "aarch64")
list(APPEND SGL_KERNEL_CUDA_FLAGS
"-gencode=arch=compute_87,code=sm_87"
)
endif()

endif()

if ("${CUDA_VERSION}" VERSION_GREATER_EQUAL "12.8" OR SGL_KERNEL_ENABLE_SM100A)
list(APPEND SGL_KERNEL_CUDA_FLAGS
"-gencode=arch=compute_100a,code=sm_100a"
"-gencode=arch=compute_120a,code=sm_120a"
)

# refer sm_121, sm_110 and sm_101 description https://github.com/pytorch/pytorch/pull/156176
if ("${CUDA_VERSION}" VERSION_GREATER_EQUAL "13.0")
list(APPEND SGL_KERNEL_CUDA_FLAGS
"-gencode=arch=compute_103a,code=sm_103a"
"-gencode=arch=compute_110a,code=sm_110a"
"-gencode=arch=compute_121a,code=sm_121a"
"--compress-mode=size"
)
if (CMAKE_SYSTEM_PROCESSOR STREQUAL "aarch64")
list(APPEND SGL_KERNEL_CUDA_FLAGS
"-gencode=arch=compute_110a,code=sm_110a"
"-gencode=arch=compute_121a,code=sm_121a"
)
endif()
else()
list(APPEND SGL_KERNEL_CUDA_FLAGS
"-gencode=arch=compute_101a,code=sm_101a"
)
if (CMAKE_SYSTEM_PROCESSOR STREQUAL "aarch64")
list(APPEND SGL_KERNEL_CUDA_FLAGS
"-gencode=arch=compute_101a,code=sm_101a"
)
endif()
endif()
endif()

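For orientation, the compute capabilities targeted above map to hardware roughly as follows; this annotation is our reading of NVIDIA's naming (and of the PyTorch PR linked in the diff), not text from this change:

```cmake
# sm_80 / sm_89      Ampere (A100) / Ada (RTX 40xx, L40)
# sm_87              Jetson AGX Orin (hence aarch64-only)
# sm_100a / sm_103a  Blackwell data center (B200/GB200, B300/GB300)
# sm_110a            Thor under CUDA 13 (the same part is sm_101a on CUDA 12.8/12.9)
# sm_120a / sm_121a  Blackwell workstation/GeForce and DGX Spark (GB10)
```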
7 changes: 6 additions & 1 deletion sgl-kernel/build.sh
@@ -20,7 +20,10 @@ else
    BUILDER_NAME="pytorch/manylinux2_28-builder"
fi

-if [ ${CUDA_VERSION} = "12.9" ]; then
+if [ ${CUDA_VERSION} = "13.0" ]; then
+    DOCKER_IMAGE="${BUILDER_NAME}:cuda${CUDA_VERSION}"
+    TORCH_INSTALL="pip install --no-cache-dir torch==2.9.0 --index-url https://download.pytorch.org/whl/test/cu130"
+elif [ ${CUDA_VERSION} = "12.9" ]; then
    DOCKER_IMAGE="${BUILDER_NAME}:cuda${CUDA_VERSION}"
    TORCH_INSTALL="pip install --no-cache-dir torch==2.8.0 --index-url https://download.pytorch.org/whl/cu129"
elif [ ${CUDA_VERSION} = "12.8" ]; then
@@ -67,6 +70,8 @@ docker run --rm \
    export CUDA_VERSION=${CUDA_VERSION} && \
    mkdir -p /usr/lib/${ARCH}-linux-gnu/ && \
    ln -s /usr/local/cuda-${CUDA_VERSION}/targets/${LIBCUDA_ARCH}-linux/lib/stubs/libcuda.so /usr/lib/${ARCH}-linux-gnu/libcuda.so && \
+    export CPLUS_INCLUDE_PATH=/usr/local/cuda/include/cccl${CPLUS_INCLUDE_PATH:+:${CPLUS_INCLUDE_PATH}} && \
+    export C_INCLUDE_PATH=/usr/local/cuda/include/cccl${C_INCLUDE_PATH:+:${C_INCLUDE_PATH}} && \

    cd /sgl-kernel && \
    ls -la ${PYTHON_ROOT_PATH}/lib/python${PYTHON_VERSION}/site-packages/wheel/ && \
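The two `*_INCLUDE_PATH` exports added above reflect CUDA 13 relocating the CCCL headers (Thrust, CUB, libcu++) into a separate `include/cccl` subdirectory, so builds expecting `<cub/cub.cuh>` on the default include path need it prepended. A quick, illustrative check inside a builder container:

```bash
# CUDA 13 layout: Thrust/CUB live under include/cccl.
ls /usr/local/cuda/include/cccl
# CUDA 12 layout: they sit directly under include/.
ls /usr/local/cuda/include | grep -E '^(cub|thrust)$' || true
```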