|
1 |
| -# Start from released image |
2 |
| -FROM quay.io/opendatahub/vllm:cuda-pr-182 as vllm-grpc-adapter |
## Global Args #################################################################
# Tag of the UBI9 minimal base image and the Python interpreter to install.
ARG BASE_UBI_IMAGE_TAG=9.4
ARG PYTHON_VERSION=3.12

# CUDA compute capabilities the vllm kernels are compiled for, and the
# (narrower) set used for the flash-attention CMake build.
ARG TORCH_CUDA_ARCH_LIST="7.0 7.5 8.0 8.6 8.9 9.0+PTX"
ARG vllm_fa_cmake_gpu_arches='80-real;90-real'

## Base Layer ##################################################################
FROM registry.access.redhat.com/ubi9/ubi-minimal:${BASE_UBI_IMAGE_TAG} AS base
ARG PYTHON_VERSION
ENV PYTHON_VERSION=${PYTHON_VERSION}
# Install the requested interpreter plus pip/wheel; clean repo metadata in the
# same layer so it never persists in the image.
RUN microdnf -y update && microdnf install -y \
        python${PYTHON_VERSION}-pip python${PYTHON_VERSION}-wheel \
    && microdnf clean all

WORKDIR /workspace

ENV LANG=C.UTF-8 \
    LC_ALL=C.UTF-8

# Some utils for dev purposes - tar required for kubectl cp
RUN microdnf install -y \
        which procps findutils tar vim git \
    && microdnf clean all

## Python Installer ############################################################
FROM base AS python-install
ARG PYTHON_VERSION

ENV VIRTUAL_ENV=/opt/vllm
ENV PATH="$VIRTUAL_ENV/bin:$PATH"
ENV PYTHON_VERSION=${PYTHON_VERSION}
# Create the venv and seed it with current pip/wheel/uv.
# NOTE: use the full --no-cache-dir spelling; the abbreviated --no-cache only
# resolves via optparse prefix matching and breaks if pip ever grows another
# --no-cache* option.
RUN microdnf install -y \
        python${PYTHON_VERSION}-devel && \
    python${PYTHON_VERSION} -m venv $VIRTUAL_ENV && pip install --no-cache-dir -U pip wheel uv && microdnf clean all
| 38 | + |
## CUDA Base ###################################################################
FROM python-install AS cuda-base

# -f makes curl exit non-zero on HTTP errors instead of silently saving an
# error page as the yum repo file.
RUN curl -fLo /etc/yum.repos.d/cuda-rhel9.repo \
        https://developer.download.nvidia.com/compute/cuda/repos/rhel9/x86_64/cuda-rhel9.repo

# CUDA 12.4 toolchain: compiler, NVTX, and the development libraries.
RUN microdnf install -y \
        cuda-nvcc-12-4 cuda-nvtx-12-4 cuda-libraries-devel-12-4 && \
    microdnf clean all

ENV CUDA_HOME="/usr/local/cuda" \
    PATH="${CUDA_HOME}/bin:${PATH}" \
    LD_LIBRARY_PATH="${CUDA_HOME}/lib64:${CUDA_HOME}/extras/CUPTI/lib64:${LD_LIBRARY_PATH}"
| 52 | + |
## Python cuda base #################################################################
FROM cuda-base AS python-cuda-base

ENV VIRTUAL_ENV=/opt/vllm
ENV PATH="$VIRTUAL_ENV/bin:$PATH"

# Install the CUDA-flavoured runtime requirements into the venv. The pip/uv
# caches live in build-host cache mounts, so nothing is baked into the layer;
# requirements files are bind-mounted rather than copied for the same reason.
RUN --mount=type=cache,target=/root/.cache/pip \
    --mount=type=cache,target=/root/.cache/uv \
    --mount=type=bind,source=requirements-common.txt,target=requirements-common.txt \
    --mount=type=bind,source=requirements-cuda.txt,target=requirements-cuda.txt \
    uv pip install -r requirements-cuda.txt
| 67 | + |
## Development #################################################################
FROM python-cuda-base AS dev

# Layer the dev/lint/test requirements on top of the runtime set. All
# requirement files are bind-mounted; package caches stay on the build host.
RUN --mount=type=cache,target=/root/.cache/pip \
    --mount=type=cache,target=/root/.cache/uv \
    --mount=type=bind,source=requirements-common.txt,target=requirements-common.txt \
    --mount=type=bind,source=requirements-cuda.txt,target=requirements-cuda.txt \
    --mount=type=bind,source=requirements-dev.txt,target=requirements-dev.txt \
    --mount=type=bind,source=requirements-lint.txt,target=requirements-lint.txt \
    --mount=type=bind,source=requirements-test.txt,target=requirements-test.txt \
    uv pip install -r requirements-cuda.txt -r requirements-dev.txt
| 82 | + |
## Builder #####################################################################
FROM dev AS build

# install build dependencies
RUN --mount=type=cache,target=/root/.cache/pip \
    --mount=type=cache,target=/root/.cache/uv \
    --mount=type=bind,source=requirements-build.txt,target=requirements-build.txt \
    uv pip install -r requirements-build.txt

# install compiler cache to speed up compilation leveraging local or remote caching
# git is required for the cutlass kernels
RUN rpm -ivh https://dl.fedoraproject.org/pub/epel/epel-release-latest-9.noarch.rpm && rpm -ql epel-release && microdnf install -y git ccache && microdnf clean all

COPY . .

ARG TORCH_CUDA_ARCH_LIST
ENV TORCH_CUDA_ARCH_LIST=$TORCH_CUDA_ARCH_LIST
ARG vllm_fa_cmake_gpu_arches
ENV VLLM_FA_CMAKE_GPU_ARCHES=${vllm_fa_cmake_gpu_arches}

# max jobs used by Ninja to build extensions
ARG max_jobs=2
ENV MAX_JOBS=${max_jobs}
# number of threads used by nvcc
ARG nvcc_threads=8
ENV NVCC_THREADS=$nvcc_threads
# make sure punica kernels are built (for LoRA)
ENV VLLM_INSTALL_PUNICA_KERNELS=1

# Make sure the cuda environment is in the PATH
ENV PATH=/usr/local/cuda/bin:$PATH

ENV CCACHE_DIR=/root/.cache/ccache
# BUGFIX: the previous `env CFLAGS="-march=haswell" CXXFLAGS="$CFLAGS $CXXFLAGS"`
# expanded $CFLAGS from the (empty) outer shell environment before `env` ran,
# so -march=haswell was never applied to the C++ compile. Spell the flag out
# in CXXFLAGS explicitly.
RUN --mount=type=cache,target=/root/.cache/ccache \
    --mount=type=cache,target=/root/.cache/pip \
    --mount=type=cache,target=/root/.cache/uv \
    --mount=type=bind,src=.git,target=/workspace/.git \
    env CFLAGS="-march=haswell" \
        CXXFLAGS="-march=haswell" \
        CMAKE_BUILD_TYPE=Release \
        python3 setup.py bdist_wheel --dist-dir=dist
#################### libsodium Build IMAGE ####################
FROM base AS libsodium-builder

RUN microdnf install -y gcc gzip \
    && microdnf clean all

WORKDIR /usr/src/libsodium

ARG LIBSODIUM_VERSION=1.0.20
# -f: fail on HTTP errors rather than saving an error page as the tarball.
RUN curl -fLO https://github.com/jedisct1/libsodium/releases/download/${LIBSODIUM_VERSION}-RELEASE/libsodium-${LIBSODIUM_VERSION}.tar.gz \
    && tar -xzvf libsodium*.tar.gz \
    && rm -f libsodium*.tar.gz \
    && mv libsodium*/* ./

# BUGFIX: MAX_JOBS was never declared in this stage (it only exists in the
# `build` stage), so `make -j $MAX_JOBS` expanded to an unbounded `make -j`.
# Declare it here with the same default; ARG values are visible as env vars
# in this stage's RUN instructions.
ARG MAX_JOBS=2
RUN CFLAGS="-O3 -Wall -Werror=format-security -Wno-unused-function -Wp,-D_GLIBCXX_ASSERTIONS -fstack-protector-strong -fstack-clash-protection -fcf-protection" \
    ./configure --prefix="/usr/" && make -j ${MAX_JOBS} && make check
| 141 | + |
## Release #####################################################################
FROM python-install AS vllm-openai
ARG PYTHON_VERSION

WORKDIR /workspace

ENV VIRTUAL_ENV=/opt/vllm
ENV PATH=$VIRTUAL_ENV/bin/:$PATH

# Prefer the CUDA runtime libraries shipped inside the venv over any system
# copies by putting their lib dirs at the front of the search path.
ENV LD_LIBRARY_PATH="${VIRTUAL_ENV}/lib/python${PYTHON_VERSION}/site-packages/nvidia/cuda_nvrtc/lib:${LD_LIBRARY_PATH}"
ENV LD_LIBRARY_PATH="${VIRTUAL_ENV}/lib/python${PYTHON_VERSION}/site-packages/nvidia/cuda_runtime/lib:${LD_LIBRARY_PATH}"
ENV LD_LIBRARY_PATH="${VIRTUAL_ENV}/lib/python${PYTHON_VERSION}/site-packages/nvidia/nvtx/lib:${LD_LIBRARY_PATH}"

# Triton JIT-compiles kernels at runtime and needs a C compiler available.
RUN microdnf install -y gcc \
    && microdnf clean all

# Install the vllm wheel (built in the `build` stage) first so torch and the
# other heavy dependencies come in as its requirements.
RUN --mount=type=bind,from=build,src=/workspace/dist,target=/workspace/dist \
    --mount=type=cache,target=/root/.cache/pip \
    --mount=type=cache,target=/root/.cache/uv \
    uv pip install $(echo dist/*.whl)'[tensorizer]' --verbose

# Install libsodium for Tensorizer encryption
RUN --mount=type=bind,from=libsodium-builder,src=/usr/src/libsodium,target=/usr/src/libsodium \
    cd /usr/src/libsodium \
    && make install

# NOTE(review): this wheel is hard-pinned to cp312/cu124/torch2.4 — it must be
# kept in sync with PYTHON_VERSION (currently 3.12) and the torch/CUDA pins.
RUN --mount=type=cache,target=/root/.cache/pip \
    --mount=type=cache,target=/root/.cache/uv \
    uv pip install \
        "https://github.com/flashinfer-ai/flashinfer/releases/download/v0.1.6/flashinfer-0.1.6+cu124torch2.4-cp312-cp312-linux_x86_64.whl"

ENV HF_HUB_OFFLINE=1 \
    HOME=/home/vllm \
    # Allow requested max length to exceed what is extracted from the
    # config.json
    # see: https://github.com/vllm-project/vllm/pull/7080
    VLLM_ALLOW_LONG_MAX_MODEL_LEN=1 \
    VLLM_USAGE_SOURCE=production-docker-image \
    VLLM_WORKER_MULTIPROC_METHOD=fork \
    VLLM_NO_USAGE_STATS=1

# setup non-root user for OpenShift
RUN umask 002 \
    && useradd --uid 2000 --gid 0 vllm \
    && chmod g+rwx $HOME /usr/src /workspace

COPY LICENSE /licenses/vllm.md

# Copy only .jinja files from example directory to template directory
COPY examples/*.jinja /app/data/template/

USER 2000
ENTRYPOINT ["python3", "-m", "vllm.entrypoints.openai.api_server"]


FROM vllm-openai AS vllm-grpc-adapter

# Root is only needed while installing the adapter package.
USER root

# RUN --mount=type=cache,target=/root/.cache/pip \
#     pip install vllm-tgis-adapter==0.5.1

# BUGFIX: the stage previously ended with USER root still active, so
# containers built from this target ran as root. Restore the unprivileged
# user created in the vllm-openai stage.
USER 2000
|
0 commit comments