Commit 6da1e9c

Merge pull request #1741 from containers/vllm-cuda
CUDA vLLM variant
2 parents: 1466150 + c26c140 · commit 6da1e9c

5 files changed: +119, -38 lines changed
New file

Lines changed: 17 additions & 0 deletions

@@ -0,0 +1,17 @@
+ARG PARENT=quay.io/ramalama/cuda:latest
+FROM $PARENT
+
+ENV UV_PYTHON_INSTALL_DIR="/opt/uv/python"
+ENV VIRTUAL_ENV="/opt/venv"
+ENV PATH="$VIRTUAL_ENV/bin:/root/.local/bin:$PATH"
+
+ENV UV_HTTP_TIMEOUT=500
+
+ENV UV_INDEX_STRATEGY="unsafe-best-match"
+ENV UV_LINK_MODE="copy"
+
+COPY . /src/ramalama
+WORKDIR /src/ramalama
+RUN container-images/scripts/build-vllm.sh "cuda"
+WORKDIR /
+
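This new Containerfile layers the vLLM build on top of the published CUDA image via the PARENT build arg. A local build might look like the sketch below; the -f path and -t tag are assumptions (the new file's name is not shown in this view), inferred from how the other image variants are laid out.

    # Sketch: build the CUDA vLLM variant locally (hypothetical path and tag).
    podman build \
        --build-arg PARENT=quay.io/ramalama/cuda:latest \
        -f container-images/cuda-vllm/Containerfile \
        -t ramalama/cuda-vllm .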

container-images/cuda/Containerfile

Lines changed: 3 additions & 3 deletions

@@ -1,13 +1,13 @@
-ARG VERSION=12.8.1
+ARG CUDA_VERSION=12.8.1
 # Base image with CUDA for compilation
-FROM docker.io/nvidia/cuda:${VERSION}-devel-ubi9 AS builder
+FROM docker.io/nvidia/cuda:${CUDA_VERSION}-devel-ubi9 AS builder
 
 COPY . /src/ramalama
 WORKDIR /src/ramalama
 RUN container-images/scripts/build_llama_and_whisper.sh cuda
 
 # Final runtime image
-FROM docker.io/nvidia/cuda:${VERSION}-runtime-ubi9
+FROM docker.io/nvidia/cuda:${CUDA_VERSION}-runtime-ubi9
 
 # Copy the entire installation directory from the builder
 COPY --from=builder /tmp/install /usr
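Renaming VERSION to CUDA_VERSION makes the build arg's purpose explicit and lines it up with the CUDA_VERSION value that build-vllm.sh later reads (the nvidia/cuda base images export CUDA_VERSION in the environment). A minimal sketch of overriding it at build time; 12.6.3 is an illustrative tag, any nvidia/cuda release published with -devel-ubi9 and -runtime-ubi9 images should work:

    # Sketch: build the CUDA base image against a release other than the 12.8.1 default.
    podman build \
        --build-arg CUDA_VERSION=12.6.3 \
        -f container-images/cuda/Containerfile \
        -t ramalama/cuda .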

container-images/ramalama-vllm/Containerfile

Lines changed: 1 addition & 1 deletion

@@ -12,6 +12,6 @@ ENV UV_LINK_MODE="copy"
 
 COPY . /src/ramalama
 WORKDIR /src/ramalama
-RUN container-images/scripts/build-vllm.sh
+RUN container-images/scripts/build-vllm.sh "ramalama"
 WORKDIR /
 

container-images/scripts/build-vllm.sh

Lines changed: 97 additions & 33 deletions

@@ -4,13 +4,51 @@ available() {
   command -v "$1" >/dev/null
 }
 
-install_deps() {
-  set -eux -o pipefail
+is_rhel_based() { # doesn't include openEuler
+  # shellcheck disable=SC1091
+  source /etc/os-release
+  [ "$ID" = "rhel" ] || [ "$ID" = "redhat" ] || [ "$ID" == "centos" ]
+}
 
+dnf_install_epel() {
+  local rpm_exclude_list="selinux-policy,container-selinux"
+  local url="https://dl.fedoraproject.org/pub/epel/epel-release-latest-9.noarch.rpm"
+  dnf reinstall -y "$url" || dnf install -y "$url" --exclude "$rpm_exclude_list"
+  crb enable # this is in epel-release, can only install epel-release via url
+}
+
+add_stream_repo() {
+  local url="https://mirror.stream.centos.org/9-stream/$1/$uname_m/os/"
+  dnf config-manager --add-repo "$url"
+  url="http://mirror.centos.org/centos/RPM-GPG-KEY-CentOS-Official"
+  local file="/etc/pki/rpm-gpg/RPM-GPG-KEY-CentOS-Official"
+  if [ ! -e $file ]; then
+    curl --retry 8 --retry-all-errors -o $file "$url"
+    rpm --import $file
+  fi
+}
+
+rm_non_ubi_repos() {
+  local dir="/etc/yum.repos.d"
+  rm -rf $dir/mirror.stream.centos.org_9-stream_* $dir/epel*
+}
+
+install_deps() {
   if available dnf; then
-    dnf install -y git curl wget ca-certificates gcc gcc-c++ \
-      gperftools-libs numactl-devel ffmpeg libSM libXext mesa-libGL jq lsof \
-      vim numactl
+    dnf install -y git wget ca-certificates gcc gcc-c++ libSM libXext \
+      mesa-libGL jq lsof vim numactl
+    if is_rhel_based; then
+      add_stream_repo "AppStream"
+      dnf install -y numactl-devel
+      rm_non_ubi_repos
+
+      dnf_install_epel
+      dnf install -y gperftools-libs
+      rm_non_ubi_repos
+    else
+      dnf install -y numactl-devel gperftools-libs
+    fi
+
     dnf -y clean all
     rm -rf /var/cache/*dnf*
   elif available apt-get; then
@@ -25,25 +63,33 @@ install_deps() {
 }
 
 preload_and_ulimit() {
-  local ld_preload_file="libtcmalloc_minimal.so.4"
-  local ld_preload_file_1="/usr/lib/$arch-linux-gnu/$ld_preload_file"
-  local ld_preload_file_2="/usr/lib64/$ld_preload_file"
-  if [ -e "$ld_preload_file_1" ]; then
-    ld_preload_file="$ld_preload_file_1"
-  elif [ -e "$ld_preload_file_2" ]; then
-    ld_preload_file="$ld_preload_file_2"
+  if [ "$containerfile" = "ramalama" ]; then
+    local ld_preload_file="libtcmalloc_minimal.so.4"
+    local ld_preload_file_1="/usr/lib/$uname_m-linux-gnu/$ld_preload_file"
+    local ld_preload_file_2="/usr/lib64/$ld_preload_file"
+    if [ -e "$ld_preload_file_1" ]; then
+      ld_preload_file="$ld_preload_file_1"
+    elif [ -e "$ld_preload_file_2" ]; then
+      ld_preload_file="$ld_preload_file_2"
+    fi
+
+    if [ -e "$ld_preload_file" ]; then
+      echo "LD_PRELOAD=$ld_preload_file" >> /etc/environment
+    fi
+
+    echo 'ulimit -c 0' >> ~/.bashrc
   fi
-
-  if [ -e "$ld_preload_file" ]; then
-    echo "LD_PRELOAD=$ld_preload_file" >> /etc/environment
-  fi
-
-  echo 'ulimit -c 0' >> ~/.bashrc
 }
 
 pip_install() {
-  local url="https://download.pytorch.org/whl/cpu"
-  uv pip install -v -r "$1" --extra-index-url $url
+  local url="https://download.pytorch.org/whl"
+  if [ "$containerfile" = "ramalama" ]; then
+    url="$url/cpu"
+  elif [ "$containerfile" = "cuda" ]; then
+    url="$url/cu$(echo "$CUDA_VERSION" | cut -d. -f1,2 | tr -d '.')"
+  fi
+
+  uv pip install -v -r "$1" --extra-index-url "$url"
 }
 
 git_clone_specific_commit() {
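For the cuda variant, pip_install derives the PyTorch wheel index from CUDA_VERSION: it keeps major.minor and strips the dot, producing a cuXYZ suffix. Worked through for the default version:

    # CUDA_VERSION=12.8.1 -> "cu128", so the extra index resolves to
    # https://download.pytorch.org/whl/cu128
    CUDA_VERSION=12.8.1
    echo "cu$(echo "$CUDA_VERSION" | cut -d. -f1,2 | tr -d '.')"   # prints: cu128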
@@ -55,33 +101,51 @@ git_clone_specific_commit() {
   git reset --hard $commit
 }
 
+pip_install_all() {
+  if [ "$containerfile" = "ramalama" ]; then
+    pip_install requirements/cpu-build.txt
+    pip_install requirements/cpu.txt
+  elif [ "$containerfile" = "cuda" ]; then
+    pip_install requirements/cuda.txt
+  fi
+}
+
 main() {
   set -eux -o pipefail
 
-  install_deps
+  local containerfile=$1
+  if [ "$containerfile" != "ramalama" ] && [ "$containerfile" != "cuda" ]; then
+    echo "First argument must be 'ramalama' or 'cuda'. Got: '$containerfile'"
+    return 1
+  fi
 
-  local arch
-  arch=$(uname -m)
-  preload_and_ulimit
+  local uname_m
+  uname_m=$(uname -m)
 
+  install_deps
+  preload_and_ulimit
   uv venv --python 3.12 --seed "$VIRTUAL_ENV"
   uv pip install --upgrade pip
 
   local vllm_url="https://github.com/vllm-project/vllm"
   local commit="ac9fb732a5c0b8e671f8c91be8b40148282bb14a"
   git_clone_specific_commit
-  if [ "$arch" == "x86_64" ]; then
-    export VLLM_CPU_DISABLE_AVX512="0"
-    export VLLM_CPU_AVX512BF16="0"
-    export VLLM_CPU_AVX512VNNI="0"
-  elif [ "$arch" == "aarch64" ]; then
-    export VLLM_CPU_DISABLE_AVX512="true"
+  if [ "$containerfile" = "ramalama" ]; then
+    export VLLM_TARGET_DEVICE="cpu"
+    if [ "$uname_m" == "x86_64" ]; then
+      export VLLM_CPU_DISABLE_AVX512="0"
+      export VLLM_CPU_AVX512BF16="0"
+      export VLLM_CPU_AVX512VNNI="0"
+    elif [ "$uname_m" == "aarch64" ]; then
+      export VLLM_CPU_DISABLE_AVX512="true"
+    fi
+  elif [ "$containerfile" = "cuda" ]; then
+    export VLLM_TARGET_DEVICE="cuda"
   fi
 
-  pip_install requirements/cpu-build.txt
-  pip_install requirements/cpu.txt
+  pip_install_all
+  MAX_JOBS=2 python3 setup.py install
 
-  MAX_JOBS=2 VLLM_TARGET_DEVICE=cpu python3 setup.py install
   cd -
   rm -rf vllm /root/.cache
 }
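With the new required argument, main validates its mode up front and everything downstream (dependency set, LD_PRELOAD tuning, wheel index, VLLM_TARGET_DEVICE) branches on it. An invocation sketch, assuming the script runs inside the image build with VIRTUAL_ENV set by the Containerfile (and CUDA_VERSION present for the cuda path):

    container-images/scripts/build-vllm.sh "ramalama"   # CPU wheels, VLLM_TARGET_DEVICE=cpu
    container-images/scripts/build-vllm.sh "cuda"       # cuXYZ wheels, VLLM_TARGET_DEVICE=cuda
    container-images/scripts/build-vllm.sh "rocm"       # rejected: prints the error and returns 1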

container-images/scripts/build_llama_and_whisper.sh

Lines changed: 1 addition & 1 deletion

@@ -128,7 +128,7 @@ dnf_install_ffmpeg() {
    add_stream_repo "CRB"
  fi
 
-  if [[ "${ID}" == "openEuler" ]]; then
+  if [ "${ID}" = "openEuler" ]; then
    dnf install -y ffmpeg
  else
    dnf install -y ffmpeg-free
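The switch from the bash-only [[ ... == ... ]] to POSIX [ ... = ... ] keeps this test consistent with the single-bracket comparisons used elsewhere in these scripts. A one-line illustration:

    ID=openEuler
    [ "${ID}" = "openEuler" ] && echo "match"   # valid in any POSIX shell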
