Skip to content
Merged
Show file tree
Hide file tree
Changes from 13 commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
36 changes: 0 additions & 36 deletions .github/workflows/release-docker-blackwell.yml

This file was deleted.

47 changes: 0 additions & 47 deletions .github/workflows/release-docker-deepep.yml

This file was deleted.

17 changes: 14 additions & 3 deletions .github/workflows/release-docker.yml
Original file line number Diff line number Diff line change
Expand Up @@ -14,8 +14,13 @@ jobs:
environment: 'prod'
strategy:
matrix:
cuda_version: ['12.4.1']
build_type: ['all']
cuda_version: ['12.6.1', '12.8.1']
build_type: ['all', 'blackwell']
exclude:
- cuda_version: '12.6.1'
build_type: 'blackwell'
- cuda_version: '12.8.1'
build_type: 'all'
steps:
- name: Delete huge unnecessary tools folder
run: rm -rf /opt/hostedtoolcache
Expand All @@ -41,6 +46,10 @@ jobs:
cuda_tag="cu124"
elif [ "${{ matrix.cuda_version }}" = "12.5.1" ]; then
cuda_tag="cu125"
elif [ "${{ matrix.cuda_version }}" = "12.6.1" ]; then
cuda_tag="cu126"
elif [ "${{ matrix.cuda_version }}" = "12.8.1" ]; then
cuda_tag="cu128"
else
echo "Unsupported CUDA version"
exit 1
Expand All @@ -52,12 +61,14 @@ jobs:
tag_suffix=""
elif [ "${{ matrix.build_type }}" = "srt" ]; then
tag_suffix="-srt"
elif [ "${{ matrix.build_type }}" = "blackwell" ]; then
tag_suffix="-blackwell"
else
echo "Unsupported build type"
exit 1
fi

docker build . -f docker/Dockerfile --build-arg CUDA_VERSION=${{ matrix.cuda_version }} --build-arg BUILD_TYPE=${{ matrix.build_type }} -t lmsysorg/sglang:${tag}${tag_suffix} --no-cache
docker buildx build . -f docker/Dockerfile --build-arg CUDA_VERSION=${{ matrix.cuda_version }} --build-arg BUILD_TYPE=${{ matrix.build_type }} -t lmsysorg/sglang:${tag}${tag_suffix} --no-cache
docker push lmsysorg/sglang:${tag}${tag_suffix}

if [ "${{ matrix.cuda_version }}" = "12.4.1" ]; then
Expand Down
154 changes: 130 additions & 24 deletions docker/Dockerfile
Original file line number Diff line number Diff line change
@@ -1,51 +1,157 @@
ARG CUDA_VERSION=12.4.1
ARG CUDA_VERSION=12.6.1

FROM nvcr.io/nvidia/tritonserver:24.12-py3-min
FROM nvidia/cuda:${CUDA_VERSION}-cudnn-devel-ubuntu22.04

ARG BUILD_TYPE=all

ENV DEBIAN_FRONTEND=noninteractive
ENV CUDA_HOME=/usr/local/cuda

# TZ
RUN echo 'tzdata tzdata/Areas select America' | debconf-set-selections \
&& echo 'tzdata tzdata/Zones/America select Los_Angeles' | debconf-set-selections \
&& apt update -y \
&& apt install software-properties-common -y \
&& apt install python3 python3-pip -y \
&& apt install curl git sudo libibverbs-dev -y \
&& apt install rdma-core infiniband-diags openssh-server perftest -y \
&& python3 --version \
&& python3 -m pip --version \
&& rm -rf /var/lib/apt/lists/* \
&& apt clean

# For openbmb/MiniCPM models
RUN pip3 install datamodel_code_generator --break-system-packages
&& echo 'tzdata tzdata/Zones/America select Los_Angeles' | debconf-set-selections

# Deps
RUN apt-get update && apt-get install -y --no-install-recommends software-properties-common netcat-openbsd \
libopenmpi-dev libnuma1 \
lsof zsh ccache tmux htop git-lfs tree \
kmod \
unzip \
rdma-core \
infiniband-diags \
openssh-server \
perftest \
ibverbs-providers \
libibumad3 \
libibverbs1 \
libnl-3-200 \
libnl-route-3-200 \
librdmacm1 \
build-essential \
cmake \
libibverbs-dev \
libgoogle-glog-dev \
libgtest-dev \
libjsoncpp-dev \
libnuma-dev \
libibverbs-dev \
libunwind-dev \
libgoogle-glog-dev \
libpython3-dev \
libboost-all-dev \
libssl-dev \
libgrpc-dev \
libgrpc++-dev \
libprotobuf-dev \
protobuf-compiler-grpc \
pybind11-dev \
libhiredis-dev \
pkg-config \
patchelf \
libcurl4-openssl-dev \
curl \
libczmq4 \
libczmq-dev \
libnl-route-3-dev \
libnl-3-dev \
librdmacm1 \
libhiredis-dev \
nvidia-dkms-535 \
build-essential \
devscripts \
debhelper \
fakeroot \
dkms \
check \
libsubunit0 \
libsubunit-dev \
libfabric-dev \
python3 \
python3-pip \
&& rm -rf /var/lib/apt/lists/* \
&& ln -s /usr/bin/python3 /usr/bin/python \
&& apt clean

# GDR Related
ENV GDRCOPY_HOME=/usr/src/gdrdrv-2.4.4/
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

medium

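Please clarify what the `GDRCOPY_HOME` environment variable is intended for. It is set to `/usr/src/gdrdrv-2.4.4/`, while GDRCopy itself is installed from the `.deb` packages built in the following step. If the variable is meant for runtime discovery, consider setting it to a path that reflects where those packages actually install their files.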

# GDRCopy
RUN mkdir -p /tmp \
&& cd /tmp \
&& git clone https://github.com/NVIDIA/gdrcopy.git -b v2.4.4 \
&& cd /tmp/gdrcopy/packages \
&& CUDA=/usr/local/cuda ./build-deb-packages.sh \
&& dpkg -i gdrdrv-dkms_*.deb \
&& dpkg -i libgdrapi_*.deb \
&& dpkg -i gdrcopy-tests_*.deb \
&& dpkg -i gdrcopy_*.deb \
&& rm -rf /tmp/gdrcopy

# DeepEP RDMA IBGDA Related
RUN ln -sf /usr/lib/x86_64-linux-gnu/libmlx5.so.1 /usr/lib/x86_64-linux-gnu/libmlx5.so

WORKDIR /sgl-workspace

ARG CUDA_VERSION
RUN python3 -m pip install --upgrade pip setuptools wheel html5lib six --break-system-packages --ignore-installed \
# SGLang
RUN python3 -m pip install --upgrade pip setuptools wheel html5lib six --ignore-installed \
&& git clone --depth=1 https://github.com/sgl-project/sglang.git \
&& if [ "$CUDA_VERSION" = "12.1.1" ]; then \
export CUINDEX=121; \
&& if [ "$CUDA_VERSION" = "12.6.1" ]; then \
export CUINDEX=126; \
elif [ "$CUDA_VERSION" = "12.4.1" ]; then \
export CUINDEX=124; \
elif [ "$CUDA_VERSION" = "12.8.1" ]; then \
export CUINDEX=124; \
elif [ "$CUDA_VERSION" = "11.8.0" ]; then \
export CUINDEX=118; \
python3 -m pip install --no-cache-dir sgl-kernel -i https://docs.sglang.ai/whl/cu118 --break-system-packages; \
python3 -m pip install --no-cache-dir sgl-kernel -i https://docs.sglang.ai/whl/cu118 ; \
else \
echo "Unsupported CUDA version: $CUDA_VERSION" && exit 1; \
fi \
&& if [ "$CUDA_VERSION" = "12.4.1" ]; then \
python3 -m pip install --no-cache-dir torch --index-url https://download.pytorch.org/whl/cu126 --break-system-packages; \
python3 -m pip install --no-cache-dir torch --index-url https://download.pytorch.org/whl/cu126 ; \
else \
python3 -m pip install --no-cache-dir torch --index-url https://download.pytorch.org/whl/cu${CUINDEX} --break-system-packages; \
python3 -m pip install --no-cache-dir torch --index-url https://download.pytorch.org/whl/cu${CUINDEX} ; \
fi \
&& cd sglang \
&& python3 -m pip --no-cache-dir install -e "python[${BUILD_TYPE}]" --break-system-packages \
&& python3 -m pip --no-cache-dir install -e "python[${BUILD_TYPE}]" \
&& if [ "$CUDA_VERSION" = "12.8.1" ]; then \
python3 -m pip install nvidia-nccl-cu12==2.26.2.post1 --force-reinstall --no-deps --break-system-packages; \
python3 -m pip install nvidia-nccl-cu12==2.27.3 --force-reinstall --no-deps ; \
fi

# NVSHMEM
ENV NVSHMEM_DIR=/sgl-workspace/nvshmem/install
RUN wget https://developer.download.nvidia.com/compute/redist/nvshmem/3.2.5/source/nvshmem_src_3.2.5-1.txz \
&& git clone https://github.com/deepseek-ai/DeepEP.git \
&& tar -xf nvshmem_src_3.2.5-1.txz \
&& mv nvshmem_src nvshmem \
&& cd /sgl-workspace/nvshmem \
&& git apply /sgl-workspace/DeepEP/third-party/nvshmem.patch \
&& sed -i '1i#include <unistd.h>' /sgl-workspace/nvshmem/examples/moe_shuffle.cu \
&& cat /sgl-workspace/nvshmem/examples/moe_shuffle.cu \
&& rm -f nvshmem_src_3.2.5-1.txz \
&& cd /sgl-workspace/nvshmem \
&& NVSHMEM_SHMEM_SUPPORT=0 \
NVSHMEM_UCX_SUPPORT=0 \
NVSHMEM_USE_NCCL=0 \
NVSHMEM_MPI_SUPPORT=0 \
NVSHMEM_IBGDA_SUPPORT=1 \
NVSHMEM_PMIX_SUPPORT=0 \
NVSHMEM_TIMEOUT_DEVICE_POLLING=0 \
NVSHMEM_USE_GDRCOPY=1 \
cmake -S . -B build/ -DCMAKE_INSTALL_PREFIX=/sgl-workspace/nvshmem/install -DCMAKE_CUDA_ARCHITECTURES=90 \
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

medium

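The NVSHMEM build hard-codes `CMAKE_CUDA_ARCHITECTURES=90`, so the build targets only that one GPU architecture, which limits portability to other NVIDIA GPU architectures. Consider making `CMAKE_CUDA_ARCHITECTURES` a build argument. Note that the suggestion below only becomes configurable if the Dockerfile also declares the variable (for example `ARG TARGET_CUDA_ARCHS=90`) before this `RUN` step; otherwise the default of 90 is always used.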

        cmake -S . -B build/ -DCMAKE_INSTALL_PREFIX=/sgl-workspace/nvshmem/install -DCMAKE_CUDA_ARCHITECTURES=${TARGET_CUDA_ARCHS:-90} \

&& cd build \
&& make install -j \
&& cd /sgl-workspace/DeepEP \
&& NVSHMEM_DIR=/sgl-workspace/nvshmem/install pip install .

# PyPi packages
RUN pip3 install --upgrade datamodel_code_generator \
mooncake_transfer_engine \
pre-commit \
pytest \
black \
isort \
icdiff \
uv \
wheel scikit-build-core
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

medium

This `pip3 install` command adds several development and linting tools (`pre-commit`, `pytest`, `black`, `isort`, `icdiff`) to the image. If the image is primarily intended for runtime deployment, consider using a multi-stage build or a separate development image so that these tools are excluded from the runtime image.


ENV DEBIAN_FRONTEND=interactive
Loading