Support autoquant for vllm_0.6.3.post1 #9

Open · wants to merge 3 commits into base: online_0.6.3.post1
38 changes: 22 additions & 16 deletions CMakeLists.txt
@@ -189,11 +189,11 @@ endif()
# Use FetchContent for C++ dependencies that are compiled as part of vLLM's build process.
# Configure it to place files in vllm/.deps, in order to play nicely with sccache.
#
include(FetchContent)
get_filename_component(PROJECT_ROOT_DIR "${CMAKE_CURRENT_SOURCE_DIR}" ABSOLUTE)
file(MAKE_DIRECTORY "${FETCHCONTENT_BASE_DIR}")
set(FETCHCONTENT_BASE_DIR "${PROJECT_ROOT_DIR}/.deps")
message(STATUS "FetchContent base directory: ${FETCHCONTENT_BASE_DIR}")
#include(FetchContent)
#get_filename_component(PROJECT_ROOT_DIR "${CMAKE_CURRENT_SOURCE_DIR}" ABSOLUTE)
#file(MAKE_DIRECTORY "${FETCHCONTENT_BASE_DIR}")
#set(FETCHCONTENT_BASE_DIR "${PROJECT_ROOT_DIR}/.deps")
#message(STATUS "FetchContent base directory: ${FETCHCONTENT_BASE_DIR}")

#
# Define other extension targets
@@ -218,16 +218,18 @@ set(VLLM_EXT_SRC
"csrc/torch_bindings.cpp")

if(VLLM_GPU_LANG STREQUAL "CUDA")
include(FetchContent)
SET(CUTLASS_ENABLE_HEADERS_ONLY ON CACHE BOOL "Enable only the header library")

# Set CUTLASS_REVISION manually -- its revision detection doesn't work in this case.
set(CUTLASS_REVISION "v3.5.1" CACHE STRING "CUTLASS revision to use")

FetchContent_Declare(
cutlass
GIT_REPOSITORY https://github.com/nvidia/cutlass.git
GIT_TAG v3.5.1
GIT_PROGRESS TRUE
SOURCE_DIR /root/cutlass
#GIT_REPOSITORY https://github.com/nvidia/cutlass.git
#GIT_TAG v3.5.1
#GIT_PROGRESS TRUE

# Speed up CUTLASS download by retrieving only the specified GIT_TAG instead of the history.
# Important: If GIT_SHALLOW is enabled then GIT_TAG works only with branch names and tags.
@@ -242,6 +244,9 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
"csrc/quantization/aqlm/gemm_kernels.cu"
"csrc/quantization/awq/gemm_kernels.cu"
"csrc/quantization/gguf/gguf_kernel.cu"
"csrc/quantization/autoquant/int4_fp16_gemm_kernels.cu"
"csrc/quantization/autoquant/format.cu"
"csrc/quantization/autoquant/gemm_s4_f16.cu"
"csrc/custom_all_reduce.cu"
"csrc/permute_cols.cu"
"csrc/quantization/cutlass_w8a8/scaled_mm_entry.cu")
@@ -286,6 +291,10 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
list(APPEND VLLM_GPU_FLAGS "-DENABLE_SCALED_MM_C3X=1")
message(STATUS "Building scaled_mm_c3x for archs: ${SCALED_MM_3X_ARCHS}")
else()
# clear SCALED_MM_3X_ARCHS so the scaled_mm_c2x kernels know we didn't
# build any 3x kernels
set(SCALED_MM_3X_ARCHS)

if (NOT ${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER 12.0 AND SCALED_MM_3X_ARCHS)
message(STATUS "Not building scaled_mm_c3x as CUDA Compiler version is "
"not >= 12.0, we recommend upgrading to CUDA 12.0 or "
@@ -295,17 +304,13 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
message(STATUS "Not building scaled_mm_c3x as no compatible archs found "
"in CUDA target architectures")
endif()

# clear SCALED_MM_3X_ARCHS so the scaled_mm_c2x kernels know we didn't
# build any 3x kernels
set(SCALED_MM_3X_ARCHS)
endif()

#
# For the cutlass_scaled_mm kernels we want to build the c2x (CUTLASS 2.x)
# kernels for the remaining archs that are not already built for 3x.
cuda_archs_loose_intersection(SCALED_MM_2X_ARCHS
"7.5;8.0;8.6;8.9;9.0" "${CUDA_ARCHS}")
"7.5;8.0;8.6;8.9;9.0;9.0a" "${CUDA_ARCHS}")
# subtract out the archs that are already built for 3x
list(REMOVE_ITEM SCALED_MM_2X_ARCHS ${SCALED_MM_3X_ARCHS})
if (SCALED_MM_2X_ARCHS)
@@ -524,9 +529,10 @@ if(VLLM_FLASH_ATTN_SRC_DIR)
else()
FetchContent_Declare(
vllm-flash-attn
GIT_REPOSITORY https://github.com/vllm-project/flash-attention.git
GIT_TAG 013f0c4fc47e6574060879d9734c1df8c5c273bd
GIT_PROGRESS TRUE
SOURCE_DIR /root/flash-attention
#GIT_REPOSITORY https://github.com/vllm-project/flash-attention.git
#GIT_TAG 013f0c4fc47e6574060879d9734c1df8c5c273bd
#GIT_PROGRESS TRUE
)
endif()
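
Note on the vendoring pattern above: this PR pins both CUTLASS and vllm-flash-attn to hardcoded `SOURCE_DIR` paths (`/root/cutlass`, `/root/flash-attention`) and comments out the `GIT_REPOSITORY`/`GIT_TAG` lines. A less invasive sketch of the same offline build would use CMake's built-in `FETCHCONTENT_SOURCE_DIR_<NAME>` override so the upstream declarations stay intact. The paths below come from this PR; the variable spelling follows CMake's uppercase-name rule, and none of this is tested here:

```cmake
# Sketch only: redirect the existing FetchContent_Declare() calls to local
# checkouts at configure time. When one of these cache variables is set,
# FetchContent skips the download/update steps and uses the directory as-is.
set(FETCHCONTENT_SOURCE_DIR_CUTLASS "/root/cutlass"
    CACHE PATH "Local CUTLASS checkout used instead of cloning v3.5.1")
set(FETCHCONTENT_SOURCE_DIR_VLLM-FLASH-ATTN "/root/flash-attention"
    CACHE PATH "Local flash-attention checkout used instead of cloning")
```

The same overrides can also be passed as `-D` flags on the cmake command line, which keeps the original network clone path working for builds that do have access.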

134 changes: 63 additions & 71 deletions Dockerfile
@@ -5,35 +5,41 @@
# docs/source/dev/dockerfile/dockerfile.rst and
# docs/source/assets/dev/dockerfile-stages-dependency.png

ARG CUDA_VERSION=12.4.1
ARG CUDA_VERSION=12.1.0
#################### BASE BUILD IMAGE ####################
# prepare basic build environment
FROM nvidia/cuda:${CUDA_VERSION}-devel-ubuntu20.04 AS base
ARG CUDA_VERSION=12.4.1
ARG PYTHON_VERSION=3.12
FROM nvidia/cuda:${CUDA_VERSION}-devel-ubuntu22.04 AS base
ARG CUDA_VERSION=12.1.0
ARG PYTHON_VERSION=3.10
ENV DEBIAN_FRONTEND=noninteractive

RUN apt-get clean \
&& rm -rf /var/lib/apt/lists/* \
&& echo 'Acquire::Check-Valid-Until "false";' > /etc/apt/apt.conf.d/10no--check-valid-until \
&& echo 'Acquire::AllowInsecureRepositories "true";' >> /etc/apt/apt.conf.d/10no--check-valid-until

# Install Python and other dependencies
RUN echo 'tzdata tzdata/Areas select America' | debconf-set-selections \
&& echo 'tzdata tzdata/Zones/America select Los_Angeles' | debconf-set-selections \
&& apt-get update -y \
&& apt-get install -y ccache software-properties-common git curl sudo \
&& add-apt-repository ppa:deadsnakes/ppa \

COPY get-pip.py /get-pip.py

RUN apt-get install -y tzdata \
&& echo 'tzdata tzdata/Areas select Asia' | debconf-set-selections \
&& echo 'tzdata tzdata/Zones/Asia select Shanghai' | debconf-set-selections \
&& apt-get install -y ccache git curl sudo \
&& apt-get update -y \
&& apt-get install -y python${PYTHON_VERSION} python${PYTHON_VERSION}-dev python${PYTHON_VERSION}-venv \
&& update-alternatives --install /usr/bin/python3 python3 /usr/bin/python${PYTHON_VERSION} 1 \
&& update-alternatives --set python3 /usr/bin/python${PYTHON_VERSION} \
&& ln -sf /usr/bin/python${PYTHON_VERSION}-config /usr/bin/python3-config \
&& curl -sS https://bootstrap.pypa.io/get-pip.py | python${PYTHON_VERSION} \
&& ln -sf /usr/bin/python${PYTHON_VERSION}-config /usr/bin/python3-config

RUN python3 /get-pip.py \
&& python3 --version && python3 -m pip --version

# Upgrade to GCC 10 to avoid https://gcc.gnu.org/bugzilla/show_bug.cgi?id=92519
# as it was causing spam when compiling the CUTLASS kernels
RUN apt-get install -y gcc-10 g++-10
RUN update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-10 110 --slave /usr/bin/g++ g++ /usr/bin/g++-10
RUN <<EOF
gcc --version
EOF
RUN gcc --version

# Workaround for https://github.com/openai/triton/issues/2507 and
# https://github.com/pytorch/pytorch/issues/107960 -- hopefully
@@ -71,27 +77,23 @@ RUN --mount=type=cache,target=/root/.cache/pip \
python3 -m pip install -r requirements-build.txt

COPY . .
ARG GIT_REPO_CHECK=0
RUN --mount=type=bind,source=.git,target=.git \
if [ "$GIT_REPO_CHECK" != 0 ]; then bash tools/check_repo.sh ; fi

# max jobs used by Ninja to build extensions
ARG max_jobs=2
ARG max_jobs=4
ENV MAX_JOBS=${max_jobs}
# number of threads used by nvcc
ARG nvcc_threads=8
ARG nvcc_threads=32
ENV NVCC_THREADS=$nvcc_threads

ARG USE_SCCACHE
ARG SCCACHE_BUCKET_NAME=vllm-build-sccache
ARG SCCACHE_REGION_NAME=us-west-2
ARG SCCACHE_REGION_NAME=cn-north-1
ARG SCCACHE_S3_NO_CREDENTIALS=0
# if USE_SCCACHE is set, use sccache to speed up compilation
COPY sccache.tar.gz sccache.tar.gz
RUN --mount=type=cache,target=/root/.cache/pip \
--mount=type=bind,source=.git,target=.git \
if [ "$USE_SCCACHE" = "1" ]; then \
echo "Installing sccache..." \
&& curl -L -o sccache.tar.gz https://github.com/mozilla/sccache/releases/download/v0.8.1/sccache-v0.8.1-x86_64-unknown-linux-musl.tar.gz \
&& tar -xzf sccache.tar.gz \
&& sudo mv sccache-v0.8.1-x86_64-unknown-linux-musl/sccache /usr/bin/sccache \
&& rm -rf sccache.tar.gz sccache-v0.8.1-x86_64-unknown-linux-musl \
@@ -101,22 +103,32 @@ RUN --mount=type=cache,target=/root/.cache/pip \
&& export SCCACHE_IDLE_TIMEOUT=0 \
&& export CMAKE_BUILD_TYPE=Release \
&& sccache --show-stats \
&& python3 setup.py bdist_wheel --dist-dir=dist --py-limited-api=cp38 \
&& SETUPTOOLS_SCM_PRETEND_VERSION="0.6.3.post1" python3 setup.py bdist_wheel --dist-dir=dist --py-limited-api=cp38 \
&& sccache --show-stats; \
fi

ENV CCACHE_DIR=/root/.cache/ccache

RUN mkdir -p /root/cutlass

COPY cutlass/ /root/cutlass


RUN mkdir -p /root/flash-attention

COPY flash-attention/ /root/flash-attention


RUN --mount=type=cache,target=/root/.cache/ccache \
--mount=type=cache,target=/root/.cache/pip \
--mount=type=bind,source=.git,target=.git \
if [ "$USE_SCCACHE" != "1" ]; then \
python3 setup.py bdist_wheel --dist-dir=dist --py-limited-api=cp38; \
SETUPTOOLS_SCM_PRETEND_VERSION="0.6.3.post1" python3 setup.py bdist_wheel --dist-dir=dist --py-limited-api=cp38; \
fi

# Check the size of the wheel if RUN_WHEEL_CHECK is true
COPY .buildkite/check-wheel-size.py check-wheel-size.py
# Default max size of the wheel is 250MB
ARG VLLM_MAX_SIZE_MB=250
ARG VLLM_MAX_SIZE_MB=400
ENV VLLM_MAX_SIZE_MB=$VLLM_MAX_SIZE_MB
ARG RUN_WHEEL_CHECK=true
RUN if [ "$RUN_WHEEL_CHECK" = "true" ]; then \
@@ -139,33 +151,41 @@ RUN --mount=type=cache,target=/root/.cache/pip \
#################### vLLM installation IMAGE ####################
# image with vLLM installed
FROM nvidia/cuda:${CUDA_VERSION}-base-ubuntu22.04 AS vllm-base
ARG CUDA_VERSION=12.4.1
ARG PYTHON_VERSION=3.12
ARG CUDA_VERSION=12.1.0
ARG PYTHON_VERSION=3.10
WORKDIR /vllm-workspace
ENV DEBIAN_FRONTEND=noninteractive

RUN PYTHON_VERSION_STR=$(echo ${PYTHON_VERSION} | sed 's/\.//g') && \
echo "export PYTHON_VERSION_STR=${PYTHON_VERSION_STR}" >> /etc/environment

# Install Python and other dependencies
RUN echo 'tzdata tzdata/Areas select America' | debconf-set-selections \
&& echo 'tzdata tzdata/Zones/America select Los_Angeles' | debconf-set-selections \
&& apt-get update -y \
&& apt-get install -y ccache software-properties-common git curl sudo vim python3-pip \
&& apt-get install -y ffmpeg libsm6 libxext6 libgl1 \
&& add-apt-repository ppa:deadsnakes/ppa \
RUN apt-get clean \
&& rm -rf /var/lib/apt/lists/* \
&& echo 'Acquire::Check-Valid-Until "false";' >> /etc/apt/apt.conf.d/10no--check-valid-until \
&& echo 'Acquire::AllowInsecureRepositories "true";' >> /etc/apt/apt.conf.d/10no--check-valid-until

RUN apt-get install -y tzdata \
&& echo 'tzdata tzdata/Areas select Asia' | debconf-set-selections \
&& echo 'tzdata tzdata/Zones/Asia select Shanghai' | debconf-set-selections \
&& apt-get update -y \
&& apt-get install -y python${PYTHON_VERSION} python${PYTHON_VERSION}-dev python${PYTHON_VERSION}-venv libibverbs-dev \
&& apt-get install -y ccache git curl sudo vim python3-pip \
&& apt-get install -y ffmpeg libsm6 libxext6 libgl1

COPY get-pip.py /get-pip.py

RUN apt-get update -y && apt-cache search python3 \
&& apt-get install -y python${PYTHON_VERSION} python${PYTHON_VERSION}-dev python${PYTHON_VERSION}-venv libibverbs-dev \
&& which python${PYTHON_VERSION} \
&& update-alternatives --install /usr/bin/python3 python3 /usr/bin/python${PYTHON_VERSION} 1 \
&& update-alternatives --set python3 /usr/bin/python${PYTHON_VERSION} \
&& ln -sf /usr/bin/python${PYTHON_VERSION}-config /usr/bin/python3-config \
&& curl -sS https://bootstrap.pypa.io/get-pip.py | python${PYTHON_VERSION} \
&& ls /usr/bin/python* \
&& ln -sf /usr/bin/python${PYTHON_VERSION}-config /usr/bin/python3-config


RUN python3 /get-pip.py \
&& python3 --version && python3 -m pip --version

# Workaround for https://github.com/openai/triton/issues/2507 and
# https://github.com/pytorch/pytorch/issues/107960 -- hopefully
# this won't be needed for future versions of this docker image
# or future versions of triton.

RUN ldconfig /usr/local/cuda-$(echo $CUDA_VERSION | cut -d. -f1,2)/compat/

# install vllm wheel first, so that torch etc will be installed
Expand All @@ -174,41 +194,13 @@ RUN --mount=type=bind,from=build,src=/workspace/dist,target=/vllm-workspace/dist
python3 -m pip install dist/*.whl --verbose

RUN --mount=type=cache,target=/root/.cache/pip \
. /etc/environment && \
python3 -m pip install https://github.com/flashinfer-ai/flashinfer/releases/download/v0.1.6/flashinfer-0.1.6+cu121torch2.4-cp${PYTHON_VERSION_STR}-cp${PYTHON_VERSION_STR}-linux_x86_64.whl
. /etc/environment
COPY examples examples
#################### vLLM installation IMAGE ####################


#################### TEST IMAGE ####################
# image to run unit testing suite
# note that this uses vllm installed by `pip`
FROM vllm-base AS test

ADD . /vllm-workspace/

# install development dependencies (for testing)
RUN --mount=type=cache,target=/root/.cache/pip \
python3 -m pip install -r requirements-dev.txt

# doc requires source code
# we hide them inside `test_docs/` , so that this source code
# will not be imported by other tests
RUN mkdir test_docs
RUN mv docs test_docs/
RUN mv vllm test_docs/

#################### TEST IMAGE ####################

#################### OPENAI API SERVER ####################
# openai api server alternative
FROM vllm-base AS vllm-openai

# install additional dependencies for openai api server
RUN --mount=type=cache,target=/root/.cache/pip \
pip install accelerate hf_transfer 'modelscope!=1.15.0' bitsandbytes>=0.44.0 timm==0.9.10

ENV VLLM_USAGE_SOURCE production-docker-image

ENTRYPOINT ["python3", "-m", "vllm.entrypoints.openai.api_server"]
#################### OPENAI API SERVER ####################
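
Taken together, the Dockerfile changes assume a fully offline build context: `get-pip.py`, `sccache.tar.gz`, and the `cutlass/` and `flash-attention/` checkouts are COPY'd in rather than downloaded at build time, and the wheel version is pinned with `SETUPTOOLS_SCM_PRETEND_VERSION=0.6.3.post1`, presumably because the checkout's git metadata does not yield the desired version. A hypothetical staging script matching those assumptions (the URLs and the flash-attention commit come from the lines this PR removes; the image tag and everything else is illustrative):

```sh
# Sketch: stage the artifacts the modified Dockerfile expects in the build
# context, then build with BuildKit (required by the RUN --mount steps).
set -e
curl -sSLo get-pip.py https://bootstrap.pypa.io/get-pip.py
curl -sSLo sccache.tar.gz \
    https://github.com/mozilla/sccache/releases/download/v0.8.1/sccache-v0.8.1-x86_64-unknown-linux-musl.tar.gz
git clone --depth 1 --branch v3.5.1 https://github.com/nvidia/cutlass.git cutlass
git clone https://github.com/vllm-project/flash-attention.git flash-attention
git -C flash-attention checkout 013f0c4fc47e6574060879d9734c1df8c5c273bd
DOCKER_BUILDKIT=1 docker build \
    --build-arg CUDA_VERSION=12.1.0 \
    --build-arg max_jobs=4 \
    --build-arg nvcc_threads=32 \
    -t vllm-autoquant:0.6.3.post1 .
```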
16 changes: 16 additions & 0 deletions csrc/ops.h
@@ -115,6 +115,22 @@ void cutlass_scaled_mm_azp(torch::Tensor& out, torch::Tensor const& a,
torch::Tensor const& azp_adj,
c10::optional<torch::Tensor> const& azp,
c10::optional<torch::Tensor> const& bias);

torch::Tensor autoquant_s4_f16_gemm(
torch::Tensor _in_feats,
torch::Tensor _kernel,
torch::Tensor _scales_zeros);

void autoquant_convert_s4_k_m8(
torch::Tensor _weight_dest,
torch::Tensor _quant_scales_zeros_dest,
torch::Tensor _workspace,
torch::Tensor _quant_weight_src,
torch::Tensor _quant_scales,
torch::Tensor _quant_zeros,
int64_t m,
int64_t k,
int64_t group_size);
#endif

void static_scaled_int8_quant(torch::Tensor& out, torch::Tensor const& input,
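
For orientation, a sketch of how these two entry points presumably compose, inferred from the signatures alone rather than from this PR; the buffer shapes, dtypes, and workspace size below are placeholders, with the authoritative layout living in `csrc/quantization/autoquant/gemm_s4_f16.cu`:

```cpp
#include <torch/torch.h>

#include "ops.h"  // the declarations shown above

// Hypothetical int4-weight x fp16-activation linear built from the two new
// ops; in practice the repack would run once at weight-load time, not per call.
torch::Tensor int4_fp16_linear(
    torch::Tensor in_feats,      // fp16 activations
    torch::Tensor quant_weight,  // packed int4 weights from the quantizer
    torch::Tensor quant_scales,  // per-group scales
    torch::Tensor quant_zeros,   // per-group zero points
    int64_t m, int64_t k, int64_t group_size) {
  // Destination buffers for the repacked weight and the fused scales/zeros.
  // Shapes here are guesses; the kernel defines the real layout.
  auto weight_dest = torch::empty_like(quant_weight);
  auto scales_zeros =
      torch::empty({k / group_size, m, 2}, quant_scales.options());
  auto workspace =
      torch::empty({1 << 20}, quant_weight.options());  // size: assumption

  // One-time conversion into the s4 "k_m8" layout the GEMM kernel expects.
  autoquant_convert_s4_k_m8(weight_dest, scales_zeros, workspace,
                            quant_weight, quant_scales, quant_zeros,
                            m, k, group_size);

  // Fused dequantize + GEMM on the repacked weight.
  return autoquant_s4_f16_gemm(in_feats, weight_dest, scales_zeros);
}
```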