diff --git a/kernels/cuda/cutlass_gemm/Dockerfile b/kernels/cuda/cutlass_gemm/Dockerfile
new file mode 100644
index 0000000..d1bccd0
--- /dev/null
+++ b/kernels/cuda/cutlass_gemm/Dockerfile
@@ -0,0 +1,31 @@
+# To build the image, run the following command:
+# docker build -t cutlass_gemm .
+# To run the image, run the following command:
+# docker run --gpus all --rm -ti --ipc=host --name gpu_cutlass_gemm_instance cutlass_gemm /bin/bash
+
+FROM pytorch/pytorch:2.5.1-cuda12.4-cudnn9-devel
+
+# Install common dependencies and utilities
+RUN apt-get update && apt-get install -y --no-install-recommends \
+    ca-certificates \
+    wget \
+    sudo \
+    build-essential \
+    curl \
+    git \
+    && rm -rf /var/lib/apt/lists/*
+
+# Set the working directory
+COPY ./ /workspace
+WORKDIR /workspace
+ENV PYTHONPATH /workspace:$PYTHONPATH
+
+# Clone the cutlass repository
+RUN git clone https://github.com/NVIDIA/cutlass.git /workspace/cutlass
+RUN cd /workspace/cutlass && git checkout 06b21349bcf6ddf6a1686a47a137ad1446579db9
+# Configure cutlass
+RUN cd /workspace/cutlass && mkdir -p build
+RUN cd /workspace/cutlass/build && cmake .. -DCUTLASS_NVCC_ARCHS=90a -DCUTLASS_ENABLE_TESTS=OFF -DCUTLASS_UNITY_BUILD_ENABLED=ON
+
+# Install cutlass gemm
+RUN cd /workspace/ && pip install -e .
\ No newline at end of file
diff --git a/kernels/cuda/cutlass_gemm/readme.md b/kernels/cuda/cutlass_gemm/readme.md
index dab43a0..29bfeca 100644
--- a/kernels/cuda/cutlass_gemm/readme.md
+++ b/kernels/cuda/cutlass_gemm/readme.md
@@ -1,2 +1,37 @@
-Currently the CPP extension builds with Cutlass 3.5.1 (credit to @SamirMoustafa for the update).
-3.6 will fail atm due to a refactor in the TMA descriptor.
+# CUTLASS FP8 GEMM
+
+This project uses NVIDIA's CUTLASS library and its Hopper Ping-Pong kernel design for efficient GPU-based FP8 GEMM. [Learn more](https://pytorch.org/blog/cutlass-ping-pong-gemm-kernel/)
+## Installation
+
+- Prerequisites: NVIDIA Hopper GPU with CUDA support
+
+### Without Docker
+```bash
+# 1. Clone the CUTLASS repository
+git clone https://github.com/NVIDIA/cutlass.git
+cd cutlass
+git checkout 06b21349bcf6ddf6a1686a47a137ad1446579db9
+
+# 2. Build CUTLASS
+mkdir build && cd build
+cmake .. -DCUTLASS_NVCC_ARCHS=90a -DCUTLASS_ENABLE_TESTS=OFF -DCUTLASS_UNITY_BUILD_ENABLED=ON
+
+# 3. Install the Python package
+cd ../../ && pip install -e .
+
+# 4. Run the test script
+python test_cutlass_gemm.py
+```
+
+### With Docker
+```bash
+# 1. Build the Docker image
+docker build -t cutlass_gemm .
+
+# 2. Run the Docker container
+docker run --gpus all --rm -ti --ipc=host --name gpu_cutlass_gemm_instance cutlass_gemm /bin/bash
+
+# 3. Inside the container, run the test script
+python test_cutlass_gemm.py
+```
+
diff --git a/kernels/cuda/cutlass_gemm/setup.py b/kernels/cuda/cutlass_gemm/setup.py
index eda350d..4ca4d33 100644
--- a/kernels/cuda/cutlass_gemm/setup.py
+++ b/kernels/cuda/cutlass_gemm/setup.py
@@ -1,5 +1,8 @@
+import os
 from setuptools import setup
-from torch.utils.cpp_extension import BuildExtension, CUDAExtension
+from torch.utils.cpp_extension import BuildExtension, CUDAExtension, CUDA_HOME
+
+current_location = os.path.abspath(os.path.dirname(__file__))
 
 setup(
     name='cutlass_gemm',
@@ -23,11 +26,11 @@
                 ]
             },
             include_dirs=[
-                '/home/adhoq26/cutlass/include',
-                '/home/adhoq26/cutlass/tools/util/include',
+                f'{current_location}/cutlass/include',
+                f'{current_location}/cutlass/tools/util/include',
             ],
             libraries=['cuda'],
-            library_dirs=['/usr/local/cuda-12.4/lib64'],
+            library_dirs=[os.path.join(CUDA_HOME, 'lib64')],
         )
     ],
     cmdclass={
diff --git a/kernels/cuda/cutlass_gemm/test_cutlass_gemm.py b/kernels/cuda/cutlass_gemm/test_cutlass_gemm.py
index d722ff2..f773ca0 100644
--- a/kernels/cuda/cutlass_gemm/test_cutlass_gemm.py
+++ b/kernels/cuda/cutlass_gemm/test_cutlass_gemm.py
@@ -1,5 +1,5 @@
-from pingpong_gemm import cutlass_scaled_mm
 import torch
+from pingpong_gemm import cutlass_scaled_mm
 
 m, k, n = 16, 4096, 4096
 dtype = torch.float8_e4m3fn
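
Note: the test-script hunk above only covers the first five lines of `test_cutlass_gemm.py`. As a rough end-to-end illustration, here is a minimal sketch of how the extension could be exercised, assuming `cutlass_scaled_mm` follows the common scaled-mm convention of `(a, b, scale_a, scale_b, out_dtype)` with a row-major FP8 `a`, a column-major FP8 `b`, and per-tensor float32 scales; the actual call signature is defined by the repo's C++ binding and test script, so treat this as a sketch rather than the repo's test body.

```python
import torch
from pingpong_gemm import cutlass_scaled_mm  # assumed signature, see note above

m, k, n = 16, 4096, 4096
dtype = torch.float8_e4m3fn

# FP8 operands: A row-major (m x k), B column-major (k x n), as FP8 GEMM on Hopper expects.
a = torch.randn(m, k, device="cuda").to(dtype)
b = torch.randn(n, k, device="cuda").to(dtype).t()

# Per-tensor scales as scalar float32 tensors.
scale_a = torch.tensor(1.0, device="cuda", dtype=torch.float32)
scale_b = torch.tensor(1.0, device="cuda", dtype=torch.float32)

# Ping-Pong kernel (assumed call convention).
out = cutlass_scaled_mm(a, b, scale_a, scale_b, torch.bfloat16)

# Reference result via PyTorch's built-in scaled matmul, for a sanity check only.
ref = torch._scaled_mm(a, b, scale_a=scale_a, scale_b=scale_b, out_dtype=torch.bfloat16)
print("max abs diff vs torch._scaled_mm:", (out - ref).abs().max().item())
```

The `torch._scaled_mm` call is only a baseline for comparison; both paths require the column-major `b` (hence the `.t()`) that Hopper FP8 GEMM expects.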