diff --git a/kernels/cuda/cutlass_gemm/Dockerfile b/kernels/cuda/cutlass_gemm/Dockerfile
new file mode 100644
index 0000000..d1bccd0
--- /dev/null
+++ b/kernels/cuda/cutlass_gemm/Dockerfile
@@ -0,0 +1,31 @@
+# To build the image, run the following command:
+# docker build -t cutlass_gemm .
+# To run the image, run the following command:
+# docker run --gpus all --rm -ti --ipc=host --name gpu_cutlass_gemm_instance cutlass_gemm /bin/bash
+
+FROM pytorch/pytorch:2.5.1-cuda12.4-cudnn9-devel
+
+# Install common dependencies and utilities
+RUN apt-get update && apt-get install -y --no-install-recommends \
+    ca-certificates \
+    wget \
+    sudo \
+    build-essential \
+    curl \
+    git \
+    && rm -rf /var/lib/apt/lists/*
+
+# Set the working directory
+COPY ./ /workspace
+WORKDIR /workspace
+ENV PYTHONPATH /workspace:$PYTHONPATH
+
+# Clone the cutlass repository
+RUN git clone https://github.com/NVIDIA/cutlass.git /workspace/cutlass
+RUN cd /workspace/cutlass && git checkout 06b21349bcf6ddf6a1686a47a137ad1446579db9
+# Configure cutlass
+RUN cd /workspace/cutlass && mkdir -p build
+RUN cd /workspace/cutlass/build && cmake .. -DCUTLASS_NVCC_ARCHS=90a -DCUTLASS_ENABLE_TESTS=OFF -DCUTLASS_UNITY_BUILD_ENABLED=ON
+
+# Install cutlass gemm
+RUN cd /workspace/ && pip install -e .
\ No newline at end of file
diff --git a/kernels/cuda/cutlass_gemm/readme.md b/kernels/cuda/cutlass_gemm/readme.md
index dab43a0..29bfeca 100644
--- a/kernels/cuda/cutlass_gemm/readme.md
+++ b/kernels/cuda/cutlass_gemm/readme.md
@@ -1,2 +1,37 @@
-Currently the CPP extension builds with Cutlass 3.5.1 (credit to @SamirMoustafa for the update).
-3.6 will fail atm due to a refactor in the TMA descriptor.
+# CUTLASS FP8 GEMM
+
+This project uses NVIDIA's CUTLASS library and its Hopper Ping-Pong kernel design for efficient GPU-based FP8 GEMM. [Learn more](https://pytorch.org/blog/cutlass-ping-pong-gemm-kernel/)
+## Installation
+
+- Prerequisites: NVIDIA Hopper GPU with CUDA support
+
+### Without Docker
+```bash
+# 1. Clone the CUTLASS repository
+git clone https://github.com/NVIDIA/cutlass.git
+cd cutlass
+git checkout 06b21349bcf6ddf6a1686a47a137ad1446579db9
+
+# 2. Build CUTLASS
+mkdir build && cd build
+cmake .. -DCUTLASS_NVCC_ARCHS=90a -DCUTLASS_ENABLE_TESTS=OFF -DCUTLASS_UNITY_BUILD_ENABLED=ON
+
+# 3. Install the Python package
+cd ../../ && pip install -e .
+
+# 4. Run the test script
+python test_cutlass_gemm.py
+```
+
+### With Docker
+```bash
+# 1. Build the Docker image
+docker build -t cutlass_gemm .
+
+# 2. Run the Docker container
+docker run --gpus all --rm -ti --ipc=host --name gpu_cutlass_gemm_instance cutlass_gemm /bin/bash
+
+# 3. Inside the container, run the test script
+python test_cutlass_gemm.py
+```
+
diff --git a/kernels/cuda/cutlass_gemm/setup.py b/kernels/cuda/cutlass_gemm/setup.py
index eda350d..4ca4d33 100644
--- a/kernels/cuda/cutlass_gemm/setup.py
+++ b/kernels/cuda/cutlass_gemm/setup.py
@@ -1,5 +1,8 @@
+import os
 from setuptools import setup
-from torch.utils.cpp_extension import BuildExtension, CUDAExtension
+from torch.utils.cpp_extension import BuildExtension, CUDAExtension, CUDA_HOME
+
+current_location = os.path.abspath(os.path.dirname(__file__))
 
 setup(
     name='cutlass_gemm',
@@ -23,11 +26,11 @@
                 ]
             },
             include_dirs=[
-                '/home/adhoq26/cutlass/include',
-                '/home/adhoq26/cutlass/tools/util/include',
+                f'{current_location}/cutlass/include',
+                f'{current_location}/cutlass/tools/util/include',
             ],
             libraries=['cuda'],
-            library_dirs=['/usr/local/cuda-12.4/lib64'],
+            library_dirs=[os.path.join(CUDA_HOME, 'lib64')],
         )
     ],
     cmdclass={
diff --git a/kernels/cuda/cutlass_gemm/test_cutlass_gemm.py b/kernels/cuda/cutlass_gemm/test_cutlass_gemm.py
index d722ff2..f773ca0 100644
--- a/kernels/cuda/cutlass_gemm/test_cutlass_gemm.py
+++ b/kernels/cuda/cutlass_gemm/test_cutlass_gemm.py
@@ -1,5 +1,5 @@
-from pingpong_gemm import cutlass_scaled_mm
 import torch
+from pingpong_gemm import cutlass_scaled_mm
 
 m, k, n = 16, 4096, 4096
 dtype = torch.float8_e4m3fn
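
Note: the test-script hunk above only covers the first five lines of `test_cutlass_gemm.py`. As a rough end-to-end illustration, here is a minimal sketch of how the extension could be exercised, assuming `cutlass_scaled_mm` follows the common scaled-mm convention of `(a, b, scale_a, scale_b, out_dtype)` with a row-major FP8 `a`, a column-major FP8 `b`, and per-tensor float32 scales; the actual call signature is defined by the repo's C++ binding and test script, so treat this as a sketch rather than the repo's test body.

```python
import torch
from pingpong_gemm import cutlass_scaled_mm  # assumed signature, see note above

m, k, n = 16, 4096, 4096
dtype = torch.float8_e4m3fn

# FP8 operands: A row-major (m x k), B column-major (k x n), as FP8 GEMM on Hopper expects.
a = torch.randn(m, k, device="cuda").to(dtype)
b = torch.randn(n, k, device="cuda").to(dtype).t()

# Per-tensor scales as scalar float32 tensors.
scale_a = torch.tensor(1.0, device="cuda", dtype=torch.float32)
scale_b = torch.tensor(1.0, device="cuda", dtype=torch.float32)

# Ping-Pong kernel (assumed call convention).
out = cutlass_scaled_mm(a, b, scale_a, scale_b, torch.bfloat16)

# Reference result via PyTorch's built-in scaled matmul, for a sanity check only.
ref = torch._scaled_mm(a, b, scale_a=scale_a, scale_b=scale_b, out_dtype=torch.bfloat16)
print("max abs diff vs torch._scaled_mm:", (out - ref).abs().max().item())
```

The `torch._scaled_mm` call is only a baseline for comparison; both paths require the column-major `b` (hence the `.t()`) that Hopper FP8 GEMM expects.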